diff --git a/src/transformers/models/blip/processing_blip.py b/src/transformers/models/blip/processing_blip.py
index 78e1aa58ef04..fa6666770ca1 100644
--- a/src/transformers/models/blip/processing_blip.py
+++ b/src/transformers/models/blip/processing_blip.py
@@ -129,6 +129,20 @@ def decode(self, *args, **kwargs):
         """
         return self.tokenizer.decode(*args, **kwargs)
 
+    def post_process_image_text_to_text(self, generated_outputs):
+        """
+        Post-process the output of the model to decode the text.
+
+        Args:
+            generated_outputs (`torch.Tensor` or `np.ndarray`):
+                The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
+                or `(sequence_length,)`.
+
+        Returns:
+            `List[str]`: The decoded text.
+        """
+        return self.tokenizer.batch_decode(generated_outputs, skip_special_tokens=True)
+
     @property
     def model_input_names(self):
         tokenizer_input_names = self.tokenizer.model_input_names
diff --git a/src/transformers/models/blip_2/processing_blip_2.py b/src/transformers/models/blip_2/processing_blip_2.py
index 606aadc1eab4..c312d7c44763 100644
--- a/src/transformers/models/blip_2/processing_blip_2.py
+++ b/src/transformers/models/blip_2/processing_blip_2.py
@@ -177,6 +177,20 @@ def decode(self, *args, **kwargs):
         """
         return self.tokenizer.decode(*args, **kwargs)
 
+    def post_process_image_text_to_text(self, generated_outputs):
+        """
+        Post-process the output of the model to decode the text.
+
+        Args:
+            generated_outputs (`torch.Tensor` or `np.ndarray`):
+                The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
+                or `(sequence_length,)`.
+
+        Returns:
+            `List[str]`: The decoded text.
+        """
+        return self.tokenizer.batch_decode(generated_outputs, skip_special_tokens=True)
+
     @property
     # Copied from transformers.models.blip.processing_blip.BlipProcessor.model_input_names
     def model_input_names(self):
diff --git a/src/transformers/models/chameleon/processing_chameleon.py b/src/transformers/models/chameleon/processing_chameleon.py
index 2d699c8f663a..0a73406729a5 100644
--- a/src/transformers/models/chameleon/processing_chameleon.py
+++ b/src/transformers/models/chameleon/processing_chameleon.py
@@ -159,6 +159,20 @@ def decode(self, *args, **kwargs):
         """
         return self.tokenizer.decode(*args, **kwargs)
 
+    def post_process_image_text_to_text(self, generated_outputs):
+        """
+        Post-process the output of the model to decode the text.
+
+        Args:
+            generated_outputs (`torch.Tensor` or `np.ndarray`):
+                The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
+                or `(sequence_length,)`.
+
+        Returns:
+            `List[str]`: The decoded text.
+        """
+        return self.tokenizer.batch_decode(generated_outputs, skip_special_tokens=True)
+
     @property
     # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names
     def model_input_names(self):
diff --git a/src/transformers/models/donut/processing_donut.py b/src/transformers/models/donut/processing_donut.py
index 9552d323ac57..5c814742c9ac 100644
--- a/src/transformers/models/donut/processing_donut.py
+++ b/src/transformers/models/donut/processing_donut.py
@@ -16,6 +16,7 @@
 Processor class for Donut.
""" +import logging import re import warnings from contextlib import contextmanager @@ -24,12 +25,16 @@ from ...image_utils import ImageInput from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput +from ...utils.deprecation import deprecate_kwarg class DonutProcessorKwargs(ProcessingKwargs, total=False): _defaults = {} +logger = logging.getLogger(__name__) + + class DonutProcessor(ProcessorMixin): r""" Constructs a Donut processor which wraps a Donut image processor and an XLMRoBERTa tokenizer into a single @@ -70,6 +75,7 @@ def __init__(self, image_processor=None, tokenizer=None, **kwargs): self.current_processor = self.image_processor self._in_target_context_manager = False + @deprecate_kwarg(old_name="legacy", version="5.0.0") def __call__( self, images: ImageInput = None, @@ -85,6 +91,14 @@ def __call__( [`~DonutTokenizer.__call__`]. Please refer to the doctsring of the above two methods for more information. """ # For backward compatibility + legacy = kwargs.pop("legacy", True) + if legacy: + logger.warning_once( + "Legacy behavior is being used. The new behavior with legacy=False will be enabled in the future." + "In the new behavior, if both images and text are provided, the default value of `add_special_tokens` " + "will be changed to `False` when calling the tokenizer if `add_special_tokens` is unset." + ) + if self._in_target_context_manager: return self.current_processor(images, text, **kwargs) @@ -99,7 +113,11 @@ def __call__( if images is not None: inputs = self.image_processor(images, **output_kwargs["images_kwargs"]) - if text is not None: + if text is not None and images is None: + encodings = self.tokenizer(text, **output_kwargs["text_kwargs"]) + elif text is not None: + if not legacy: + output_kwargs["text_kwargs"].setdefault("add_special_tokens", False) encodings = self.tokenizer(text, **output_kwargs["text_kwargs"]) if text is None: @@ -195,6 +213,20 @@ def token2json(self, tokens, is_inner_value=False, added_vocab=None): else: return [] if is_inner_value else {"text_sequence": tokens} + def post_process_image_text_to_text(self, generated_outputs): + """ + Post-process the output of the model to decode the text. + + Args: + generated_outputs (`torch.Tensor` or `np.ndarray`): + The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)` + or `(sequence_length,)`. + + Returns: + `List[str]`: The decoded text. + """ + return self.tokenizer.batch_decode(generated_outputs, skip_special_tokens=True) + @property def feature_extractor_class(self): warnings.warn( diff --git a/src/transformers/models/fuyu/processing_fuyu.py b/src/transformers/models/fuyu/processing_fuyu.py index ff7d2c547dc4..5c7740b36aef 100644 --- a/src/transformers/models/fuyu/processing_fuyu.py +++ b/src/transformers/models/fuyu/processing_fuyu.py @@ -264,10 +264,10 @@ def _tokenize_prompts_with_image_and_batch( bos_token = tokenizer.vocab["|ENDOFTEXT|"] prompts_tokens = [[[bos_token] + x for x in prompt_seq] for prompt_seq in prompts_tokens] if add_beginning_of_answer_token: - boa = tokenizer.vocab[BEGINNING_OF_ANSWER_STRING] + beginning_of_answer = tokenizer.vocab[BEGINNING_OF_ANSWER_STRING] # Only add bbox open token to the last subsequence since that is what will be completed for token_seq in prompts_tokens: - token_seq[-1].append(boa) + token_seq[-1].append(beginning_of_answer) # Now we have a list of list of tokens which each list has a different # size. 
@@ -682,6 +682,32 @@ def tokens_to_points(tokens, original_size):
 
         return results
 
+    def post_process_image_text_to_text(self, generated_outputs):
+        """
+        Post-processes the output of `FuyuForConditionalGeneration` to only return the text output.
+
+        Args:
+            generated_outputs (`torch.Tensor` or `np.ndarray`):
+                The output of the model. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
+                containing the token ids of the generated sequences.
+
+        Returns:
+            `List[str]`: The decoded text output.
+        """
+        beginning_of_answer = self.tokenizer.vocab[BEGINNING_OF_ANSWER_STRING]
+        # get boa index for each outputted sequence tensor
+        # start all generated sequences from the beginning of the answer token, pad to have consistent length
+        unpadded_output_sequences = [
+            seq[(seq == beginning_of_answer).nonzero(as_tuple=True)[0] + 1 :] for seq in generated_outputs
+        ]
+        max_len = max(len(seq) for seq in unpadded_output_sequences)
+        # convert to torch and pad sequences
+        padded_output_sequences = torch.full((len(unpadded_output_sequences), max_len), self.pad_token_id)
+        for i, seq in enumerate(unpadded_output_sequences):
+            padded_output_sequences[i, : len(seq)] = torch.tensor(seq)
+
+        return self.batch_decode(padded_output_sequences, skip_special_tokens=True)
+
     def batch_decode(self, *args, **kwargs):
         """
         This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
diff --git a/src/transformers/models/git/processing_git.py b/src/transformers/models/git/processing_git.py
index 3744d81a0aca..7b2fc6e7da98 100644
--- a/src/transformers/models/git/processing_git.py
+++ b/src/transformers/models/git/processing_git.py
@@ -16,18 +16,23 @@
 Image/Text processor class for GIT
 """
 
+import logging
 from typing import List, Optional, Union
 
 from ...feature_extraction_utils import BatchFeature
 from ...image_utils import ImageInput
 from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack, _validate_images_text_input_order
 from ...tokenization_utils_base import PreTokenizedInput, TextInput
+from ...utils.deprecation import deprecate_kwarg
 
 
 class GitProcessorKwargs(ProcessingKwargs, total=False):
     _defaults = {}
 
 
+logger = logging.getLogger(__name__)
+
+
 class GitProcessor(ProcessorMixin):
     r"""
     Constructs a GIT processor which wraps a CLIP image processor and a BERT tokenizer into a single processor.
@@ -50,6 +55,7 @@ def __init__(self, image_processor, tokenizer):
         super().__init__(image_processor, tokenizer)
         self.current_processor = self.image_processor
 
+    @deprecate_kwarg(old_name="legacy", version="5.0.0")
     def __call__(
         self,
         images: Optional[ImageInput] = None,
@@ -91,6 +97,14 @@ def __call__(
               `None`).
             - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
         """
+        legacy = kwargs.pop("legacy", True)
+        if legacy:
+            logger.warning(
+                "Legacy behavior is being used. The new behavior with legacy=False will be enabled in the future. "
+                "In the new behavior, if both images and text are provided, the last token (EOS token) "
+                "of the input_ids and attention_mask tensors will be removed."
+            )
+
         if text is None and images is None:
             raise ValueError("You have to specify either text or images. Both cannot be none.")
 
@@ -110,6 +124,10 @@ def __call__(
         if images is not None:
             image_features = self.image_processor(images, **output_kwargs["images_kwargs"])
             data.update(image_features)
+        if not legacy:
+            data["input_ids"] = data["input_ids"][:, :-1]
+            data["attention_mask"] = data["attention_mask"][:, :-1]
+
         return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"].get("return_tensors"))
 
     def batch_decode(self, *args, **kwargs):
         """
@@ -126,6 +144,20 @@ def decode(self, *args, **kwargs):
         """
         return self.tokenizer.decode(*args, **kwargs)
 
+    def post_process_image_text_to_text(self, generated_outputs):
+        """
+        Post-process the output of the model to decode the text.
+
+        Args:
+            generated_outputs (`torch.Tensor` or `np.ndarray`):
+                The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
+                or `(sequence_length,)`.
+
+        Returns:
+            `List[str]`: The decoded text.
+        """
+        return self.tokenizer.batch_decode(generated_outputs, skip_special_tokens=True)
+
     @property
     def model_input_names(self):
         return ["input_ids", "attention_mask", "pixel_values"]
diff --git a/src/transformers/models/idefics/processing_idefics.py b/src/transformers/models/idefics/processing_idefics.py
index 3406ab2226e0..6863e6a34ed4 100644
--- a/src/transformers/models/idefics/processing_idefics.py
+++ b/src/transformers/models/idefics/processing_idefics.py
@@ -530,6 +530,20 @@ def decode(self, *args, **kwargs):
         """
         return self.tokenizer.decode(*args, **kwargs)
 
+    def post_process_image_text_to_text(self, generated_outputs):
+        """
+        Post-process the output of the model to decode the text.
+
+        Args:
+            generated_outputs (`torch.Tensor` or `np.ndarray`):
+                The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
+                or `(sequence_length,)`.
+
+        Returns:
+            `List[str]`: The decoded text.
+        """
+        return self.tokenizer.batch_decode(generated_outputs, skip_special_tokens=True)
+
     @property
     def model_input_names(self):
         tokenizer_input_names = self.tokenizer.model_input_names
diff --git a/src/transformers/models/idefics2/processing_idefics2.py b/src/transformers/models/idefics2/processing_idefics2.py
index 68566d182678..da258cb50210 100644
--- a/src/transformers/models/idefics2/processing_idefics2.py
+++ b/src/transformers/models/idefics2/processing_idefics2.py
@@ -255,6 +255,20 @@ def decode(self, *args, **kwargs):
         """
         return self.tokenizer.decode(*args, **kwargs)
 
+    def post_process_image_text_to_text(self, generated_outputs):
+        """
+        Post-process the output of the model to decode the text.
+
+        Args:
+            generated_outputs (`torch.Tensor` or `np.ndarray`):
+                The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
+                or `(sequence_length,)`.
+
+        Returns:
+            `List[str]`: The decoded text.
+ """ + return self.tokenizer.batch_decode(generated_outputs, skip_special_tokens=True) + @property def model_input_names(self): tokenizer_input_names = self.tokenizer.model_input_names diff --git a/src/transformers/models/idefics3/processing_idefics3.py b/src/transformers/models/idefics3/processing_idefics3.py index ceafa26a8b11..f5c23e256049 100644 --- a/src/transformers/models/idefics3/processing_idefics3.py +++ b/src/transformers/models/idefics3/processing_idefics3.py @@ -328,6 +328,20 @@ def decode(self, *args, **kwargs): decode_output = self.tokenizer.decode(*args, **kwargs) return self._regex_to_remove_extra_special_tokens.sub("", decode_output) + def post_process_image_text_to_text(self, generated_outputs): + """ + Post-process the output of the model to decode the text. + + Args: + generated_outputs (`torch.Tensor` or `np.ndarray`): + The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)` + or `(sequence_length,)`. + + Returns: + `List[str]`: The decoded text. + """ + return self.tokenizer.batch_decode(generated_outputs, skip_special_tokens=True) + @property def model_input_names(self): tokenizer_input_names = self.tokenizer.model_input_names diff --git a/src/transformers/models/instructblip/processing_instructblip.py b/src/transformers/models/instructblip/processing_instructblip.py index dc6c9deaf177..53ab4639dcbc 100644 --- a/src/transformers/models/instructblip/processing_instructblip.py +++ b/src/transformers/models/instructblip/processing_instructblip.py @@ -176,6 +176,20 @@ def decode(self, *args, **kwargs): """ return self.tokenizer.decode(*args, **kwargs) + def post_process_image_text_to_text(self, generated_outputs): + """ + Post-process the output of the model to decode the text. + + Args: + generated_outputs (`torch.Tensor` or `np.ndarray`): + The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)` + or `(sequence_length,)`. + + Returns: + `List[str]`: The decoded text. + """ + return self.tokenizer.batch_decode(generated_outputs, skip_special_tokens=True) + @property # Copied from transformers.models.blip.processing_blip.BlipProcessor.model_input_names def model_input_names(self): diff --git a/src/transformers/models/kosmos2/processing_kosmos2.py b/src/transformers/models/kosmos2/processing_kosmos2.py index 76108789718b..d7befd899f3a 100644 --- a/src/transformers/models/kosmos2/processing_kosmos2.py +++ b/src/transformers/models/kosmos2/processing_kosmos2.py @@ -428,6 +428,21 @@ def post_process_generation(self, text, cleanup_and_extract=True): return clean_text_and_extract_entities_with_bboxes(caption) return caption + def post_process_image_text_to_text(self, generated_outputs): + """ + Post-process the output of the model to decode the text. + + Args: + generated_outputs (`torch.Tensor` or `np.ndarray`): + The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)` + or `(sequence_length,)`. + + Returns: + `List[str]`: The decoded text. 
+ """ + generated_texts = self.batch_decode(generated_outputs, skip_special_tokens=True) + return [self.post_process_generation(text, cleanup_and_extract=False) for text in generated_texts] + @property # Copied from transformers.models.blip.processing_blip.BlipProcessor.model_input_names def model_input_names(self): diff --git a/src/transformers/models/llava/processing_llava.py b/src/transformers/models/llava/processing_llava.py index 8a9597892c60..d3fa5938af0c 100644 --- a/src/transformers/models/llava/processing_llava.py +++ b/src/transformers/models/llava/processing_llava.py @@ -182,6 +182,20 @@ def decode(self, *args, **kwargs): """ return self.tokenizer.decode(*args, **kwargs) + def post_process_image_text_to_text(self, generated_outputs): + """ + Post-process the output of the model to decode the text. + + Args: + generated_outputs (`torch.Tensor` or `np.ndarray`): + The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)` + or `(sequence_length,)`. + + Returns: + `List[str]`: The decoded text. + """ + return self.tokenizer.batch_decode(generated_outputs, skip_special_tokens=True) + @property # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names def model_input_names(self): diff --git a/src/transformers/models/llava_next/processing_llava_next.py b/src/transformers/models/llava_next/processing_llava_next.py index ce11be6d6309..81b5d124297f 100644 --- a/src/transformers/models/llava_next/processing_llava_next.py +++ b/src/transformers/models/llava_next/processing_llava_next.py @@ -222,6 +222,20 @@ def decode(self, *args, **kwargs): """ return self.tokenizer.decode(*args, **kwargs) + def post_process_image_text_to_text(self, generated_outputs): + """ + Post-process the output of the model to decode the text. + + Args: + generated_outputs (`torch.Tensor` or `np.ndarray`): + The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)` + or `(sequence_length,)`. + + Returns: + `List[str]`: The decoded text. + """ + return self.tokenizer.batch_decode(generated_outputs, skip_special_tokens=True) + @property # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names def model_input_names(self): diff --git a/src/transformers/models/llava_onevision/processing_llava_onevision.py b/src/transformers/models/llava_onevision/processing_llava_onevision.py index 039e05a7ec19..070fa033bf63 100644 --- a/src/transformers/models/llava_onevision/processing_llava_onevision.py +++ b/src/transformers/models/llava_onevision/processing_llava_onevision.py @@ -261,6 +261,20 @@ def decode(self, *args, **kwargs): """ return self.tokenizer.decode(*args, **kwargs) + def post_process_image_text_to_text(self, generated_outputs): + """ + Post-process the output of the model to decode the text. + + Args: + generated_outputs (`torch.Tensor` or `np.ndarray`): + The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)` + or `(sequence_length,)`. + + Returns: + `List[str]`: The decoded text. 
+ """ + return self.tokenizer.batch_decode(generated_outputs, skip_special_tokens=True) + @property # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names def model_input_names(self): diff --git a/src/transformers/models/mllama/processing_mllama.py b/src/transformers/models/mllama/processing_mllama.py index eb092f021f63..109509419ac7 100644 --- a/src/transformers/models/mllama/processing_mllama.py +++ b/src/transformers/models/mllama/processing_mllama.py @@ -342,6 +342,22 @@ def decode(self, *args, **kwargs): """ return self.tokenizer.decode(*args, **kwargs) + def post_process_image_text_to_text(self, generated_outputs): + """ + Post-process the output of the model to decode the text. + + Args: + generated_outputs (`torch.Tensor` or `np.ndarray`): + The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)` + or `(sequence_length,)`. + + Returns: + `List[str]`: The decoded text. + """ + return self.tokenizer.batch_decode( + generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False + ) + @property def model_input_names(self): tokenizer_input_names = self.tokenizer.model_input_names diff --git a/src/transformers/models/paligemma/processing_paligemma.py b/src/transformers/models/paligemma/processing_paligemma.py index 6bc95dc2fb8c..fa6594398a23 100644 --- a/src/transformers/models/paligemma/processing_paligemma.py +++ b/src/transformers/models/paligemma/processing_paligemma.py @@ -17,6 +17,7 @@ """ import logging +import warnings from typing import List, Optional, Union from ...feature_extraction_utils import BatchFeature @@ -96,6 +97,12 @@ def build_string_from_input(prompt, bos_token, image_seq_len, image_token, num_i image_token (`str`): The image token. num_images (`int`): Number of images in the prompt. """ + if image_token in prompt: + warnings.warn( + f"The image token {image_token} is already present in the prompt. No need to manually add {image_token} in the prompt for this model." + f" Removing all {image_token} and adding ({image_token}) * image_seq_len * num_images at the start of the prompt." + ) + prompt = prompt.replace(image_token, "") return f"{image_token * image_seq_len * num_images}{bos_token}{prompt}\n" @@ -339,6 +346,20 @@ def decode(self, *args, **kwargs): """ return self.tokenizer.decode(*args, **kwargs) + def post_process_image_text_to_text(self, generated_outputs): + """ + Post-process the output of the model to decode the text. + + Args: + generated_outputs (`torch.Tensor` or `np.ndarray`): + The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)` + or `(sequence_length,)`. + + Returns: + `List[str]`: The decoded text. + """ + return self.tokenizer.batch_decode(generated_outputs, skip_special_tokens=True) + @property # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names with CLIP->PaliGemma def model_input_names(self): diff --git a/src/transformers/models/pix2struct/processing_pix2struct.py b/src/transformers/models/pix2struct/processing_pix2struct.py index de8c594f94c9..31920aa26fe4 100644 --- a/src/transformers/models/pix2struct/processing_pix2struct.py +++ b/src/transformers/models/pix2struct/processing_pix2struct.py @@ -16,11 +16,13 @@ Processor class for Pix2Struct. 
""" +import logging from typing import List, Optional, Union from ...feature_extraction_utils import BatchFeature from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput +from ...utils.deprecation import deprecate_kwarg class Pix2StructImagesKwargs(ImagesKwargs, total=False): @@ -48,6 +50,9 @@ class Pix2StructProcessorKwargs(ProcessingKwargs, total=False): } +logger = logging.getLogger(__name__) + + class Pix2StructProcessor(ProcessorMixin): r""" Constructs a PIX2STRUCT processor which wraps a BERT tokenizer and PIX2STRUCT image processor into a single @@ -71,6 +76,7 @@ def __init__(self, image_processor, tokenizer): tokenizer.return_token_type_ids = False super().__init__(image_processor, tokenizer) + @deprecate_kwarg(old_name="legacy", version="5.0.0") def __call__( self, images=None, @@ -85,6 +91,14 @@ def __call__( Please refer to the docstring of the above two methods for more information. """ + legacy = kwargs.pop("legacy", True) + if legacy: + logger.warning( + "Legacy behavior is being used. The new behavior with legacy=False will be enabled in the future." + "In the new behavior, If both images and text are provided, image_processor is not a VQA processor, and `add_special_tokens` is unset, " + "the default value of `add_special_tokens` will be changed to `False` when calling the tokenizer." + ) + if images is None and text is None: raise ValueError("You have to specify either images or text.") @@ -93,8 +107,12 @@ def __call__( tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs, ) + add_special_tokens = output_kwargs["text_kwargs"].pop("add_special_tokens", None) # Get only text if images is None and not self.image_processor.is_vqa: + output_kwargs["text_kwargs"]["add_special_tokens"] = ( + add_special_tokens if add_special_tokens is not None else True + ) self.current_processor = self.tokenizer text_encoding = self.tokenizer(text=text, **output_kwargs["text_kwargs"]) return text_encoding @@ -108,6 +126,9 @@ def __call__( encoding_image_processor = self.image_processor(images, **output_kwargs["images_kwargs"]) if text is not None and not self.image_processor.is_vqa: + output_kwargs["text_kwargs"]["add_special_tokens"] = ( + add_special_tokens if add_special_tokens is not None else legacy + ) text_encoding = self.tokenizer(text=text, **output_kwargs["text_kwargs"]) if "attention_mask" in text_encoding: @@ -136,6 +157,20 @@ def decode(self, *args, **kwargs): """ return self.tokenizer.decode(*args, **kwargs) + def post_process_image_text_to_text(self, generated_outputs): + """ + Post-process the output of the model to decode the text. + + Args: + generated_outputs (`torch.Tensor` or `np.ndarray`): + The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)` + or `(sequence_length,)`. + + Returns: + `List[str]`: The decoded text. 
+ """ + return self.tokenizer.batch_decode(generated_outputs, skip_special_tokens=True) + @property def model_input_names(self): tokenizer_input_names = self.tokenizer.model_input_names diff --git a/src/transformers/models/pixtral/processing_pixtral.py b/src/transformers/models/pixtral/processing_pixtral.py index 70d28fb7b79c..7661da92e68b 100644 --- a/src/transformers/models/pixtral/processing_pixtral.py +++ b/src/transformers/models/pixtral/processing_pixtral.py @@ -274,6 +274,20 @@ def decode(self, *args, **kwargs): """ return self.tokenizer.decode(*args, **kwargs) + def post_process_image_text_to_text(self, generated_outputs): + """ + Post-process the output of the model to decode the text. + + Args: + generated_outputs (`torch.Tensor` or `np.ndarray`): + The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)` + or `(sequence_length,)`. + + Returns: + `List[str]`: The decoded text. + """ + return self.tokenizer.batch_decode(generated_outputs, skip_special_tokens=True) + @property # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names def model_input_names(self): diff --git a/src/transformers/models/qwen2_vl/processing_qwen2_vl.py b/src/transformers/models/qwen2_vl/processing_qwen2_vl.py index 6c0e8d98014e..b453b4078c7e 100644 --- a/src/transformers/models/qwen2_vl/processing_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/processing_qwen2_vl.py @@ -168,6 +168,22 @@ def decode(self, *args, **kwargs): """ return self.tokenizer.decode(*args, **kwargs) + def post_process_image_text_to_text(self, generated_outputs): + """ + Post-process the output of the model to decode the text. + + Args: + generated_outputs (`torch.Tensor` or `np.ndarray`): + The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)` + or `(sequence_length,)`. + + Returns: + `List[str]`: The decoded text. + """ + return self.tokenizer.batch_decode( + generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False + ) + @property def model_input_names(self): tokenizer_input_names = self.tokenizer.model_input_names