diff --git a/src/transformers/models/luke/tokenization_luke.py b/src/transformers/models/luke/tokenization_luke.py index 04a63e8f159b..bed415c79291 100644 --- a/src/transformers/models/luke/tokenization_luke.py +++ b/src/transformers/models/luke/tokenization_luke.py @@ -418,18 +418,7 @@ def _decode( else self.clean_up_tokenization_spaces ) if clean_up_tokenization_spaces: - text = ( - text.replace(" .", ".") - .replace(" ?", "?") - .replace(" !", "!") - .replace(" ,", ",") - .replace(" ' ", "'") - .replace(" n't", "n't") - .replace(" 'm", "'m") - .replace(" 's", "'s") - .replace(" 've", "'ve") - .replace(" 're", "'re") - ) + text = self.clean_up_tokenization(text) return text diff --git a/src/transformers/models/plbart/tokenization_plbart.py b/src/transformers/models/plbart/tokenization_plbart.py index 6020f13bfd59..f848e25c24c7 100644 --- a/src/transformers/models/plbart/tokenization_plbart.py +++ b/src/transformers/models/plbart/tokenization_plbart.py @@ -334,30 +334,6 @@ def _convert_lang_code_special_format(self, lang: str) -> str: lang = FAIRSEQ_LANGUAGE_CODES_MAP.get(lang, lang) return lang - def clean_up_tokenization(self, out_string: str) -> str: - """ - Clean up a list of simple English tokenization artifacts like spaces before punctuations and abbreviated forms. - - Args: - out_string (`str`): The text to clean up. - - Returns: - `str`: The cleaned-up string. - """ - out_string = ( - out_string.replace(" .", ".") - .replace(" ?", "?") - .replace(" !", "!") - .replace(" ,", ",") - .replace(" ' ", "'") - .replace(" n't", "n't") - .replace(" 'm", "'m") - .replace(" 's", "'s") - .replace(" 've", "'ve") - .replace(" 're", "'re") - ) - return out_string - def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=None, **kwargs): """Override to use self.clean_up_tokenization_spaces as default for batched input.""" return super().decode( diff --git a/src/transformers/models/wav2vec2/tokenization_wav2vec2.py b/src/transformers/models/wav2vec2/tokenization_wav2vec2.py index 821b4a5b9b26..e529d576edd1 100644 --- a/src/transformers/models/wav2vec2/tokenization_wav2vec2.py +++ b/src/transformers/models/wav2vec2/tokenization_wav2vec2.py @@ -356,31 +356,6 @@ def convert_tokens_to_string( return {"text": string, "char_offsets": char_offsets, "word_offsets": word_offsets} - @staticmethod - def clean_up_tokenization(out_string: str) -> str: - """ - Clean up a list of simple English tokenization artifacts like spaces before punctuations and abbreviated forms. - - Args: - out_string (`str`): The text to clean up. - - Returns: - `str`: The cleaned-up string. - """ - out_string = ( - out_string.replace(" .", ".") - .replace(" ?", "?") - .replace(" !", "!") - .replace(" ,", ",") - .replace(" ' ", "'") - .replace(" n't", "n't") - .replace(" 'm", "'m") - .replace(" 's", "'s") - .replace(" 've", "'ve") - .replace(" 're", "'re") - ) - return out_string - @staticmethod def _compute_offsets(char_repetitions: list[int], chars: list[str], ctc_token: int) -> list[dict[str, str | int]]: end_indices = np.asarray(char_repetitions).cumsum() diff --git a/src/transformers/tokenization_mistral_common.py b/src/transformers/tokenization_mistral_common.py index a29ad5c59587..e44b8436bddf 100644 --- a/src/transformers/tokenization_mistral_common.py +++ b/src/transformers/tokenization_mistral_common.py @@ -455,23 +455,7 @@ def _decode( else self.clean_up_tokenization_spaces ) if clean_up_tokenization_spaces: - # Call custom cleanup method if it exists (e.g., for CLVP's [SPACE] token replacement) - if hasattr(self, "clean_up_tokenization") and callable(self.clean_up_tokenization): - text = self.clean_up_tokenization(text) - else: - # Otherwise apply standard cleanup - text = ( - text.replace(" .", ".") - .replace(" ?", "?") - .replace(" !", "!") - .replace(" ,", ",") - .replace(" ' ", "'") - .replace(" n't", "n't") - .replace(" 'm", "'m") - .replace(" 's", "'s") - .replace(" 've", "'ve") - .replace(" 're", "'re") - ) + text = self.clean_up_tokenization(text) return _maybe_remove_lang(text=text, skip_special_tokens=skip_special_tokens) diff --git a/src/transformers/tokenization_python.py b/src/transformers/tokenization_python.py index 9f8702f5b2a1..5254dd89ecd2 100644 --- a/src/transformers/tokenization_python.py +++ b/src/transformers/tokenization_python.py @@ -1108,23 +1108,7 @@ def _decode( else self.clean_up_tokenization_spaces ) if clean_up_tokenization_spaces: - # Call custom cleanup method if it exists (e.g., for CLVP's [SPACE] token replacement) - if hasattr(self, "clean_up_tokenization") and callable(self.clean_up_tokenization): - text = self.clean_up_tokenization(text) - else: - # Otherwise apply standard cleanup - text = ( - text.replace(" .", ".") - .replace(" ?", "?") - .replace(" !", "!") - .replace(" ,", ",") - .replace(" ' ", "'") - .replace(" n't", "n't") - .replace(" 'm", "'m") - .replace(" 's", "'s") - .replace(" 've", "'ve") - .replace(" 're", "'re") - ) + text = self.clean_up_tokenization(text) return text diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 89433159b183..1a9cc2447c7a 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -2134,6 +2134,27 @@ def _save_pretrained( return file_names + vocab_files + (added_tokens_file,) + def clean_up_tokenization(self, text: str) -> str: + """ + Clean up tokenization spaces in a given text. + This method is mostly for remote code support. + + """ + + text = ( + text.replace(" .", ".") + .replace(" ?", "?") + .replace(" !", "!") + .replace(" ,", ",") + .replace(" ' ", "'") + .replace(" n't", "n't") + .replace(" 'm", "'m") + .replace(" 's", "'s") + .replace(" 've", "'ve") + .replace(" 're", "'re") + ) + return text + def save_vocabulary(self, save_directory: str, filename_prefix: str | None = None) -> tuple[str, ...]: """ Save only the vocabulary of the tokenizer (vocabulary + added tokens). diff --git a/src/transformers/tokenization_utils_tokenizers.py b/src/transformers/tokenization_utils_tokenizers.py index 446b7ebd989c..47b4e91559b9 100644 --- a/src/transformers/tokenization_utils_tokenizers.py +++ b/src/transformers/tokenization_utils_tokenizers.py @@ -939,23 +939,7 @@ def _decode( else self.clean_up_tokenization_spaces ) if clean_up_tokenization_spaces: - # Call custom cleanup method if it exists - if hasattr(self, "clean_up_tokenization") and callable(self.clean_up_tokenization): - text = self.clean_up_tokenization(text) - else: - # Apply standard cleanup - text = ( - text.replace(" .", ".") - .replace(" ?", "?") - .replace(" !", "!") - .replace(" ,", ",") - .replace(" ' ", "'") - .replace(" n't", "n't") - .replace(" 'm", "'m") - .replace(" 's", "'s") - .replace(" 've", "'ve") - .replace(" 're", "'re") - ) + text = self.clean_up_tokenization(text) return text