13 changes: 1 addition & 12 deletions src/transformers/models/luke/tokenization_luke.py
@@ -418,18 +418,7 @@ def _decode(
             else self.clean_up_tokenization_spaces
         )
         if clean_up_tokenization_spaces:
-            text = (
-                text.replace(" .", ".")
-                .replace(" ?", "?")
-                .replace(" !", "!")
-                .replace(" ,", ",")
-                .replace(" ' ", "'")
-                .replace(" n't", "n't")
-                .replace(" 'm", "'m")
-                .replace(" 's", "'s")
-                .replace(" 've", "'ve")
-                .replace(" 're", "'re")
-            )
+            text = self.clean_up_tokenization(text)
 
         return text
24 changes: 0 additions & 24 deletions src/transformers/models/plbart/tokenization_plbart.py
@@ -334,30 +334,6 @@ def _convert_lang_code_special_format(self, lang: str) -> str:
         lang = FAIRSEQ_LANGUAGE_CODES_MAP.get(lang, lang)
         return lang
 
-    def clean_up_tokenization(self, out_string: str) -> str:
-        """
-        Clean up a list of simple English tokenization artifacts like spaces before punctuations and abbreviated forms.
-
-        Args:
-            out_string (`str`): The text to clean up.
-
-        Returns:
-            `str`: The cleaned-up string.
-        """
-        out_string = (
-            out_string.replace(" .", ".")
-            .replace(" ?", "?")
-            .replace(" !", "!")
-            .replace(" ,", ",")
-            .replace(" ' ", "'")
-            .replace(" n't", "n't")
-            .replace(" 'm", "'m")
-            .replace(" 's", "'s")
-            .replace(" 've", "'ve")
-            .replace(" 're", "'re")
-        )
-        return out_string
-
     def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=None, **kwargs):
         """Override to use self.clean_up_tokenization_spaces as default for batched input."""
         return super().decode(
25 changes: 0 additions & 25 deletions src/transformers/models/wav2vec2/tokenization_wav2vec2.py
@@ -356,31 +356,6 @@ def convert_tokens_to_string(
 
         return {"text": string, "char_offsets": char_offsets, "word_offsets": word_offsets}
 
-    @staticmethod
-    def clean_up_tokenization(out_string: str) -> str:
-        """
-        Clean up a list of simple English tokenization artifacts like spaces before punctuations and abbreviated forms.
-
-        Args:
-            out_string (`str`): The text to clean up.
-
-        Returns:
-            `str`: The cleaned-up string.
-        """
-        out_string = (
-            out_string.replace(" .", ".")
-            .replace(" ?", "?")
-            .replace(" !", "!")
-            .replace(" ,", ",")
-            .replace(" ' ", "'")
-            .replace(" n't", "n't")
-            .replace(" 'm", "'m")
-            .replace(" 's", "'s")
-            .replace(" 've", "'ve")
-            .replace(" 're", "'re")
-        )
-        return out_string
-
     @staticmethod
     def _compute_offsets(char_repetitions: list[int], chars: list[str], ctc_token: int) -> list[dict[str, str | int]]:
         end_indices = np.asarray(char_repetitions).cumsum()
18 changes: 1 addition & 17 deletions src/transformers/tokenization_mistral_common.py
@@ -455,23 +455,7 @@ def _decode(
             else self.clean_up_tokenization_spaces
         )
         if clean_up_tokenization_spaces:
-            # Call custom cleanup method if it exists (e.g., for CLVP's [SPACE] token replacement)
-            if hasattr(self, "clean_up_tokenization") and callable(self.clean_up_tokenization):
-                text = self.clean_up_tokenization(text)
-            else:
-                # Otherwise apply standard cleanup
-                text = (
-                    text.replace(" .", ".")
-                    .replace(" ?", "?")
-                    .replace(" !", "!")
-                    .replace(" ,", ",")
-                    .replace(" ' ", "'")
-                    .replace(" n't", "n't")
-                    .replace(" 'm", "'m")
-                    .replace(" 's", "'s")
-                    .replace(" 've", "'ve")
-                    .replace(" 're", "'re")
-                )
+            text = self.clean_up_tokenization(text)
 
         return _maybe_remove_lang(text=text, skip_special_tokens=skip_special_tokens)
18 changes: 1 addition & 17 deletions src/transformers/tokenization_python.py
@@ -1108,23 +1108,7 @@ def _decode(
             else self.clean_up_tokenization_spaces
         )
         if clean_up_tokenization_spaces:
-            # Call custom cleanup method if it exists (e.g., for CLVP's [SPACE] token replacement)
-            if hasattr(self, "clean_up_tokenization") and callable(self.clean_up_tokenization):
-                text = self.clean_up_tokenization(text)
-            else:
-                # Otherwise apply standard cleanup
-                text = (
-                    text.replace(" .", ".")
-                    .replace(" ?", "?")
-                    .replace(" !", "!")
-                    .replace(" ,", ",")
-                    .replace(" ' ", "'")
-                    .replace(" n't", "n't")
-                    .replace(" 'm", "'m")
-                    .replace(" 's", "'s")
-                    .replace(" 've", "'ve")
-                    .replace(" 're", "'re")
-                )
+            text = self.clean_up_tokenization(text)
 
         return text
21 changes: 21 additions & 0 deletions src/transformers/tokenization_utils_base.py
@@ -2134,6 +2134,27 @@ def _save_pretrained(
 
         return file_names + vocab_files + (added_tokens_file,)
 
+    def clean_up_tokenization(self, text: str) -> str:
+        """
+        Clean up tokenization spaces in a given text.
+        This method is mostly for remote code support.
+
+        """
+
+        text = (
+            text.replace(" .", ".")
+            .replace(" ?", "?")
+            .replace(" !", "!")
+            .replace(" ,", ",")
+            .replace(" ' ", "'")
+            .replace(" n't", "n't")
+            .replace(" 'm", "'m")
+            .replace(" 's", "'s")
+            .replace(" 've", "'ve")
+            .replace(" 're", "'re")
+        )
+        return text
+
     def save_vocabulary(self, save_directory: str, filename_prefix: str | None = None) -> tuple[str, ...]:
         """
         Save only the vocabulary of the tokenizer (vocabulary + added tokens).
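For illustration, here is a minimal standalone sketch of what the new base-class method does to a typical decoded string. The free function and the sample text below are invented for this example and are not part of the PR; they simply mirror the replace chain added to tokenization_utils_base.py above.

def clean_up_tokenization(text: str) -> str:
    # Collapse the space that word-level decoding leaves before punctuation
    # and before English contraction suffixes.
    text = (
        text.replace(" .", ".")
        .replace(" ?", "?")
        .replace(" !", "!")
        .replace(" ,", ",")
        .replace(" ' ", "'")
        .replace(" n't", "n't")
        .replace(" 'm", "'m")
        .replace(" 's", "'s")
        .replace(" 've", "'ve")
        .replace(" 're", "'re")
    )
    return text

print(clean_up_tokenization("he 's right , is n't he ?"))
# -> he's right, isn't he?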
18 changes: 1 addition & 17 deletions src/transformers/tokenization_utils_tokenizers.py
@@ -939,23 +939,7 @@ def _decode(
             else self.clean_up_tokenization_spaces
         )
         if clean_up_tokenization_spaces:
-            # Call custom cleanup method if it exists
-            if hasattr(self, "clean_up_tokenization") and callable(self.clean_up_tokenization):
-                text = self.clean_up_tokenization(text)
-            else:
-                # Apply standard cleanup
-                text = (
-                    text.replace(" .", ".")
-                    .replace(" ?", "?")
-                    .replace(" !", "!")
-                    .replace(" ,", ",")
-                    .replace(" ' ", "'")
-                    .replace(" n't", "n't")
-                    .replace(" 'm", "'m")
-                    .replace(" 's", "'s")
-                    .replace(" 've", "'ve")
-                    .replace(" 're", "'re")
-                )
+            text = self.clean_up_tokenization(text)
 
         return text
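Because _decode now calls self.clean_up_tokenization(text) unconditionally, a model-specific or remote-code tokenizer customizes cleanup by ordinary method overriding rather than via the removed hasattr check. The sketch below is hypothetical and not code from this PR: the stand-in base class exists only to keep the example self-contained, and the subclass imitates the "[SPACE]" handling that the removed comments mention for CLVP.

class _BaseTokenizer:
    # Stand-in for the real base class, reduced to the method at issue.
    def clean_up_tokenization(self, text: str) -> str:
        for before, after in [(" .", "."), (" ?", "?"), (" !", "!"), (" ,", ","),
                              (" ' ", "'"), (" n't", "n't"), (" 'm", "'m"),
                              (" 's", "'s"), (" 've", "'ve"), (" 're", "'re")]:
            text = text.replace(before, after)
        return text

class ClvpLikeTokenizer(_BaseTokenizer):
    # Hypothetical override: undo a model-specific space marker, then defer
    # to the shared cleanup inherited from the base class.
    def clean_up_tokenization(self, text: str) -> str:
        text = text.replace("[SPACE]", " ")
        return super().clean_up_tokenization(text)

print(ClvpLikeTokenizer().clean_up_tokenization("hello[SPACE]world !"))
# -> hello world!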