huggingface · aijadugar · Oct 24, 2025 · Oct 24, 2025 · Oct 25, 2025 · Oct 25, 2025
diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py
@@ -654,30 +654,16 @@ def to_dict(self) -> dict[str, Any]:
         Returns:
             `dict[str, Any]`: Dictionary of all the attributes that make up this processor instance.
         """
-        output = copy.deepcopy(self.__dict__)
+        # Shallow copy (deepcopy can fail on some attributes); values stay shared with `self`, so do not mutate them.
+        output = self.__dict__.copy()  
 
-        # Get the kwargs in `__init__`.
         sig = inspect.signature(self.__init__)
-        # Only save the attributes that are presented in the kwargs of `__init__`.
-        # or in the attributes
-        attrs_to_save = list(sig.parameters) + self.__class__.attributes
-        # extra attributes to be kept
-        attrs_to_save += ["auto_map"]
-
-        if "tokenizer" in output:
-            del output["tokenizer"]
-        if "qformer_tokenizer" in output:
-            del output["qformer_tokenizer"]
-        if "protein_tokenizer" in output:
-            del output["protein_tokenizer"]
-        if "char_tokenizer" in output:
-            del output["char_tokenizer"]
-        if "chat_template" in output:
-            del output["chat_template"]
+        attrs_to_save = list(sig.parameters) + self.__class__.attributes + ["auto_map"]
+
+        for key in ["tokenizer", "qformer_tokenizer", "protein_tokenizer", "char_tokenizer", "chat_template"]:
+            output.pop(key, None)
 
         def save_public_processor_class(dictionary):
-            # make sure private name "_processor_class" is correctly
-            # saved as "processor_class"
             _processor_class = dictionary.pop("_processor_class", None)
             if _processor_class is not None:
                 dictionary["processor_class"] = _processor_class
@@ -687,33 +673,24 @@ def save_public_processor_class(dictionary):
             return dictionary
 
         def cast_array_to_list(dictionary):
-            """
-            Numpy arrays are not serialiazable but can be in pre-processing dicts.
-            This function casts arrays to list, recusring through the nested configs as well.
-            """
             for key, value in dictionary.items():
                 if isinstance(value, np.ndarray):
                     dictionary[key] = value.tolist()
                 elif isinstance(value, dict):
-                    dictionary[key] = cast_array_to_list(value)
+                    dictionary[key] = cast_array_to_list(dict(value))  # copy: nested dict is shared with self
             return dictionary
 
-        # Special case, add `audio_tokenizer` dict which points to model weights and path
         if "audio_tokenizer" in output:
             audio_tokenizer_dict = {
                 "audio_tokenizer_class": self.audio_tokenizer.__class__.__name__,
                 "audio_tokenizer_name_or_path": self.audio_tokenizer.name_or_path,
             }
             output["audio_tokenizer"] = audio_tokenizer_dict
 
-        # Serialize attributes as a dict
         output = {
             k: v.to_dict() if isinstance(v, PushToHubMixin) else v
             for k, v in output.items()
-            if (
-                k in attrs_to_save  # keep all attributes that have to be serialized
-                and v.__class__.__name__ != "BeamSearchDecoderCTC"  # remove attributes with that are objects
-            )
+            if k in attrs_to_save and v.__class__.__name__ != "BeamSearchDecoderCTC"
         }
         output = cast_array_to_list(output)
         output = save_public_processor_class(output)
@@ -794,10 +771,12 @@ def save_pretrained(self, save_directory, push_to_hub: bool = False, **kwargs):
             if hasattr(attribute, "_set_processor_class"):
                 attribute._set_processor_class(self.__class__.__name__)
 
-            # Save the tokenizer in its own vocab file. The other attributes are saved as part of `processor_config.json`
             if attribute_name == "tokenizer":
                 # Propagate save_jinja_files to tokenizer to ensure we don't get conflicts
                 attribute.save_pretrained(save_directory, save_jinja_files=save_jinja_files)
+            elif "tokenizer" in attribute_name and hasattr(attribute, "save_pretrained"):
+                extra_dir = os.path.join(save_directory, attribute_name)  # own sub-folder: no file clashes
+                attribute.save_pretrained(extra_dir, save_jinja_files=save_jinja_files)
             elif attribute._auto_class is not None:
                 custom_object_save(attribute, save_directory, config=attribute)
 
@@ -1450,7 +1429,11 @@ def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs)
             else:
                 attribute_class = cls.get_possibly_dynamic_module(class_name)
 
-            args.append(attribute_class.from_pretrained(pretrained_model_name_or_path, **kwargs))
+            # Prefer a local sub-folder named after the attribute when one exists;
+            # otherwise load from the root path as before.
+            attribute_path = os.path.join(pretrained_model_name_or_path, attribute_name)
+            load_path = attribute_path if os.path.isdir(attribute_path) else pretrained_model_name_or_path
+            args.append(attribute_class.from_pretrained(load_path, **kwargs))
 
         return args
 

diff --git a/tests/test_processor_utils.py b/tests/test_processor_utils.py
@@ -0,0 +1,38 @@
+import tempfile
+
+from transformers import AutoTokenizer, PreTrainedTokenizerBase, ProcessorMixin
+from transformers.testing_utils import TestCasePlus
+
+
+class ProcessorSavePretrainedMultipleAttributes(TestCasePlus):
+    """Round-trip test for a processor that holds more than one tokenizer attribute."""
+
+    def test_processor_loads_separate_attributes(self):
+        """Each tokenizer attribute must be reloaded as the tokenizer that was saved under that name."""
+
+        class OtherProcessor(ProcessorMixin):
+            name = "other-processor"
+
+            attributes = ["tokenizer1", "tokenizer2"]
+            tokenizer1_class = "AutoTokenizer"
+            tokenizer2_class = "AutoTokenizer"
+
+            def __init__(self, tokenizer1: PreTrainedTokenizerBase, tokenizer2: PreTrainedTokenizerBase):
+                super().__init__(tokenizer1=tokenizer1, tokenizer2=tokenizer2)
+
+        # Two public checkpoints whose tokenizers are of different classes and have different vocabularies.
+        tokenizer1 = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
+        tokenizer2 = AutoTokenizer.from_pretrained("openai-community/gpt2")
+
+        processor = OtherProcessor(tokenizer1=tokenizer1, tokenizer2=tokenizer2)
+        self.assertIsNot(type(processor.tokenizer1), type(processor.tokenizer2))
+
+        with tempfile.TemporaryDirectory() as temp_dir:
+            processor.save_pretrained(save_directory=temp_dir, push_to_hub=False)
+            new_processor = OtherProcessor.from_pretrained(temp_dir)
+
+        # Class and vocabulary are compared per attribute so that two attributes reloaded from the same files fail.
+        self.assertIs(type(new_processor.tokenizer1), type(tokenizer1))
+        self.assertIs(type(new_processor.tokenizer2), type(tokenizer2))
+        self.assertEqual(new_processor.tokenizer1.get_vocab(), tokenizer1.get_vocab())
+        self.assertEqual(new_processor.tokenizer2.get_vocab(), tokenizer2.get_vocab())