-
Notifications
You must be signed in to change notification settings - Fork 33.6k
multiple tokenizers with different filenames can save now #41837
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
73f91c5
c2ab93f
93731c5
c94405a
67c3bb7
8aca431
560067e
68caa7d
4b2c049
9e4b141
1ffb4d3
8203b0e
e1ce3e7
182ffd2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -654,30 +654,16 @@ def to_dict(self) -> dict[str, Any]: | |
| Returns: | ||
| `dict[str, Any]`: Dictionary of all the attributes that make up this processor instance. | ||
| """ | ||
| output = copy.deepcopy(self.__dict__) | ||
| # shallow copy to avoid deepcopy errors | ||
| output = self.__dict__.copy() | ||
|
|
||
| # Get the kwargs in `__init__`. | ||
| sig = inspect.signature(self.__init__) | ||
| # Only save the attributes that are presented in the kwargs of `__init__`. | ||
| # or in the attributes | ||
| attrs_to_save = list(sig.parameters) + self.__class__.attributes | ||
| # extra attributes to be kept | ||
| attrs_to_save += ["auto_map"] | ||
|
|
||
| if "tokenizer" in output: | ||
| del output["tokenizer"] | ||
| if "qformer_tokenizer" in output: | ||
| del output["qformer_tokenizer"] | ||
| if "protein_tokenizer" in output: | ||
| del output["protein_tokenizer"] | ||
| if "char_tokenizer" in output: | ||
| del output["char_tokenizer"] | ||
| if "chat_template" in output: | ||
| del output["chat_template"] | ||
| attrs_to_save = list(sig.parameters) + self.__class__.attributes + ["auto_map"] | ||
|
|
||
| for key in ["tokenizer", "qformer_tokenizer", "protein_tokenizer", "char_tokenizer", "chat_template"]: | ||
| output.pop(key, None) | ||
|
|
||
| def save_public_processor_class(dictionary): | ||
| # make sure private name "_processor_class" is correctly | ||
| # saved as "processor_class" | ||
| _processor_class = dictionary.pop("_processor_class", None) | ||
| if _processor_class is not None: | ||
| dictionary["processor_class"] = _processor_class | ||
|
|
@@ -687,33 +673,24 @@ def save_public_processor_class(dictionary): | |
| return dictionary | ||
|
|
||
| def cast_array_to_list(dictionary): | ||
| """ | ||
| Numpy arrays are not serializable but can be in pre-processing dicts. | ||
| This function casts arrays to list, recursing through the nested configs as well. | ||
| """ | ||
|
aijadugar marked this conversation as resolved.
|
||
| for key, value in dictionary.items(): | ||
| if isinstance(value, np.ndarray): | ||
| dictionary[key] = value.tolist() | ||
| elif isinstance(value, dict): | ||
| dictionary[key] = cast_array_to_list(value) | ||
| return dictionary | ||
|
|
||
| # Special case, add `audio_tokenizer` dict which points to model weights and path | ||
|
aijadugar marked this conversation as resolved.
|
||
| if "audio_tokenizer" in output: | ||
| audio_tokenizer_dict = { | ||
| "audio_tokenizer_class": self.audio_tokenizer.__class__.__name__, | ||
| "audio_tokenizer_name_or_path": self.audio_tokenizer.name_or_path, | ||
| } | ||
| output["audio_tokenizer"] = audio_tokenizer_dict | ||
|
|
||
| # Serialize attributes as a dict | ||
|
aijadugar marked this conversation as resolved.
|
||
| output = { | ||
| k: v.to_dict() if isinstance(v, PushToHubMixin) else v | ||
| for k, v in output.items() | ||
| if ( | ||
| k in attrs_to_save # keep all attributes that have to be serialized | ||
|
aijadugar marked this conversation as resolved.
|
||
| and v.__class__.__name__ != "BeamSearchDecoderCTC" # remove attributes with that are objects | ||
| ) | ||
| if k in attrs_to_save and v.__class__.__name__ != "BeamSearchDecoderCTC" | ||
| } | ||
| output = cast_array_to_list(output) | ||
| output = save_public_processor_class(output) | ||
|
|
@@ -794,10 +771,12 @@ def save_pretrained(self, save_directory, push_to_hub: bool = False, **kwargs): | |
| if hasattr(attribute, "_set_processor_class"): | ||
| attribute._set_processor_class(self.__class__.__name__) | ||
|
|
||
| # Save the tokenizer in its own vocab file. The other attributes are saved as part of `processor_config.json` | ||
| if attribute_name == "tokenizer": | ||
| # Propagate save_jinja_files to tokenizer to ensure we don't get conflicts | ||
| attribute.save_pretrained(save_directory, save_jinja_files=save_jinja_files) | ||
| # if attribute is a tokenizer, then save it in its own file to avoid overwriting | ||
| if hasattr(attribute, "save_pretrained"): | ||
| # use the attribute_name as prefix to create a unique file | ||
| attribute_save_dir = os.path.join(save_directory, attribute_name) | ||
| os.makedirs(attribute_save_dir, exist_ok=True) | ||
| attribute.save_pretrained(attribute_save_dir, save_jinja_files=save_jinja_files) | ||
|
Comment on lines
+790
to
+792
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Indeed, Evolla already uses the subdir approach. However, I'd reserve that for multi-tokenizer models only, checking with |
||
| elif attribute._auto_class is not None: | ||
| custom_object_save(attribute, save_directory, config=attribute) | ||
|
|
||
|
|
@@ -1450,7 +1429,14 @@ def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs) | |
| else: | ||
| attribute_class = cls.get_possibly_dynamic_module(class_name) | ||
|
|
||
| args.append(attribute_class.from_pretrained(pretrained_model_name_or_path, **kwargs)) | ||
| # updated loading path for handling multiple tokenizers | ||
| attribute_path = os.path.join(pretrained_model_name_or_path, attribute_name) | ||
| if os.path.isdir(attribute_path): | ||
| # load from its attribute's-specific folder | ||
| args.append(attribute_class.from_pretrained(attribute_path, **kwargs)) | ||
| else: | ||
| # now fallback to original path | ||
| args.append(attribute_class.from_pretrained(pretrained_model_name_or_path, **kwargs)) | ||
|
|
||
| return args | ||
|
|
||
|
|
||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This does not need to be in an entirely new file; it can be in |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,37 @@ | ||
| import tempfile | ||
|
|
||
| from transformers.testing_utils import TestCasePlus | ||
| from transformers import ProcessorMixin, AutoTokenizer, PreTrainedTokenizer | ||
|
|
||
|
|
||
| class ProcessorSavePretrainedMultipleAttributes(TestCasePlus): | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The test case |
||
| def test_processor_loads_separate_attributes(self): | ||
| class OtherProcessor(ProcessorMixin): | ||
| name = "other-processor" | ||
|
|
||
| attributes = [ | ||
| "tokenizer1", | ||
| "tokenizer2", | ||
| ] | ||
| tokenizer1_class = "AutoTokenizer" | ||
| tokenizer2_class = "AutoTokenizer" | ||
|
|
||
| def __init__(self, | ||
| tokenizer1: PreTrainedTokenizer, | ||
| tokenizer2: PreTrainedTokenizer | ||
| ): | ||
| super().__init__(tokenizer1=tokenizer1, | ||
| tokenizer2=tokenizer2) | ||
|
|
||
| tokenizer1 = AutoTokenizer.from_pretrained("google/gemma-3-270m") | ||
| tokenizer2 = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-1.7B") | ||
|
|
||
| processor = OtherProcessor(tokenizer1=tokenizer1, | ||
| tokenizer2=tokenizer2) | ||
| assert processor.tokenizer1.__class__ != processor.tokenizer2.__class__ | ||
|
|
||
| with tempfile.TemporaryDirectory() as temp_dir: | ||
| processor.save_pretrained(save_directory=temp_dir, push_to_hub=False) | ||
| new_processor = OtherProcessor.from_pretrained(temp_dir) | ||
|
|
||
| assert new_processor.tokenizer1.__class__ != new_processor.tokenizer2.__class__ | ||
Uh oh!
There was an error while loading. Please reload this page.