diff --git a/src/transformers/feature_extraction_utils.py b/src/transformers/feature_extraction_utils.py index 781a4d4603a9..f1b66f752da4 100644 --- a/src/transformers/feature_extraction_utils.py +++ b/src/transformers/feature_extraction_utils.py @@ -79,7 +79,8 @@ def __init__( skip_tensor_conversion: list[str] | set[str] | None = None, ): super().__init__(data) - self.convert_to_tensors(tensor_type=tensor_type, skip_tensor_conversion=skip_tensor_conversion) + self.skip_tensor_conversion = skip_tensor_conversion + self.convert_to_tensors(tensor_type=tensor_type) def __getitem__(self, item: str) -> Any: """ @@ -178,6 +179,9 @@ def convert_to_tensors( return self is_tensor, as_tensor = self._get_is_as_tensor_fns(tensor_type) + skip_tensor_conversion = ( + skip_tensor_conversion if skip_tensor_conversion is not None else self.skip_tensor_conversion + ) # Do the tensor conversion in batch for key, value in self.items(): diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 5778119424b4..934c4d9a2d3b 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -2312,6 +2312,14 @@ def _initialize_weights(self, module): if getattr(module, "_is_hf_initialized", False): return + if ( + (weight := getattr(module, "weight", None)) is not None + and getattr(weight, "_is_hf_initialized", False) + and not list(module.named_buffers()) + ): + module._is_hf_initialized = True + return + self._init_weights(module) module._is_hf_initialized = True @@ -4202,6 +4210,9 @@ def _finalize_model_loading( missing keys from meta device to their expected device, reinitializing missing weights according to proper distributions, tying the weights and logging the loading report.""" try: + # Adjust `all_tied_weights_keys` before marking them as initialized + model._adjust_tied_keys_with_tied_pointers(loading_info.missing_and_mismatched()) + # Marks tied weights as `_is_hf_initialized` to avoid initializing them (it's very 
important for efficiency) model.mark_tied_weights_as_initialized() @@ -4417,6 +4428,35 @@ def get_compiled_call(self, compile_config: CompileConfig | None) -> Callable: def is_backend_compatible(cls): return cls._supports_attention_backend + def _adjust_tied_keys_with_tied_pointers(self, missing_keys: list[str]) -> None: + """ + Adds keys to `self.all_tied_weights_keys` by checking if any group of params + share the same data ptr. It helps us support remote code where the weight tying is + done in old-T5 style, by manually assigning the same module to different param names. + If we don't add them back in `self.all_tied_weights_keys`, they will be re-initialized + and all params in tied group get random weights. + """ + param_pointers = defaultdict(list) + for param_name, param_value in self.state_dict().items(): + param_pointers[param_value.data_ptr()].append(param_name) + + # Filter out params that are already in `self.all_tied_weights_keys` or if all + # are missing params. Missing param groups share the same data ptr by being on `meta` + tied_param_names = [ + names + for names in param_pointers.values() + if len(names) > 1 + and not any(name in self.all_tied_weights_keys.keys() for name in names) + and not all(name in missing_keys for name in names) + ] + + # Create a dummy mapping, it doesn't matter which one is source/target + # because they are already tied + tied_weights_keys_by_pointers = { + param_name: group[0] for group in tied_param_names for param_name in group[1:] + } + self.all_tied_weights_keys.update(tied_weights_keys_by_pointers) + def _move_missing_keys_from_meta_to_device( self, missing_keys: list[str], diff --git a/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py index 08ec7cf4e6e3..3d2254e2acf7 100644 --- a/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py @@ -264,6 
+264,8 @@ class Qwen2_5OmniTextConfig(PreTrainedConfig): with longer `max_position_embeddings`. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + tie_word_embeddings (`bool`, *optional*, defaults to `True`): + Whether to tie weight embeddings Example: @@ -328,6 +330,7 @@ def __init__( pad_token_id: int | None = None, bos_token_id: int | None = None, eos_token_id: int | None = None, + tie_word_embeddings: bool | None = True, **kwargs, ): self.vocab_size = vocab_size @@ -342,6 +345,7 @@ def __init__( self.pad_token_id = pad_token_id self.bos_token_id = bos_token_id self.eos_token_id = eos_token_id + self.tie_word_embeddings = tie_word_embeddings # for backward compatibility if num_key_value_heads is None: diff --git a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py index c8b70ed0807e..f1ff53230297 100644 --- a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py @@ -304,6 +304,8 @@ class Qwen2_5OmniTextConfig(PreTrainedConfig): with longer `max_position_embeddings`. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 
+ tie_word_embeddings (`bool`, *optional*, defaults to `True`): + Whether to tie weight embeddings Example: @@ -368,6 +370,7 @@ def __init__( pad_token_id: int | None = None, bos_token_id: int | None = None, eos_token_id: int | None = None, + tie_word_embeddings: bool | None = True, **kwargs, ): self.vocab_size = vocab_size @@ -382,6 +385,7 @@ def __init__( self.pad_token_id = pad_token_id self.bos_token_id = bos_token_id self.eos_token_id = eos_token_id + self.tie_word_embeddings = tie_word_embeddings # for backward compatibility if num_key_value_heads is None: diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 5924099efe20..8c1199cb3f7f 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -228,15 +228,13 @@ class methods and docstrings. Attributes: do_convert_rgb (`bool`): - Whether to convert the video to RGB format. + Whether to convert the image to RGB format. do_resize (`bool`, *optional*): Whether to resize the image. size (`dict[str, int]`, *optional*): Resize the shorter side of the input to `size["shortest_edge"]`. crop_size (`dict[str, int]`, *optional*): Desired output size when applying center-cropping. - do_convert_rgb (`bool`): - Whether to convert the video to RGB format. resample (`PILImageResampling`, *optional*): Resampling filter to use if resizing the image. do_rescale (`bool`, *optional*): diff --git a/src/transformers/tokenization_python.py b/src/transformers/tokenization_python.py index de6326a7ccd4..9f8702f5b2a1 100644 --- a/src/transformers/tokenization_python.py +++ b/src/transformers/tokenization_python.py @@ -433,7 +433,7 @@ def __init__(self, **kwargs): # 5. Special tokens mask configuration # Patterns: "none", "cls_sep", "eos", "bos", "bos_eos", "cls_double_sep", "prefix_suffix" - self.special_tokens_pattern = kwargs.pop("special_tokens_pattern", "cls_sep") + self.special_tokens_pattern = kwargs.pop("special_tokens_pattern", None) # 6. 
Set backend to "custom" if not already set (for direct PreTrainedTokenizer subclasses) if "backend" not in kwargs: @@ -883,30 +883,62 @@ def build_inputs_with_special_tokens( """ if self.special_tokens_pattern == "cls_sep": # [CLS] seq0 [SEP] or [CLS] seq0 [SEP] seq1 [SEP] + if self.cls_token_id is None and self.sep_token_id is None: + raise ValueError( + "Cannot add special tokens following 'cls_sep' pattern because one or several special tokens " + f"are not defined (cls_token_id={self.cls_token_id}; sep_token_id={self.sep_token_id}). " + "Set the required special tokens in tokenizer or update `tokenizer.special_tokens_pattern`" + ) if token_ids_1 is None: return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + token_ids_1 + [self.sep_token_id] elif self.special_tokens_pattern == "eos": # seq0 [EOS] or seq0 [EOS] seq1 [EOS] + if self.eos_token_id is None: + raise ValueError( + "Cannot add special tokens following 'eos' pattern because eos token is not defined " + f"(eos_token_id={self.eos_token_id}). " + "Set the required special tokens in tokenizer or update `tokenizer.special_tokens_pattern`" + ) if token_ids_1 is None: return token_ids_0 + [self.eos_token_id] return token_ids_0 + [self.eos_token_id] + token_ids_1 + [self.eos_token_id] elif self.special_tokens_pattern == "bos": # [BOS] seq0 or [BOS] seq0 [BOS] seq1 + if self.bos_token_id is None: + raise ValueError( + "Cannot add special tokens following 'bos' pattern because bos token is not defined " + f"(bos_token_id={self.bos_token_id})."
+ " Set the required special tokens in tokenizer or update `tokenizer.special_tokens_pattern`" + ) if token_ids_1 is None: return [self.bos_token_id] + token_ids_0 return [self.bos_token_id] + token_ids_0 + [self.bos_token_id] + token_ids_1 elif self.special_tokens_pattern == "bos_eos": # [BOS] seq0 [EOS] or [BOS] seq0 [EOS] seq1 [EOS] + if self.bos_token_id is None and self.eos_token_id is None: + raise ValueError( + "Cannot add special tokens following 'bos_eos' pattern because one or several special tokens " + f"are not defined (bos_token_id={self.bos_token_id}; eos_token_id={self.eos_token_id}). " + "Set the required special tokens in tokenizer or update `tokenizer.special_tokens_pattern`" + ) + return token_ids_0 if token_ids_1 is None else token_ids_0 + token_ids_1 + if token_ids_1 is None: return [self.bos_token_id] + token_ids_0 + [self.eos_token_id] return [self.bos_token_id] + token_ids_0 + [self.eos_token_id] + token_ids_1 + [self.eos_token_id] elif self.special_tokens_pattern == "cls_double_sep": # [CLS] seq0 [SEP] or [CLS] seq0 [SEP] [SEP] seq1 [SEP] + if self.cls_token_id is None and self.sep_token_id is None: + raise ValueError( + "Cannot add special tokens following 'cls_double_sep' pattern because one or several special tokens " + f"are not defined (cls_token_id={self.cls_token_id}; sep_token_id={self.sep_token_id}). " + "Set the required special tokens in tokenizer or update `tokenizer.special_tokens_pattern`" + ) if token_ids_1 is None: return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] return (