diff --git a/examples/modular-transformers/modeling_new_task_model.py b/examples/modular-transformers/modeling_new_task_model.py index 72642ad9cfab..82dffeebc411 100644 --- a/examples/modular-transformers/modeling_new_task_model.py +++ b/examples/modular-transformers/modeling_new_task_model.py @@ -336,7 +336,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict # Replace image id with PAD if the image token if OOV, to avoid index-errors if input_ids is not None and self.config.image_token_id >= self.vocab_size: diff --git a/setup.py b/setup.py index 89c4375716e6..1dae98ed8a72 100644 --- a/setup.py +++ b/setup.py @@ -86,7 +86,7 @@ "fugashi>=1.0", "GitPython<3.1.19", "hf-doc-builder>=0.3.0", - "huggingface-hub>=1.3.0,<2.0", + "huggingface-hub>=1.5.0,<2.0", "ipadic>=1.0.0,<2.0", "jinja2>=3.1.0", "jmespath>=1.0.1", diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py index 859355d37925..408d53e5a24a 100755 --- a/src/transformers/configuration_utils.py +++ b/src/transformers/configuration_utils.py @@ -18,9 +18,12 @@ import json import math import os -from typing import TYPE_CHECKING, Any, TypeVar, Union +from collections.abc import Sequence +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, ClassVar, Literal, TypeVar, Union from huggingface_hub import create_repo +from huggingface_hub.dataclasses import strict from packaging import version from . import __version__ @@ -54,6 +57,21 @@ _FLOAT_TAG_VALUES = {"Infinity": float("inf"), "-Infinity": float("-inf"), "NaN": float("nan")} +ALLOWED_LAYER_TYPES = ( + "full_attention", + "sliding_attention", + "chunked_attention", + "linear_attention", # used in minimax + "conv", # used in LFMv2 + "mamba", + "attention", + "sparse", + "dense", +) + + +@strict(accept_kwargs=True) +@dataclass(repr=False) class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin): # no-format r""" @@ -143,107 +161,67 @@ class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin): `float16` weights. 
""" - model_type: str = "" - base_config_key: str = "" - sub_configs: dict[str, type["PreTrainedConfig"]] = {} - has_no_defaults_at_init: bool = False - attribute_map: dict[str, str] = {} - base_model_tp_plan: dict[str, Any] | None = None - base_model_pp_plan: dict[str, tuple[list[str]]] | None = None - base_model_ep_plan: dict[str, tuple[list[str]]] | None = None - _auto_class: str | None = None - - def __setattr__(self, key, value): - if key in super().__getattribute__("attribute_map"): - key = super().__getattribute__("attribute_map")[key] - super().__setattr__(key, value) - - def __getattribute__(self, key): - if key != "attribute_map" and key in super().__getattribute__("attribute_map"): - key = super().__getattribute__("attribute_map")[key] - return super().__getattribute__(key) - - def __init__( - self, - *, - # All models common arguments - output_hidden_states: bool = False, - output_attentions: bool = False, - return_dict: bool = True, - dtype: Union[str, "torch.dtype"] | None = None, - # Common arguments - chunk_size_feed_forward: int = 0, - is_encoder_decoder: bool = False, - # Fine-tuning task arguments - architectures: list[str] | None = None, - id2label: dict[int, str] | None = None, - label2id: dict[str, int] | None = None, - num_labels: int | None = None, - problem_type: str | None = None, - **kwargs, - ): - # Validation for some arguments - if label2id is not None and not isinstance(label2id, dict): - raise ValueError("Argument label2id should be a dictionary.") - if id2label is not None and not isinstance(id2label, dict): - raise ValueError("Argument id2label should be a dictionary.") - if num_labels is not None and id2label is not None and len(id2label) != num_labels: - logger.warning( - f"You passed `num_labels={num_labels}` which is incompatible to " - f"the `id2label` map of length `{len(id2label)}`." - ) - if problem_type is not None and problem_type not in ( - "regression", - "single_label_classification", - "multi_label_classification", - ): - raise ValueError( - f"The config parameter `problem_type` was not understood: received {problem_type} " - "but only 'regression', 'single_label_classification' and 'multi_label_classification' are valid." 
-            )
+    base_config_key: ClassVar[str] = ""
+    sub_configs: ClassVar[dict[str, type["PreTrainedConfig"]]] = {}
+    has_no_defaults_at_init: ClassVar[bool] = False
+    keys_to_ignore_at_inference: ClassVar[list[str]] = []
+    attribute_map: ClassVar[dict[str, str]] = {}
+    base_model_tp_plan: ClassVar[dict[str, Any] | None] = None
+    base_model_pp_plan: ClassVar[dict[str, Sequence[list[str]]] | None] = None
+    base_model_ep_plan: ClassVar[dict[str, Sequence[list[str]]] | None] = None
+    _auto_class: ClassVar[str | None] = None
+
+    # Attributes set for all models internally when saving
+    model_type: ClassVar[str] = ""
+    transformers_version: ClassVar[str | None] = None
+    architectures: ClassVar[list[str] | None] = None
+
+    # Common attributes for all models
+    output_hidden_states: bool | None = False
+    return_dict: bool | None = True
+    dtype: Union[str, "torch.dtype"] | None = None
+    chunk_size_feed_forward: int = 0
+    is_encoder_decoder: bool = False
+
+    # Fine-tuning task arguments
+    id2label: dict[int, str] | dict[str, str] | None = None
+    label2id: dict[str, int] | dict[str, str] | None = None
+    problem_type: Literal["regression", "single_label_classification", "multi_label_classification"] | None = None
+
+    # Tokenizer kwargs
+    tokenizer_class: str | None = None
+
+    def __post_init__(self, **kwargs):
         # BC for the `torch_dtype` argument instead of the simpler `dtype`
         # Do not warn, as it would otherwise always be triggered since most configs on the hub have `torch_dtype`
         if (torch_dtype := kwargs.pop("torch_dtype", None)) is not None:
             # If both are provided, keep `dtype`
-            dtype = dtype if dtype is not None else torch_dtype
-        if dtype is not None and isinstance(dtype, str) and is_torch_available():
+            self.dtype = self.dtype if self.dtype is not None else torch_dtype
+        if self.dtype is not None and isinstance(self.dtype, str) and is_torch_available():
             # we will start using self.dtype in v5, but to be consistent with
             # from_pretrained's dtype arg convert it to an actual torch.dtype object
             import torch

-            dtype = getattr(torch, dtype)
-
-        # BC for rotary embeddings. We will pop out legacy keys from kwargs and rename to new format
-        if hasattr(self, "rope_parameters"):
-            ignore_keys_at_rope_validation = kwargs.pop("ignore_keys_at_rope_validation", None)
-            kwargs = self.convert_rope_params_to_dict(
-                ignore_keys_at_rope_validation=ignore_keys_at_rope_validation, **kwargs
-            )
-
-        # Attributes common for all models
-        self.return_dict = return_dict
-        self.output_hidden_states = output_hidden_states
-        self.dtype = dtype
-        self._output_attentions = output_attentions  # has public property
-
-        # Less common kwargs, only used by some models
-        self.chunk_size_feed_forward = chunk_size_feed_forward
-
-        # Encoder-decoder models attributes
-        self.is_encoder_decoder = is_encoder_decoder
-
-        # Fine-tuning task attributes
-        self.architectures = architectures
-        self.id2label = id2label
-        self.label2id = label2id
-        self.problem_type = problem_type
+            self.dtype = getattr(torch, self.dtype)
+        # Keep the default value of `num_labels=2` in case users have saved a classifier with 2 labels.
+        # Our configs previously wouldn't save `id2label` for 2 labels because it is the default. In all other
+        # cases we expect the config dict to have an `id2label` field if it's a classification model, or not otherwise
         if self.id2label is None:
-            self._create_id_label_maps(num_labels if num_labels is not None else 2)
+            self.num_labels = kwargs.get("num_labels", 2)
         else:
-            # Keys are always strings in JSON so convert ids to int here.
+ if kwargs.get("num_labels") is not None and len(self.id2label) != kwargs.get("num_labels"): + logger.warning( + f"You passed `num_labels={kwargs.get('num_labels')}` which is incompatible to " + f"the `id2label` map of length `{len(self.id2label)}`." + ) + # Keys are always strings in JSON so convert ids to int self.id2label = {int(key): value for key, value in self.id2label.items()} + # BC for rotary embeddings. We will pop out legacy keys from kwargs and rename to new format + if hasattr(self, "rope_parameters"): + kwargs = self.convert_rope_params_to_dict(**kwargs) + # Parameters for sequence generation saved in the config are popped instead of loading them. for parameter_name in GenerationConfig._get_default_generation_params().keys(): kwargs.pop(parameter_name, None) @@ -252,14 +230,10 @@ def __init__( self._name_or_path = str(kwargs.pop("name_or_path", "")) self._commit_hash = kwargs.pop("_commit_hash", None) - # Attention implementation to use, if relevant (it sets it recursively on sub-configs) + # Attention/Experts implementation to use, if relevant (it sets it recursively on sub-configs) + self._output_attentions: bool | None = kwargs.pop("output_attentions", False) self._attn_implementation: str | None = kwargs.pop("attn_implementation", None) - - # Experts implementation to use, if relevant (it sets it recursively on sub-configs) - self._experts_implementation = kwargs.pop("experts_implementation", None) - - # Drop the transformers version info - self.transformers_version = kwargs.pop("transformers_version", None) + self._experts_implementation: str | None = kwargs.pop("experts_implementation", None) # Additional attributes without default values for key, value in kwargs.items(): @@ -271,9 +245,9 @@ def __init__( logger.error(f"Can't set {key} with value {value} for {self}") raise err - def _create_id_label_maps(self, num_labels: int): - self.id2label = {i: f"LABEL_{i}" for i in range(num_labels)} - self.label2id = dict(zip(self.id2label.values(), self.id2label.keys())) + def __init_subclass__(cls, *args, **kwargs): + super().__init_subclass__(*args, **kwargs) + cls = dataclass(cls, repr=False) @property def name_or_path(self) -> str | None: @@ -283,6 +257,21 @@ def name_or_path(self) -> str | None: def name_or_path(self, value): self._name_or_path = str(value) # Make sure that name_or_path is a string (for JSON encoding) + @property + def num_labels(self) -> int: + """ + `int`: The number of labels for classification models. + """ + return len(self.id2label) if self.id2label is not None else None + + @num_labels.setter + def num_labels(self, num_labels: int): + # we do not store `num_labels` attribute in config, but instead + # compute it based on the length of the `id2label` map + if self.id2label is None or self.num_labels != num_labels: + self.id2label = {i: f"LABEL_{i}" for i in range(num_labels)} + self.label2id = dict(zip(self.id2label.values(), self.id2label.keys())) + @property def output_attentions(self): """ @@ -302,27 +291,6 @@ def output_attentions(self, value: bool): ) self._output_attentions = value - @property - def use_return_dict(self) -> bool: - """ - `bool`: Whether or not return [`~utils.ModelOutput`] instead of tuples. - """ - return self.return_dict - - @property - def num_labels(self) -> int: - """ - `int`: The number of labels for classification models. 
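Taken together, the block above is the template every model config in this PR follows: former `__init__` arguments become typed dataclass fields, class-level metadata lives in `ClassVar`s (and is therefore never serialized), and anything derived from other fields moves into `__post_init__`. A minimal sketch of the new declaration style (the `ToyConfig` name and fields are illustrative only, and it assumes this branch is installed):

```python
from typing import ClassVar

from huggingface_hub.dataclasses import strict

from transformers.configuration_utils import PreTrainedConfig


@strict(accept_kwargs=True)
class ToyConfig(PreTrainedConfig):
    model_type = "toy"  # class-level metadata, not a serialized field
    keys_to_ignore_at_inference: ClassVar[list[str]] = ["past_key_values"]

    # typed fields replace the old __init__ arguments; `strict` checks the annotations
    hidden_size: int = 64
    num_hidden_layers: int = 2
    layer_types: list | None = None

    def __post_init__(self, **kwargs):
        # derived defaults that used to live in __init__ now run over the already-assigned fields
        if self.layer_types is None:
            self.layer_types = ["full_attention"] * self.num_hidden_layers
        super().__post_init__(**kwargs)
```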
-        """
-        return len(self.id2label)
-
-    @num_labels.setter
-    def num_labels(self, num_labels: int):
-        # we do not store `num_labels` attribute in config, but instead
-        # compute it based on the length of the `id2label` map
-        if self.id2label is None or self.num_labels != num_labels:
-            self._create_id_label_maps(num_labels)
-
     @property
     def _attn_implementation(self):
         return self._attn_implementation_internal
@@ -372,11 +340,73 @@ def torch_dtype(self):
         logger.warning_once("`torch_dtype` is deprecated! Use `dtype` instead!")
         return self.dtype

+    @property
+    def use_return_dict(self):
+        logger.warning_once("`use_return_dict` is deprecated! Use `return_dict` instead!")
+        return self.return_dict
+
     @torch_dtype.setter
     def torch_dtype(self, value):
         logger.warning_once("`torch_dtype` is deprecated! Use `dtype` instead!")
         self.dtype = value

+    def __setattr__(self, key, value):
+        if key in super().__getattribute__("attribute_map"):
+            key = super().__getattribute__("attribute_map")[key]
+        super().__setattr__(key, value)
+
+    def __getattribute__(self, key):
+        if key != "attribute_map" and key in super().__getattribute__("attribute_map"):
+            key = super().__getattribute__("attribute_map")[key]
+        return super().__getattribute__(key)
+
+    def validate_output_attentions(self):
+        if self.output_attentions and self._attn_implementation not in ["eager", None]:
+            raise ValueError(
+                "The `output_attentions` attribute is not supported when using the `attn_implementation` set to "
+                f"{self._attn_implementation}. Please set it to 'eager' instead."
+            )
+
+    def validate_architecture(self):
+        """Part of `@strict`-powered validation. Validates the architecture of the config."""
+        if (
+            hasattr(self, "head_dim")
+            and hasattr(self, "num_heads")
+            and hasattr(self, "embed_dim")
+            and self.head_dim * self.num_heads != self.embed_dim
+        ):
+            raise ValueError(
+                f"The embed_dim ({self.embed_dim}) is not a multiple of the number of attention "
+                f"heads ({self.num_heads})."
+            )
+
+    def validate_token_ids(self):
+        """Part of `@strict`-powered validation. Validates the contents of the special tokens."""
+        text_config = self.get_text_config(decoder=True)
+        vocab_size = getattr(text_config, "vocab_size", None)
+        if vocab_size is not None:
+            # Check for all special tokens, e.g. pad_token_id, image_token_id, audio_token_id
+            for key, value in text_config.to_dict().items():
+                if key.endswith("_token_id") and isinstance(value, int) and not 0 <= value < vocab_size:
+                    # Can't be an exception until we can load configs that fail validation: several configs on the Hub
+                    # store invalid special tokens, e.g. `pad_token_id=-1`
+                    logger.warning_once(
+                        f"Model config: {key} must be `None` or an integer within the vocabulary (between 0 "
+                        f"and {vocab_size - 1}), got {value}. This may result in unexpected behavior."
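Two behavioral points in the block above are easy to miss: `num_labels` is never stored, it is always derived from `id2label`, and `use_return_dict` survives only as a deprecated alias for `return_dict` (which is why the call site in `modeling_new_task_model.py` now reads `self.config.return_dict`). A hedged sketch, assuming this branch and that `strict(accept_kwargs=True)` forwards unknown keyword arguments such as `num_labels` to `__post_init__`:

```python
from transformers import AlbertConfig

config = AlbertConfig(num_labels=3)  # handled in __post_init__ via kwargs
print(config.num_labels)             # 3, computed as len(config.id2label)
print(config.id2label)               # {0: 'LABEL_0', 1: 'LABEL_1', 2: 'LABEL_2'}

config.num_labels = 5                # the setter rebuilds id2label/label2id
print(config.label2id["LABEL_4"])    # 4

print(config.use_return_dict)        # still works, but logs a deprecation warning

# unknown keys are still tolerated and stored as extra attributes (the flag below is illustrative)
config_extra = AlbertConfig(my_experiment_flag=True)
print(config_extra.my_experiment_flag)  # True
```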
+ ) + + def validate_layer_type(self): + """Check that `layer_types` is correctly defined.""" + if not (getattr(self, "layer_types", None) is not None and hasattr(self, "num_hidden_layers")): + return + elif not all(layer_type in ALLOWED_LAYER_TYPES for layer_type in self.layer_types): + raise ValueError(f"The `layer_types` entries must be in {ALLOWED_LAYER_TYPES} but got {self.layer_types}") + elif self.num_hidden_layers is not None and self.num_hidden_layers != len(self.layer_types): + raise ValueError( + f"`num_hidden_layers` ({self.num_hidden_layers}) must be equal to the number of layer types " + f"({len(self.layer_types)})" + ) + @property def rope_scaling(self): return self.rope_parameters @@ -430,6 +460,11 @@ def save_pretrained(self, save_directory: str | os.PathLike, push_to_hub: bool = # If we save using the predefined names, we can load using `from_pretrained` output_config_file = os.path.join(save_directory, CONFIG_NAME) + # Strict validation at save-time: prevent bad patterns from propagating + # Using `strict` decorator guarantees that `self.validate` exists , but not all + # model config might have the decorator added + if hasattr(self, "validate"): + self.validate() self.to_json_file(output_config_file, use_diff=True) logger.info(f"Configuration saved in {output_config_file}") @@ -695,49 +730,44 @@ def from_dict( [`PreTrainedConfig`]: The configuration object instantiated from those parameters. """ return_unused_kwargs = kwargs.pop("return_unused_kwargs", False) - # Those arguments may be passed along for our internal telemetry. - # We remove them so they don't appear in `return_unused_kwargs`. - kwargs.pop("_from_auto", None) - kwargs.pop("_from_pipeline", None) + # The commit hash might have been updated in the `config_dict`, we don't want the kwargs to erase that update. if "_commit_hash" in kwargs and "_commit_hash" in config_dict: - kwargs["_commit_hash"] = config_dict["_commit_hash"] - - # For BC on the old `torch_dtype` - if (torch_dtype := kwargs.pop("torch_dtype", None)) is not None: - logger.warning_once("`torch_dtype` is deprecated! Use `dtype` instead!") - # If both are present, use `dtype` - kwargs["dtype"] = kwargs.get("dtype", torch_dtype) - - # We remove them from kwargs so that they do not appear in `return_unused_kwargs`. - config_dict["attn_implementation"] = kwargs.pop("attn_implementation", None) - config_dict["experts_implementation"] = kwargs.pop("experts_implementation", None) + kwargs.setdefault("_commit_hash", config_dict["_commit_hash"]) + + # To remove arg here are those passed along for our internal telemetry but we still need to remove them + to_remove = ["_from_auto", "_from_pipeline"] + valid_fields = [ + "num_labels", + "attn_implementation", + "experts_implementation", + "output_attentions", + "torch_dtype", + "dtype", + "name_or_path", + ] + for key, value in kwargs.items(): + if key in valid_fields: + if key not in ["torch_dtype", "dtype"]: + config_dict[key] = value + to_remove.append(key) + elif value != "auto": + config_dict[key] = value config = cls(**config_dict) - # Update config with kwargs if needed - if "num_labels" in kwargs and "id2label" in kwargs: - num_labels = kwargs["num_labels"] - id2label = kwargs["id2label"] if kwargs["id2label"] is not None else [] - if len(id2label) != num_labels: - raise ValueError( - f"You passed along `num_labels={num_labels}` with an incompatible id to label map: " - f"{kwargs['id2label']}. Since those arguments are inconsistent with each other, you should remove " - "one of them." 
- ) - to_remove = [] for key, value in kwargs.items(): if hasattr(config, key): current_attr = getattr(config, key) # To authorize passing a custom subconfig as kwarg in models that have nested configs. - # We need to update only custom kwarg values instead and keep other attributes in subconfig. + # We need to update only custom kwarg values instead and keep other attr in subconfig. if isinstance(current_attr, PreTrainedConfig) and isinstance(value, dict): current_attr_updated = current_attr.to_dict() current_attr_updated.update(value) value = current_attr.__class__(**current_attr_updated) setattr(config, key, value) - if key != "dtype": - to_remove.append(key) + to_remove.append(key) + for key in to_remove: kwargs.pop(key, None) @@ -902,12 +932,25 @@ def to_dict(self) -> dict[str, Any]: # Transformers version when serializing the model output["transformers_version"] = __version__ + # Pop "kwargs" since they are unpacked and set in the post init + output.pop("kwargs", None) + + def to_list(value): + if isinstance(value, tuple): + value = [to_list(item) for item in value] + return value + for key, value in output.items(): # Deal with nested configs like CLIP if isinstance(value, PreTrainedConfig): value = value.to_dict() del value["transformers_version"] + # Some models have defaults as tuples because dataclass + # doesn't allow mutables. Let's convert back to `list`` + elif isinstance(value, tuple): + value = to_list(value) + output[key] = value self._remove_keys_not_serialized(output) @@ -1029,24 +1072,21 @@ def _remove_keys_not_serialized(self, d: dict[str, Any]) -> None: Runs recursive check on the dict, to remove from all sub configs. """ - if "_is_quantized" in d: - del d["_is_quantized"] - if "_auto_class" in d: - del d["_auto_class"] + for key_to_remove in [ + "_is_quantized", + "_auto_class", + "_commit_hash", + "_attn_implementation_internal", + "_experts_implementation_internal", + "ignore_keys_at_rope_validation", + "base_model_tp_plan", + "base_model_pp_plan", + ]: + d.pop(key_to_remove, None) + if "_output_attentions" in d: d["output_attentions"] = d.pop("_output_attentions") - if "_commit_hash" in d: - del d["_commit_hash"] - if "_attn_implementation_internal" in d: - del d["_attn_implementation_internal"] - if "_experts_implementation_internal" in d: - del d["_experts_implementation_internal"] - # Do not serialize `base_model_tp_plan` for now - if "base_model_tp_plan" in d: - del d["base_model_tp_plan"] - # Do not serialize `base_model_pp_plan` for now - if "base_model_pp_plan" in d: - del d["base_model_pp_plan"] + for value in d.values(): if isinstance(value, dict): self._remove_keys_not_serialized(value) @@ -1139,12 +1179,11 @@ def get_text_config(self, decoder=None, encoder=None) -> "PreTrainedConfig": # handle legacy models with flat config structure, when we only want one of the configs if not return_both and len(valid_text_config_names) == 0 and config_to_return.is_encoder_decoder: config_to_return = copy.deepcopy(config_to_return) - prefix_to_discard = "encoder" if decoder else "decoder" prefix_to_keep = "decoder" if decoder else "encoder" for key in config_to_return.to_dict(): - # NOTE: We don't want to discard the key if it is mapped from a different attribute name at read time - if key.startswith(prefix_to_discard) and key not in config_to_return.attribute_map.values(): - delattr(config_to_return, key) + # NOTE: We can't discard keys because: + # 1) we can't truly delete a cls attribte on a dataclass; 2) we can't set the value to `None` due to + # strict 
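The serialization path keeps its previous guarantees: runtime-only keys are stripped, the private `_output_attentions` is renamed back to its public name, and `transformers_version` is stamped in; the new twist is only that tuple defaults come back as JSON-friendly lists. A small sketch (assuming this branch):

```python
from transformers import AlbertConfig

config = AlbertConfig()
serialized = config.to_dict()

assert "transformers_version" in serialized
assert "output_attentions" in serialized                   # renamed back from _output_attentions
assert "_attn_implementation_internal" not in serialized   # stripped by _remove_keys_not_serialized
```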
validation. So we just keep it as is, since there are only a couple old models falling in this condition if key.startswith(prefix_to_keep): # [encoder/decoder]_layers -> num_hidden_layers if key == prefix_to_keep + "_layers": @@ -1226,28 +1265,3 @@ def recursive_diff_dict(dict_a, dict_b, config_obj=None): # The alias is only here for BC - we did not have the correct CamelCasing before PretrainedConfig = PreTrainedConfig - - -ALLOWED_ATTENTION_LAYER_TYPES = ( - "full_attention", - "sliding_attention", - "chunked_attention", - "linear_attention", # used in minimax -) - -ALLOWED_MLP_LAYER_TYPES = ( - "sparse", - "dense", -) - - -def layer_type_validation(layer_types: list[str], num_hidden_layers: int | None = None, attention: bool = True): - """Check that `layer_types` is correctly defined.""" - allowed_layer_types = ALLOWED_ATTENTION_LAYER_TYPES if attention else ALLOWED_MLP_LAYER_TYPES - if not all(layer_type in allowed_layer_types for layer_type in layer_types): - raise ValueError(f"The `layer_types` entries must be in {allowed_layer_types}") - if num_hidden_layers is not None and num_hidden_layers != len(layer_types): - raise ValueError( - f"`num_hidden_layers` ({num_hidden_layers}) must be equal to the number of layer types " - f"({len(layer_types)})" - ) diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index 966885e2ba01..6346016e4703 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -18,7 +18,7 @@ "fugashi": "fugashi>=1.0", "GitPython": "GitPython<3.1.19", "hf-doc-builder": "hf-doc-builder>=0.3.0", - "huggingface-hub": "huggingface-hub>=1.3.0,<2.0", + "huggingface-hub": "huggingface-hub>=1.5.0,<2.0", "ipadic": "ipadic>=1.0.0,<2.0", "jinja2": "jinja2>=3.1.0", "jmespath": "jmespath>=1.0.1", diff --git a/src/transformers/modeling_rope_utils.py b/src/transformers/modeling_rope_utils.py index 476fb03785d4..ac17573bdbc8 100644 --- a/src/transformers/modeling_rope_utils.py +++ b/src/transformers/modeling_rope_utils.py @@ -628,8 +628,9 @@ class RotaryEmbeddingConfigMixin: """ default_theta = 10_000.0 + ignore_keys_at_rope_validation = set() - def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation: set | None = None, **kwargs): + def convert_rope_params_to_dict(self, **kwargs): rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or self.rope_parameters self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else {} @@ -645,13 +646,9 @@ def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation: set | None partial_rotary_factor = kwargs.get("partial_rotary_factor", getattr(self, "partial_rotary_factor", None)) if partial_rotary_factor is not None: self.rope_parameters.setdefault("partial_rotary_factor", partial_rotary_factor) - ignore_keys_at_rope_validation = ( - set() if ignore_keys_at_rope_validation is None else set(ignore_keys_at_rope_validation) - ) - ignore_keys_at_rope_validation = ignore_keys_at_rope_validation | {"partial_rotary_factor"} + self.ignore_keys_at_rope_validation = self.ignore_keys_at_rope_validation | {"partial_rotary_factor"} self.standardize_rope_params() - self.validate_rope(ignore_keys=ignore_keys_at_rope_validation) return kwargs def standardize_rope_params(self): @@ -702,11 +699,11 @@ def standardize_rope_params(self): self.rope_parameters = rope_parameters - def validate_rope(self: "PreTrainedConfig", ignore_keys: set | None = None): + def validate_rope(self: 
"PreTrainedConfig"): """ Validate the RoPE config arguments, given a `"PreTrainedConfig"` object """ - rope_parameters_dict = self.rope_parameters + rope_parameters_dict = getattr(self, "rope_parameters", None) if rope_parameters_dict is None: return @@ -723,7 +720,7 @@ def validate_rope(self: "PreTrainedConfig", ignore_keys: set | None = None): rope_parameters["rope_type"] = rope_type if validation_fn is not None: - validation_fn(rope_parameters, ignore_keys=ignore_keys) + validation_fn(rope_parameters, ignore_keys=self.ignore_keys_at_rope_validation) else: logger.warning( f"Missing validation function in 'RotaryEmbeddingConfigMixin' for 'rope_type'='{rope_type}'" @@ -942,4 +939,4 @@ def rope_config_validation(config: RotaryEmbeddingConfigMixin, ignore_keys: set FutureWarning, ) config.standardize_rope_params() - config.validate_rope(ignore_keys=ignore_keys) + config.validate_rope() diff --git a/src/transformers/models/afmoe/configuration_afmoe.py b/src/transformers/models/afmoe/configuration_afmoe.py index cac9209dfb1b..70c9a8e7c1fc 100644 --- a/src/transformers/models/afmoe/configuration_afmoe.py +++ b/src/transformers/models/afmoe/configuration_afmoe.py @@ -13,14 +13,14 @@ # limitations under the License. """AFMoE model configuration""" -from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...modeling_rope_utils import RopeParameters +from ...utils import auto_docstring +@strict(accept_kwargs=True) @auto_docstring( custom_intro=""" AFMoE is an Adaptive Feedforward MoE (Mixture of Experts) model with token-choice routing, shared experts, and a @@ -64,85 +64,47 @@ class AfmoeConfig(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size: int | None = 200192, - hidden_size: int | None = 2048, - intermediate_size: int | None = 6144, - moe_intermediate_size: int | None = 1408, - num_hidden_layers: int | None = 32, - num_dense_layers: int | None = 1, - num_attention_heads: int | None = 16, - num_key_value_heads: int | None = None, - head_dim: int | None = 128, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 16384, - initializer_range: float | None = 0.02, - rms_norm_eps: float | None = 1e-5, - use_cache: bool | None = True, - tie_word_embeddings: bool | None = False, - rope_theta: float | None = 10000.0, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - num_experts: int | None = 64, - num_experts_per_tok: int | None = 6, - num_shared_experts: int | None = 2, - route_scale: float | None = 1.0, - global_attn_every_n_layers: int | None = 4, - sliding_window: int | None = 1024, - layer_types: list | None = None, - attention_dropout: float | None = 0.0, - mup_enabled: bool | None = False, - eos_token_id: bool | None = None, - pad_token_id: bool | None = None, - bos_token_id: bool | None = None, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_dense_layers = num_dense_layers - self.num_attention_heads = num_attention_heads - self.head_dim = head_dim - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = 
rms_norm_eps - self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_parameters = rope_parameters - - # MoE specific - self.moe_intermediate_size = moe_intermediate_size - self.num_experts_per_tok = num_experts_per_tok - self.num_experts = num_experts - self.num_shared_experts = num_shared_experts - self.route_scale = route_scale - self.attention_bias = False - - # Attention specific - self.attention_dropout = attention_dropout - self.global_attn_every_n_layers = global_attn_every_n_layers - self.sliding_window = sliding_window - self.mup_enabled = mup_enabled - self.layer_types = layer_types + vocab_size: int = 200192 + hidden_size: int = 2048 + intermediate_size: int = 6144 + moe_intermediate_size: int = 1408 + num_hidden_layers: int = 32 + num_dense_layers: int | None = 1 + num_attention_heads: int = 16 + num_key_value_heads: int | None = None + head_dim: int | None = 128 + hidden_act: str = "silu" + max_position_embeddings: int = 16384 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-5 + use_cache: bool = True + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + num_experts: int | None = 64 + num_experts_per_tok: int | None = 6 + num_shared_experts: int | None = 2 + route_scale: float | None = 1.0 + global_attn_every_n_layers: int | None = 4 + sliding_window: int | None = 1024 + layer_types: list | None = None + attention_dropout: float | int | None = 0.0 + mup_enabled: bool | None = False + eos_token_id: int | list[int] | None = None + pad_token_id: int | None = None + bos_token_id: int | None = None + attention_bias: bool = False + + def __post_init__(self, **kwargs): if self.layer_types is None: self.layer_types = [ - "sliding_attention" if bool((i + 1) % global_attn_every_n_layers) else "full_attention" + "sliding_attention" if bool((i + 1) % self.global_attn_every_n_layers) else "full_attention" for i in range(self.num_hidden_layers) ] - layer_type_validation(self.layer_types) - - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.tie_word_embeddings = tie_word_embeddings + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["AfmoeConfig"] diff --git a/src/transformers/models/aimv2/configuration_aimv2.py b/src/transformers/models/aimv2/configuration_aimv2.py index 2a53f4112a4a..9c532f6d19d3 100644 --- a/src/transformers/models/aimv2/configuration_aimv2.py +++ b/src/transformers/models/aimv2/configuration_aimv2.py @@ -18,6 +18,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
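The AFMoE rewrite above is representative of how per-model `__init__` logic is ported: anything that depended on other arguments (the sliding/full attention schedule, the `num_key_value_heads` fallback) now runs in `__post_init__` over the already-assigned fields. A sketch (assuming this branch):

```python
from transformers.models.afmoe.configuration_afmoe import AfmoeConfig

config = AfmoeConfig(num_hidden_layers=8, global_attn_every_n_layers=4)
print(config.layer_types[:4])      # ['sliding_attention', 'sliding_attention', 'sliding_attention', 'full_attention']
print(config.num_key_value_heads)  # 16 (falls back to num_attention_heads)
```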
+from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring, logging @@ -26,6 +28,7 @@ @auto_docstring(checkpoint="apple/aimv2-large-patch14-224-lit") +@strict(accept_kwargs=True) class Aimv2VisionConfig(PreTrainedConfig): r""" use_head (`str`, *optional*, defaults to `True`): @@ -51,46 +54,25 @@ class Aimv2VisionConfig(PreTrainedConfig): model_type = "aimv2_vision_model" base_config_key = "vision_config" - def __init__( - self, - hidden_size: int = 1024, - intermediate_size: int = 2816, - num_hidden_layers: int = 24, - num_attention_heads: int = 8, - num_channels: int = 3, - image_size: int = 224, - patch_size: int = 14, - rms_norm_eps: float = 1e-5, - attention_dropout: float = 0.0, - qkv_bias: bool = False, - mlp_bias: bool = False, - hidden_act: str = "silu", - initializer_range: float = 0.02, - use_head: bool = True, - is_native: bool = False, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_channels = num_channels - self.patch_size = patch_size - self.image_size = image_size - self.attention_dropout = attention_dropout - self.hidden_act = hidden_act - - self.use_head = use_head - self.initializer_range = initializer_range - self.mlp_bias = mlp_bias - self.qkv_bias = qkv_bias - self.rms_norm_eps = rms_norm_eps - self.is_native = is_native + hidden_size: int = 1024 + intermediate_size: int = 2816 + num_hidden_layers: int = 24 + num_attention_heads: int = 8 + num_channels: int = 3 + image_size: int | list[int] | tuple[int, int] = 224 + patch_size: int | list[int] | tuple[int, int] = 14 + hidden_act: str = "silu" + attention_dropout: float | int = 0.0 + rms_norm_eps: float = 1e-5 + qkv_bias: bool = False + mlp_bias: bool = False + initializer_range: float = 0.02 + use_head: bool = True + is_native: bool = False @auto_docstring(checkpoint="apple/aimv2-large-patch14-224-lit") +@strict(accept_kwargs=True) class Aimv2TextConfig(PreTrainedConfig): r""" Example: @@ -110,45 +92,31 @@ class Aimv2TextConfig(PreTrainedConfig): model_type = "aimv2_text_model" base_config_key = "text_config" - - def __init__( - self, - vocab_size: int = 49408, - hidden_size: int = 768, - intermediate_size: int = 2048, - num_hidden_layers: int = 12, - num_attention_heads: int = 6, - rms_norm_eps: float = 1e-5, - attention_dropout: float = 0.0, - qkv_bias: bool = False, - mlp_bias: bool = False, - hidden_act: str = "silu", - eos_token_id: int = 49407, - max_position_embeddings: int = 77, - initializer_range: bool = 0.02, - **kwargs, - ): - super().__init__(**kwargs) - self.eos_token_id = eos_token_id - - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.max_position_embeddings = max_position_embeddings - self.hidden_act = hidden_act - self.attention_dropout = attention_dropout - - self.initializer_range = initializer_range - self.mlp_bias = mlp_bias - self.qkv_bias = qkv_bias - self.rms_norm_eps = rms_norm_eps + vocab_size: int = 49408 + hidden_size: int = 768 + intermediate_size: int = 2048 + num_hidden_layers: int = 12 + num_attention_heads: int = 6 + max_position_embeddings: int = 77 + hidden_act: str = "silu" + attention_dropout: float | int = 0.0 + eos_token_id: int | list[int] | None = 49407 + 
rms_norm_eps: float = 1e-5 + qkv_bias: bool = False + mlp_bias: bool = False + initializer_range: float = 0.02 + + def __post_init__(self, **kwargs): + super().__post_init__(**kwargs) @auto_docstring(checkpoint="apple/aimv2-large-patch14-224-lit") +@strict(accept_kwargs=True) class Aimv2Config(PreTrainedConfig): r""" + max_logit_scale (`float`, *optional*, defaults to `100.0`): + The maximum logit scale to use + Example: ```python @@ -176,28 +144,28 @@ class Aimv2Config(PreTrainedConfig): model_type = "aimv2" sub_configs = {"text_config": Aimv2TextConfig, "vision_config": Aimv2VisionConfig} - def __init__( - self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs - ): - self.projection_dim = projection_dim - self.logit_scale_init_value = logit_scale_init_value - self.max_logit_scale = 100.0 - if text_config is None: - text_config = Aimv2TextConfig() + text_config: dict | PreTrainedConfig | None = None + vision_config: dict | PreTrainedConfig | None = None + initializer_factor: float = 1.0 + + projection_dim: int = 512 + logit_scale_init_value: float = 2.6592 + max_logit_scale: float = 100.0 + + def __post_init__(self, **kwargs): + if self.text_config is None: + self.text_config = Aimv2TextConfig() logger.info("`text_config` is `None`. Initializing the `Aimv2TextConfig` with default values.") - elif isinstance(text_config, dict): - text_config = Aimv2TextConfig(**text_config) + elif isinstance(self.text_config, dict): + self.text_config = Aimv2TextConfig(**self.text_config) - if vision_config is None: - vision_config = Aimv2VisionConfig() + if self.vision_config is None: + self.vision_config = Aimv2VisionConfig() logger.info("`vision_config` is `None`. initializing the `Aimv2VisionConfig` with default values.") - elif isinstance(vision_config, dict): - vision_config = Aimv2VisionConfig(**vision_config) - - self.text_config = text_config - self.vision_config = vision_config + elif isinstance(self.vision_config, dict): + self.vision_config = Aimv2VisionConfig(**self.vision_config) - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["Aimv2Config", "Aimv2VisionConfig", "Aimv2TextConfig"] diff --git a/src/transformers/models/aimv2/modular_aimv2.py b/src/transformers/models/aimv2/modular_aimv2.py index 5aa0ac9752b5..306904441a26 100644 --- a/src/transformers/models/aimv2/modular_aimv2.py +++ b/src/transformers/models/aimv2/modular_aimv2.py @@ -18,19 +18,17 @@ import torch import torch.nn.functional as F +from huggingface_hub.dataclasses import strict from torch import nn from ... 
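Composite configs keep accepting plain dictionaries (or nothing) for their sub-configs; the promotion that used to happen in `__init__` now happens in `__post_init__`. A sketch (assuming this branch):

```python
from transformers.models.aimv2.configuration_aimv2 import Aimv2Config, Aimv2TextConfig, Aimv2VisionConfig

config = Aimv2Config(vision_config={"hidden_size": 512, "num_hidden_layers": 6})
assert isinstance(config.vision_config, Aimv2VisionConfig)  # dict is promoted in __post_init__
assert isinstance(config.text_config, Aimv2TextConfig)      # None falls back to the default values
print(config.vision_config.hidden_size)                     # 512
```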
import initialization as init +from ...configuration_utils import PreTrainedConfig from ...masking_utils import create_causal_mask from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling from ...modeling_utils import PreTrainedModel from ...processing_utils import Unpack -from ...utils import ( - TransformersKwargs, - auto_docstring, - can_return_tuple, -) +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple from ...utils.generic import merge_with_config_defaults from ...utils.output_capturing import capture_outputs from ..clip.modeling_clip import CLIPModel, CLIPTextEmbeddings, _get_vector_norm @@ -40,6 +38,7 @@ @auto_docstring(checkpoint="apple/aimv2-large-patch14-224-lit") +@strict(accept_kwargs=True) class Aimv2VisionConfig(SiglipVisionConfig): r""" use_head (`str`, *optional*, defaults to `True`): @@ -62,95 +61,53 @@ class Aimv2VisionConfig(SiglipVisionConfig): >>> configuration = model.config ```""" - def __init__( - self, - hidden_size: int = 1024, - intermediate_size: int = 2816, - num_hidden_layers: int = 24, - num_attention_heads: int = 8, - num_channels: int = 3, - image_size: int = 224, - patch_size: int = 14, - rms_norm_eps: float = 1e-5, - attention_dropout: float = 0.0, - qkv_bias: bool = False, - mlp_bias: bool = False, - hidden_act: str = "silu", - initializer_range: float = 0.02, - use_head: bool = True, - is_native: bool = False, - **kwargs, - ): - super().__init__( - hidden_size=hidden_size, - intermediate_size=intermediate_size, - num_hidden_layers=num_hidden_layers, - num_attention_heads=num_attention_heads, - hidden_act=hidden_act, - num_channels=num_channels, - image_size=image_size, - patch_size=patch_size, - qkv_bias=qkv_bias, - **kwargs, - ) - - self.use_head = use_head - self.initializer_range = initializer_range - self.attention_dropout = attention_dropout - self.mlp_bias = mlp_bias - self.qkv_bias = qkv_bias - self.rms_norm_eps = rms_norm_eps - self.is_native = is_native + hidden_size: int = 1024 + intermediate_size: int = 2816 + num_hidden_layers: int = 24 + num_attention_heads: int = 8 + patch_size: int | list[int] | tuple[int, int] = 14 + rms_norm_eps: float = 1e-5 + attention_dropout: float | int = 0.0 + qkv_bias: bool = False + mlp_bias: bool = False + hidden_act: str = "silu" + initializer_range: float = 0.02 + use_head: bool = True + is_native: bool = False - del self.layer_norm_eps + layer_norm_eps = AttributeError() @auto_docstring(checkpoint="apple/aimv2-large-patch14-224-lit") +@strict(accept_kwargs=True) class Aimv2TextConfig(SiglipTextConfig): - def __init__( - self, - vocab_size: int = 49408, - hidden_size: int = 768, - intermediate_size: int = 2048, - num_hidden_layers: int = 12, - num_attention_heads: int = 6, - rms_norm_eps: float = 1e-5, - attention_dropout: float = 0.0, - qkv_bias: bool = False, - mlp_bias: bool = False, - hidden_act: str = "silu", - eos_token_id: int = 49407, - max_position_embeddings: int = 77, - initializer_range: bool = 0.02, - **kwargs, - ): - super().__init__( - vocab_size=vocab_size, - hidden_size=hidden_size, - intermediate_size=intermediate_size, - num_hidden_layers=num_hidden_layers, - num_attention_heads=num_attention_heads, - hidden_act=hidden_act, - max_position_embeddings=max_position_embeddings, - eos_token_id=eos_token_id, - **kwargs, - ) - - self.initializer_range = initializer_range - self.attention_dropout = attention_dropout - self.mlp_bias = mlp_bias - self.qkv_bias = qkv_bias - self.rms_norm_eps = 
rms_norm_eps - - del self.bos_token_id - del self.pad_token_id - del self.projection_size - del self.layer_norm_eps + vocab_size: int = 49408 + hidden_size: int = 768 + intermediate_size: int = 2048 + num_hidden_layers: int = 12 + num_attention_heads: int = 6 + max_position_embeddings: int = 77 + hidden_act: str = "silu" + rms_norm_eps: float = 1e-5 + qkv_bias: bool = False + mlp_bias: bool = False + initializer_range: float = 0.02 + bos_token_id = AttributeError() + pad_token_id = AttributeError() + layer_norm_eps = AttributeError() + projection_size = AttributeError() + + def __post_init__(self, **kwargs): + PreTrainedConfig.__post_init__(**kwargs) @auto_docstring(checkpoint="apple/aimv2-large-patch14-224-lit") +@strict(accept_kwargs=True) class Aimv2Config(SiglipConfig): r""" + max_logit_scale (`float`, *optional*, defaults to `100.0`): + The maximum logit scale to use + Example: ```python @@ -175,15 +132,9 @@ class Aimv2Config(SiglipConfig): >>> config = Aimv2Config(text_config=config_text, vision_config=config_vision) ```""" - def __init__( - self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs - ): - self.projection_dim = projection_dim - self.logit_scale_init_value = logit_scale_init_value - self.max_logit_scale = 100.0 - super().__init__(text_config, vision_config, **kwargs) - - del self.initializer_factor + projection_dim: int = 512 + logit_scale_init_value: float = 2.6592 + max_logit_scale: float = 100.0 class Aimv2Output(SiglipOutput): diff --git a/src/transformers/models/albert/configuration_albert.py b/src/transformers/models/albert/configuration_albert.py index 2d9ce226fa09..41ebe62daa99 100644 --- a/src/transformers/models/albert/configuration_albert.py +++ b/src/transformers/models/albert/configuration_albert.py @@ -14,11 +14,14 @@ # limitations under the License. 
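The `AttributeError()` assignments in the modular file replace the old `del self.<attr>` calls: with fields declared at class level there is no constructor body left to delete from, so an inherited Siglip field is masked with a sentinel, and the generated `configuration_aimv2.py` above accordingly does not declare it. A sketch of the observable effect (assuming this branch):

```python
from transformers.models.aimv2.configuration_aimv2 import Aimv2TextConfig

config = Aimv2TextConfig()
assert not hasattr(config, "projection_size")  # masked out of the generated config by the sentinel
assert not hasattr(config, "layer_norm_eps")   # Aimv2 uses rms_norm_eps instead
```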
"""ALBERT model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring @auto_docstring(checkpoint="albert/albert-xxlarge-v2") +@strict(accept_kwargs=True) class AlbertConfig(PreTrainedConfig): r""" inner_group_num (`int`, *optional*, defaults to 1): @@ -50,52 +53,26 @@ class AlbertConfig(PreTrainedConfig): model_type = "albert" - def __init__( - self, - vocab_size=30000, - embedding_size=128, - hidden_size=4096, - num_hidden_layers=12, - num_hidden_groups=1, - num_attention_heads=64, - intermediate_size=16384, - inner_group_num=1, - hidden_act="gelu_new", - hidden_dropout_prob=0, - attention_probs_dropout_prob=0, - max_position_embeddings=512, - type_vocab_size=2, - initializer_range=0.02, - layer_norm_eps=1e-12, - classifier_dropout_prob=0.1, - pad_token_id=0, - bos_token_id=2, - eos_token_id=3, - tie_word_embeddings=True, - **kwargs, - ): - super().__init__(**kwargs) - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - - self.vocab_size = vocab_size - self.embedding_size = embedding_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_hidden_groups = num_hidden_groups - self.num_attention_heads = num_attention_heads - self.inner_group_num = inner_group_num - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.classifier_dropout_prob = classifier_dropout_prob + vocab_size: int = 30000 + embedding_size: int = 128 + hidden_size: int = 4096 + num_hidden_layers: int = 12 + num_hidden_groups: int = 1 + num_attention_heads: int = 64 + intermediate_size: int = 16384 + inner_group_num: int = 1 + hidden_act: str = "gelu_new" + hidden_dropout_prob: int | float = 0.0 + attention_probs_dropout_prob: int | float = 0.0 + max_position_embeddings: int = 512 + type_vocab_size: int = 2 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + classifier_dropout_prob: int | float = 0.1 + pad_token_id: int | None = 0 + bos_token_id: int | None = 2 + eos_token_id: int | list[int] | None = 3 + tie_word_embeddings: bool = True __all__ = ["AlbertConfig"] diff --git a/src/transformers/models/align/configuration_align.py b/src/transformers/models/align/configuration_align.py index dbb136e1471f..90730d237ce9 100644 --- a/src/transformers/models/align/configuration_align.py +++ b/src/transformers/models/align/configuration_align.py @@ -13,6 +13,8 @@ # limitations under the License. 
"""ALIGN model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring, logging @@ -21,6 +23,7 @@ @auto_docstring(checkpoint="kakaobrain/align-base") +@strict(accept_kwargs=True) class AlignTextConfig(PreTrainedConfig): r""" Example: @@ -41,45 +44,25 @@ class AlignTextConfig(PreTrainedConfig): model_type = "align_text_model" base_config_key = "text_config" - def __init__( - self, - vocab_size=30522, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=2, - initializer_range=0.02, - layer_norm_eps=1e-12, - pad_token_id=0, - bos_token_id=None, - eos_token_id=None, - **kwargs, - ): - super().__init__(**kwargs) - - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id + vocab_size: int = 30522 + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.1 + attention_probs_dropout_prob: float = 0.1 + max_position_embeddings: int = 512 + type_vocab_size: int = 2 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + pad_token_id: int | None = 0 + bos_token_id: int | None = None + eos_token_id: int | list[int] | None = None @auto_docstring(checkpoint="kakaobrain/align-base") +@strict(accept_kwargs=True) class AlignVisionConfig(PreTrainedConfig): r""" width_coefficient (`float`, *optional*, defaults to 2.0): @@ -134,56 +117,45 @@ class AlignVisionConfig(PreTrainedConfig): model_type = "align_vision_model" base_config_key = "vision_config" - def __init__( - self, - num_channels: int = 3, - image_size: int = 600, - width_coefficient: float = 2.0, - depth_coefficient: float = 3.1, - depth_divisor: int = 8, - kernel_sizes: list[int] = [3, 3, 5, 3, 5, 5, 3], - in_channels: list[int] = [32, 16, 24, 40, 80, 112, 192], - out_channels: list[int] = [16, 24, 40, 80, 112, 192, 320], - depthwise_padding: list[int] = [], - strides: list[int] = [1, 2, 2, 2, 1, 2, 1], - num_block_repeats: list[int] = [1, 2, 2, 3, 3, 4, 1], - expand_ratios: list[int] = [1, 6, 6, 6, 6, 6, 6], - squeeze_expansion_ratio: float = 0.25, - hidden_act: str = "swish", - hidden_dim: int = 2560, - pooling_type: str = "mean", - initializer_range: float = 0.02, - batch_norm_eps: float = 0.001, - batch_norm_momentum: float = 0.99, - drop_connect_rate: float = 0.2, - **kwargs, - ): - super().__init__(**kwargs) - - self.num_channels = num_channels - self.image_size = image_size - self.width_coefficient = width_coefficient - self.depth_coefficient = depth_coefficient - self.depth_divisor = depth_divisor - self.kernel_sizes = kernel_sizes - self.in_channels = in_channels - self.out_channels = out_channels - self.depthwise_padding = depthwise_padding - self.strides = strides - 
self.num_block_repeats = num_block_repeats - self.expand_ratios = expand_ratios - self.squeeze_expansion_ratio = squeeze_expansion_ratio - self.hidden_act = hidden_act - self.hidden_dim = hidden_dim - self.pooling_type = pooling_type - self.initializer_range = initializer_range - self.batch_norm_eps = batch_norm_eps - self.batch_norm_momentum = batch_norm_momentum - self.drop_connect_rate = drop_connect_rate - self.num_hidden_layers = sum(num_block_repeats) * 4 + num_channels: int = 3 + image_size: int | list[int] | tuple[int, int] = 600 + width_coefficient: float = 2.0 + depth_coefficient: float = 3.1 + depth_divisor: int = 8 + kernel_sizes: list[int] | tuple[int, ...] = (3, 3, 5, 3, 5, 5, 3) + in_channels: list[int] | tuple[int, ...] = (32, 16, 24, 40, 80, 112, 192) + out_channels: list[int] | tuple[int, ...] = (16, 24, 40, 80, 112, 192, 320) + depthwise_padding: list | tuple[int, ...] = () + strides: list[int] | tuple[int, ...] = (1, 2, 2, 2, 1, 2, 1) + num_block_repeats: list[int] | tuple[int, ...] = (1, 2, 2, 3, 3, 4, 1) + expand_ratios: list[int] | tuple[int, ...] = (1, 6, 6, 6, 6, 6, 6) + squeeze_expansion_ratio: float = 0.25 + hidden_act: str = "swish" + hidden_dim: int = 2560 + pooling_type: str = "mean" + initializer_range: float = 0.02 + batch_norm_eps: float = 0.001 + batch_norm_momentum: float = 0.99 + drop_connect_rate: float = 0.2 + + def __post_init__(self, **kwargs): + self.num_hidden_layers = sum(self.num_block_repeats) * 4 + for attr in [ + "kernel_sizes", + "in_channels", + "out_channels", + "depthwise_padding", + "strides", + "num_block_repeats", + "expand_ratios", + ]: + # cast tuple so it can be JSON-ized when saving + setattr(self, attr, list(getattr(self, attr))) + super().__post_init__(**kwargs) @auto_docstring(checkpoint="kakaobrain/align-base") +@strict(accept_kwargs=True) class AlignConfig(PreTrainedConfig): r""" temperature_init_value (`float`, *optional*, defaults to 1.0): @@ -216,34 +188,26 @@ class AlignConfig(PreTrainedConfig): model_type = "align" sub_configs = {"text_config": AlignTextConfig, "vision_config": AlignVisionConfig} - def __init__( - self, - text_config=None, - vision_config=None, - projection_dim=640, - temperature_init_value=1.0, - initializer_range=0.02, - **kwargs, - ): - if text_config is None: - text_config = AlignTextConfig() + text_config: dict | PreTrainedConfig | None = None + vision_config: dict | PreTrainedConfig | None = None + projection_dim: int = 640 + temperature_init_value: float = 1.0 + initializer_range: float = 0.02 + + def __post_init__(self, **kwargs): + if self.text_config is None: + self.text_config = AlignTextConfig() logger.info("`text_config` is `None`. Initializing the `AlignTextConfig` with default values.") - elif isinstance(text_config, dict): - text_config = AlignTextConfig(**text_config) + elif isinstance(self.text_config, dict): + self.text_config = AlignTextConfig(**self.text_config) - if vision_config is None: - vision_config = AlignVisionConfig() + if self.vision_config is None: + self.vision_config = AlignVisionConfig() logger.info("`vision_config` is `None`. 
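Because dataclass fields cannot default to mutable lists, the EfficientNet-style hyperparameters above default to tuples and are cast back to lists in `__post_init__`, so `config.json` round-trips unchanged; `num_hidden_layers` is likewise derived rather than accepted as an argument. A sketch (assuming this branch):

```python
from transformers.models.align.configuration_align import AlignVisionConfig

config = AlignVisionConfig()
print(type(config.kernel_sizes))    # <class 'list'>, cast from the tuple default in __post_init__
print(config.num_hidden_layers)     # 64 = sum((1, 2, 2, 3, 3, 4, 1)) * 4
print(config.to_dict()["strides"])  # [1, 2, 2, 2, 1, 2, 1]
```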
initializing the `AlignVisionConfig` with default values.") - elif isinstance(vision_config, dict): - vision_config = AlignVisionConfig(**vision_config) - - self.text_config = text_config - self.vision_config = vision_config + elif isinstance(self.vision_config, dict): + self.vision_config = AlignVisionConfig(**self.vision_config) - self.projection_dim = projection_dim - self.temperature_init_value = temperature_init_value - self.initializer_range = initializer_range - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["AlignTextConfig", "AlignVisionConfig", "AlignConfig"] diff --git a/src/transformers/models/altclip/configuration_altclip.py b/src/transformers/models/altclip/configuration_altclip.py index b280ace0cf59..62714e187c5b 100755 --- a/src/transformers/models/altclip/configuration_altclip.py +++ b/src/transformers/models/altclip/configuration_altclip.py @@ -13,6 +13,8 @@ # limitations under the License. """AltCLIP model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring, logging @@ -21,6 +23,7 @@ @auto_docstring(checkpoint="BAAI/AltCLIP") +@strict(accept_kwargs=True) class AltCLIPTextConfig(PreTrainedConfig): r""" project_dim (`int`, *optional*, defaults to 768): @@ -43,49 +46,27 @@ class AltCLIPTextConfig(PreTrainedConfig): model_type = "altclip_text_model" - def __init__( - self, - vocab_size=250002, - hidden_size=1024, - num_hidden_layers=24, - num_attention_heads=16, - intermediate_size=4096, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=514, - type_vocab_size=1, - initializer_range=0.02, - initializer_factor=0.02, - layer_norm_eps=1e-05, - pad_token_id=1, - bos_token_id=0, - eos_token_id=2, - project_dim=768, - **kwargs, - ): - super().__init__(**kwargs) - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range - self.initializer_factor = initializer_factor - self.layer_norm_eps = layer_norm_eps - self.project_dim = project_dim + vocab_size: int = 250002 + hidden_size: int = 1024 + num_hidden_layers: int = 24 + num_attention_heads: int = 16 + intermediate_size: int = 4096 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.1 + attention_probs_dropout_prob: float = 0.1 + max_position_embeddings: int = 514 + type_vocab_size: int = 1 + initializer_range: float = 0.02 + initializer_factor: float = 0.02 + layer_norm_eps: float = 1e-05 + pad_token_id: int | None = 1 + bos_token_id: int | None = 0 + eos_token_id: int | list[int] | None = 2 + project_dim: int = 768 @auto_docstring(checkpoint="BAAI/AltCLIP") +@strict(accept_kwargs=True) class AltCLIPVisionConfig(PreTrainedConfig): r""" Example: @@ -106,41 +87,23 @@ class AltCLIPVisionConfig(PreTrainedConfig): model_type = "altclip_vision_model" base_config_key = "vision_config" - def __init__( - self, - hidden_size=768, - intermediate_size=3072, - projection_dim=512, - num_hidden_layers=12, - 
num_attention_heads=12, - num_channels=3, - image_size=224, - patch_size=32, - hidden_act="quick_gelu", - layer_norm_eps=1e-5, - attention_dropout=0.0, - initializer_range=0.02, - initializer_factor=1.0, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.projection_dim = projection_dim - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_channels = num_channels - self.patch_size = patch_size - self.image_size = image_size - self.initializer_range = initializer_range - self.initializer_factor = initializer_factor - self.attention_dropout = attention_dropout - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act + hidden_size: int = 768 + intermediate_size: int = 3072 + projection_dim: int = 512 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + num_channels: int = 3 + image_size: int | list[int] | tuple[int, int] = 224 + patch_size: int | list[int] | tuple[int, int] = 32 + hidden_act: str = "quick_gelu" + layer_norm_eps: float = 1e-5 + attention_dropout: float | int = 0.0 + initializer_range: float = 0.02 + initializer_factor: float = 1.0 @auto_docstring(checkpoint="BAAI/AltCLIP") +@strict(accept_kwargs=True) class AltCLIPConfig(PreTrainedConfig): r""" Example: @@ -168,23 +131,37 @@ class AltCLIPConfig(PreTrainedConfig): model_type = "altclip" sub_configs = {"text_config": AltCLIPTextConfig, "vision_config": AltCLIPVisionConfig} + text_config: dict | PreTrainedConfig | None = None + vision_config: dict | PreTrainedConfig | None = None + projection_dim: int = 768 + logit_scale_init_value: float = 2.6592 + initializer_factor: float = 1.0 + + def __post_init__(self, **kwargs): + if self.text_config is None: + text_config = {} + logger.info("`text_config` is `None`. Initializing the `AltCLIPTextConfig` with default values.") + elif isinstance(self.text_config, AltCLIPTextConfig): + text_config = self.text_config.to_dict() + else: + text_config = self.text_config - def __init__( - self, text_config=None, vision_config=None, projection_dim=768, logit_scale_init_value=2.6592, **kwargs - ): - # If `_config_dict` exist, we use them for the backward compatibility. - # We pop out these 2 attributes before calling `super().__init__` to avoid them being saved (which causes a lot - # of confusion!). - text_config_dict = kwargs.pop("text_config_dict", None) - vision_config_dict = kwargs.pop("vision_config_dict", None) + if self.vision_config is None: + vision_config = {} + logger.info("`vision_config` is `None`. initializing the `AltCLIPVisionConfig` with default values.") + elif isinstance(self.vision_config, AltCLIPVisionConfig): + vision_config = self.vision_config.to_dict() + else: + vision_config = self.vision_config + # For backward compatibility check keyword args # Instead of simply assigning `[text|vision]_config_dict` to `[text|vision]_config`, we use the values in # `[text|vision]_config_dict` to update the values in `[text|vision]_config`. The values should be same in most # cases, but we don't want to break anything regarding `_config_dict` that existed before commit `8827e1b2`. - if text_config_dict is not None: - if text_config is None: - text_config = {} + text_config_dict = kwargs.pop("text_config_dict", None) + vision_config_dict = kwargs.pop("vision_config_dict", None) + if text_config_dict is not None: # This is the complete result when using `text_config_dict`. 
_text_config_dict = AltCLIPTextConfig(**text_config_dict).to_dict() @@ -209,9 +186,6 @@ def __init__( text_config.update(_text_config_dict) if vision_config_dict is not None: - if vision_config is None: - vision_config = {} - # This is the complete result when using `vision_config_dict`. _vision_config_dict = AltCLIPVisionConfig(**vision_config_dict).to_dict() # convert keys to string instead of integer @@ -240,25 +214,11 @@ def __init__( # Update all values in `vision_config` with the ones in `_vision_config_dict`. vision_config.update(_vision_config_dict) - if text_config is None: - text_config = AltCLIPTextConfig() - logger.info("`text_config` is `None`. Initializing the `AltCLIPTextConfig` with default values.") - elif isinstance(text_config, dict): - text_config = AltCLIPTextConfig(**text_config) - - if vision_config is None: - vision_config = AltCLIPVisionConfig() - logger.info("`vision_config` is `None`. initializing the `AltCLIPVisionConfig` with default values.") - elif isinstance(vision_config, dict): - vision_config = AltCLIPVisionConfig(**vision_config) - - self.text_config = text_config - self.vision_config = vision_config + # Finally we can convert back our unified text/vision configs to `PretrainedConfig` + self.text_config = AltCLIPTextConfig(**text_config) + self.vision_config = AltCLIPVisionConfig(**vision_config) - self.projection_dim = projection_dim - self.logit_scale_init_value = logit_scale_init_value - self.initializer_factor = 1.0 - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["AltCLIPTextConfig", "AltCLIPVisionConfig", "AltCLIPConfig"] diff --git a/src/transformers/models/apertus/configuration_apertus.py b/src/transformers/models/apertus/configuration_apertus.py index edaa1bc7c46e..f52197cf0d6d 100644 --- a/src/transformers/models/apertus/configuration_apertus.py +++ b/src/transformers/models/apertus/configuration_apertus.py @@ -18,12 +18,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
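A hedged illustration of the AltCLIP `*_config_dict` backward-compatibility path kept just above: under the converted `AltCLIPConfig`, the legacy `text_config_dict` kwarg is still popped in `__post_init__` and its values still override the matching keys in `text_config`, mirroring the pre-existing `text_config.update(_text_config_dict)` behavior. Values below are illustrative only, assuming the class behaves as shown in this diff.

```python
from transformers import AltCLIPConfig

# Legacy kwarg `text_config_dict` is still accepted; its values update `text_config`
# before the final AltCLIPTextConfig is built (a mismatch warning may be emitted).
config = AltCLIPConfig(
    text_config={"vocab_size": 250002},
    text_config_dict={"vocab_size": 30000},  # legacy-style kwarg, still honored
)
print(config.text_config.vocab_size)  # expected: 30000, the `_config_dict` value wins
```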
+from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring @auto_docstring(checkpoint="swiss-ai/Apertus-8B-Instruct-2509") +@strict(accept_kwargs=True) class ApertusConfig(PreTrainedConfig): r""" ```python @@ -58,60 +61,39 @@ class ApertusConfig(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size: int | None = 131072, - hidden_size: int | None = 4096, - intermediate_size: int | None = 14336, - num_hidden_layers: int | None = 32, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = None, - hidden_act: str | None = "xielu", - max_position_embeddings: int | None = 65536, - initializer_range: float | None = 0.02, - rms_norm_eps: float | None = 1e-5, - use_cache: bool | None = True, - pad_token_id: int | None = 3, - bos_token_id: int | None = 1, - eos_token_id: int | None = 2, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | None = { - "rope_type": "llama3", - "rope_theta": 12000000.0, - "factor": 8.0, - "original_max_position_embeddings": 8192, - "low_freq_factor": 1.0, - "high_freq_factor": 4.0, - }, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads + vocab_size: int = 131072 + hidden_size: int = 4096 + intermediate_size: int = 14336 + num_hidden_layers: int = 32 + num_attention_heads: int = 32 + num_key_value_heads: int | None = None + hidden_act: str = "xielu" + max_position_embeddings: int = 65536 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-5 + use_cache: bool = True + pad_token_id: int | None = 3 + bos_token_id: int | None = 1 + eos_token_id: int | list[int] | None = 2 + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + attention_dropout: float | int = 0.0 - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.rope_parameters = rope_parameters + def __post_init__(self, **kwargs): + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - super().__init__(**kwargs) + if self.rope_parameters is None: + self.rope_parameters = { + "rope_type": "llama3", + "rope_theta": 12000000.0, + "factor": 8.0, + "original_max_position_embeddings": 8192, + "low_freq_factor": 1.0, + "high_freq_factor": 4.0, + } + super().__post_init__(**kwargs) __all__ = ["ApertusConfig"] diff --git a/src/transformers/models/apertus/modular_apertus.py b/src/transformers/models/apertus/modular_apertus.py index fd670b609f13..18af25030c41 100644 --- a/src/transformers/models/apertus/modular_apertus.py +++ b/src/transformers/models/apertus/modular_apertus.py @@ -15,6 +15,7 @@ from 
collections.abc import Callable import torch +from huggingface_hub.dataclasses import strict from torch import nn from ...activations import ACT2CLS @@ -43,6 +44,7 @@ @auto_docstring(checkpoint="swiss-ai/Apertus-8B-Instruct-2509") +@strict(accept_kwargs=True) class ApertusConfig(PreTrainedConfig): r""" ```python @@ -77,60 +79,39 @@ class ApertusConfig(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size: int | None = 131072, - hidden_size: int | None = 4096, - intermediate_size: int | None = 14336, - num_hidden_layers: int | None = 32, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = None, - hidden_act: str | None = "xielu", - max_position_embeddings: int | None = 65536, - initializer_range: float | None = 0.02, - rms_norm_eps: float | None = 1e-5, - use_cache: bool | None = True, - pad_token_id: int | None = 3, - bos_token_id: int | None = 1, - eos_token_id: int | None = 2, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | None = { - "rope_type": "llama3", - "rope_theta": 12000000.0, - "factor": 8.0, - "original_max_position_embeddings": 8192, - "low_freq_factor": 1.0, - "high_freq_factor": 4.0, - }, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.rope_parameters = rope_parameters - - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - super().__init__(**kwargs) + vocab_size: int = 131072 + hidden_size: int = 4096 + intermediate_size: int = 14336 + num_hidden_layers: int = 32 + num_attention_heads: int = 32 + num_key_value_heads: int | None = None + hidden_act: str = "xielu" + max_position_embeddings: int = 65536 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-5 + use_cache: bool = True + pad_token_id: int | None = 3 + bos_token_id: int | None = 1 + eos_token_id: int | list[int] | None = 2 + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + attention_dropout: float | int = 0.0 + + def __post_init__(self, **kwargs): + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads + + if self.rope_parameters is None: + self.rope_parameters = { + "rope_type": "llama3", + "rope_theta": 12000000.0, + "factor": 8.0, + "original_max_position_embeddings": 8192, + "low_freq_factor": 1.0, + "high_freq_factor": 4.0, + } + super().__post_init__(**kwargs) class ApertusMLP(NemotronMLP): diff --git a/src/transformers/models/arcee/configuration_arcee.py b/src/transformers/models/arcee/configuration_arcee.py index 1546b90e58cc..1687f77ae63c 100644 --- a/src/transformers/models/arcee/configuration_arcee.py +++ b/src/transformers/models/arcee/configuration_arcee.py @@ -18,6 +18,8 @@ # See the License 
for the specific language governing permissions and # limitations under the License. +from huggingface_hub.dataclasses import strict + from transformers.utils import auto_docstring from ...configuration_utils import PreTrainedConfig @@ -25,6 +27,7 @@ @auto_docstring(checkpoint="arcee-ai/AFM-4.5B") +@strict(accept_kwargs=True) class ArceeConfig(PreTrainedConfig): r""" ```python @@ -56,57 +59,42 @@ class ArceeConfig(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size: int | None = 32000, - hidden_size: int | None = 2560, - intermediate_size: int | None = 18432, - num_hidden_layers: int | None = 32, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = None, - hidden_act: str | None = "relu2", - max_position_embeddings: int | None = 4096, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-5, - use_cache: bool | None = True, - pad_token_id: int | None = None, - bos_token_id: int | None = 128000, - eos_token_id: int | None = 128001, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - mlp_bias: bool | None = False, - head_dim: int | None = None, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads + vocab_size: int = 32000 + hidden_size: int = 2560 + intermediate_size: int = 18432 + num_hidden_layers: int = 32 + num_attention_heads: int = 32 + num_key_value_heads: int | None = None + hidden_act: str = "relu2" + max_position_embeddings: int = 4096 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-5 + use_cache: bool = True + pad_token_id: int | None = None + bos_token_id: int | None = 128000 + eos_token_id: int | list[int] | None = 128001 + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + attention_dropout: float | int = 0.0 + mlp_bias: bool = False + head_dim: int | None = None - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads + def __post_init__(self, **kwargs): + if self.head_dim is None: + self.head_dim = self.hidden_size // self.num_attention_heads + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.mlp_bias = mlp_bias - self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads - self.rope_parameters = rope_parameters + super().__post_init__(**kwargs) - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - super().__init__(**kwargs) + def validate_architecture(self): + """Part of `@strict`-powered validation. 
Validates the architecture of the config.""" + if self.hidden_size % self.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({self.hidden_size}) is not a multiple of the number of attention " + f"heads ({self.num_attention_heads})." + ) __all__ = ["ArceeConfig"] diff --git a/src/transformers/models/arcee/modular_arcee.py b/src/transformers/models/arcee/modular_arcee.py index 0d49eb2ba8ca..072a863f2f06 100644 --- a/src/transformers/models/arcee/modular_arcee.py +++ b/src/transformers/models/arcee/modular_arcee.py @@ -13,6 +13,8 @@ # limitations under the License. """PyTorch Arcee model.""" +from huggingface_hub.dataclasses import strict + from transformers.utils import auto_docstring, logging from ...modeling_rope_utils import RopeParameters @@ -30,6 +32,7 @@ @auto_docstring(checkpoint="arcee-ai/AFM-4.5B") +@strict(accept_kwargs=True) class ArceeConfig(LlamaConfig): r""" ```python @@ -55,55 +58,28 @@ class ArceeConfig(LlamaConfig): "layers.*.mlp.down_proj": "rowwise", } - def __init__( - self, - vocab_size: int | None = 32000, - hidden_size: int | None = 2560, - intermediate_size: int | None = 18432, - num_hidden_layers: int | None = 32, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = None, - hidden_act: str | None = "relu2", - max_position_embeddings: int | None = 4096, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-5, - use_cache: bool | None = True, - pad_token_id: int | None = None, - bos_token_id: int | None = 128000, - eos_token_id: int | None = 128001, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - mlp_bias: bool | None = False, - head_dim: int | None = None, - **kwargs, - ): - super().__init__( - vocab_size=vocab_size, - hidden_size=hidden_size, - intermediate_size=intermediate_size, - num_hidden_layers=num_hidden_layers, - num_attention_heads=num_attention_heads, - num_key_value_heads=num_key_value_heads, - hidden_act=hidden_act, - max_position_embeddings=max_position_embeddings, - initializer_range=initializer_range, - rms_norm_eps=rms_norm_eps, - use_cache=use_cache, - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - rope_parameters=rope_parameters, - attention_bias=attention_bias, - attention_dropout=attention_dropout, - mlp_bias=mlp_bias, - head_dim=head_dim, - **kwargs, - ) - - del self.pretraining_tp + vocab_size: int = 32000 + hidden_size: int = 2560 + intermediate_size: int = 18432 + num_hidden_layers: int = 32 + num_attention_heads: int = 32 + num_key_value_heads: int | None = None + hidden_act: str = "relu2" + max_position_embeddings: int = 4096 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-5 + use_cache: bool = True + pad_token_id: int | None = None + bos_token_id: int | None = 128000 + eos_token_id: int | list[int] | None = 128001 + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + attention_dropout: float | int = 0.0 + mlp_bias: bool = False + head_dim: int | None = None + + pretraining_tp = AttributeError() class ArceeMLP(NemotronMLP): diff --git a/src/transformers/models/aria/configuration_aria.py b/src/transformers/models/aria/configuration_aria.py index fcc890359644..32f9ca6a4f25 100644 --- a/src/transformers/models/aria/configuration_aria.py +++ 
b/src/transformers/models/aria/configuration_aria.py @@ -17,13 +17,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring +from ...utils.type_validators import interval from ..auto import CONFIG_MAPPING, AutoConfig @auto_docstring(checkpoint="rhymes-ai/Aria") +@strict(accept_kwargs=True) class AriaTextConfig(PreTrainedConfig): r""" moe_num_experts (`int`, *optional*, defaults to 8): @@ -50,71 +54,53 @@ class AriaTextConfig(PreTrainedConfig): "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), "norm": (["hidden_states"], ["hidden_states"]), } + + vocab_size: int = 32000 + hidden_size: int = 4096 + + intermediate_size: int = 4096 + num_hidden_layers: int = 32 + num_attention_heads: int = 32 + num_key_value_heads: int | None = None + hidden_act: str = "silu" + max_position_embeddings: int = 2048 + initializer_range: float = interval(min=0.0, max=1.0)(default=0.02) + rms_norm_eps: float = 1e-6 + use_cache: bool = True + pad_token_id: int | None = 2 + bos_token_id: int | None = 1 + eos_token_id: int | list[int] | None = 2 + pretraining_tp: int | None = 1 + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + attention_dropout: int | float | None = 0.0 + mlp_bias: bool = False + head_dim: int | None = None base_config_key = "text_config" + moe_num_experts: int = 8 + moe_topk: int = 2 + moe_num_shared_experts: int = 2 + + def __post_init__(self, **kwargs): + if self.head_dim is None: + self.head_dim = self.hidden_size // self.num_attention_heads + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads - def __init__( - self, - vocab_size: int | None = 32000, - hidden_size: int | None = 4096, - intermediate_size: int = 4096, - num_hidden_layers: int | None = 32, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = None, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 2048, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-6, - use_cache: bool | None = True, - pad_token_id=2, - bos_token_id: int | None = 1, - eos_token_id: int | None = 2, - pretraining_tp: int | None = 1, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - mlp_bias: bool | None = False, - head_dim: int | None = None, - moe_num_experts: int = 8, - moe_topk: int = 2, - moe_num_shared_experts: int = 2, - **kwargs, - ): - self.intermediate_size = intermediate_size - self.moe_num_experts = moe_num_experts - self.moe_topk = moe_topk - self.moe_num_shared_experts = moe_num_shared_experts - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - 
self.rms_norm_eps = rms_norm_eps - self.pretraining_tp = pretraining_tp - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.mlp_bias = mlp_bias - self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads - self.rope_parameters = rope_parameters - - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - super().__init__(**kwargs) + super().__post_init__(**kwargs) + + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + if self.hidden_size % self.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({self.hidden_size}) is not a multiple of the number of attention " + f"heads ({self.num_attention_heads})." + ) @auto_docstring(checkpoint="rhymes-ai/Aria") +@strict(accept_kwargs=True) class AriaConfig(PreTrainedConfig): r""" projector_patch_to_query_dict (`dict`, *optional*): @@ -127,47 +113,37 @@ class AriaConfig(PreTrainedConfig): } sub_configs = {"text_config": AriaTextConfig, "vision_config": AutoConfig} - def __init__( - self, - vision_config=None, - vision_feature_layer: int = -1, - text_config: AriaTextConfig = None, - projector_patch_to_query_dict: dict | None = None, - image_token_index: int | None = 9, - initializer_range: float | None = 0.02, - tie_word_embeddings: bool | None = False, - **kwargs, - ): - self.image_token_index = image_token_index + vision_config: dict | PreTrainedConfig | None = None + text_config: dict | AriaTextConfig | None = None + vision_feature_layer: int | list[int] = -1 + projector_patch_to_query_dict: dict | None = None + image_token_index: int = 9 + initializer_range: float = 0.02 + tie_word_embeddings: bool = False + def __post_init__(self, **kwargs): # Convert the keys and values of projector_patch_to_query_dict to integers # This ensures consistency even if they were provided as strings - if projector_patch_to_query_dict is None: - projector_patch_to_query_dict = { + if self.projector_patch_to_query_dict is None: + self.projector_patch_to_query_dict = { 1225: 128, 4900: 256, } - self.projector_patch_to_query_dict = {int(k): int(v) for k, v in projector_patch_to_query_dict.items()} + self.projector_patch_to_query_dict = {int(k): int(v) for k, v in self.projector_patch_to_query_dict.items()} self.max_value_projector_patch_to_query_dict = max(self.projector_patch_to_query_dict.values()) - self.vision_feature_layer = vision_feature_layer - if isinstance(vision_config, dict): - vision_config["model_type"] = "idefics3_vision" - vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) - elif vision_config is None: - vision_config = CONFIG_MAPPING["idefics3_vision"]() - - self.vision_config = vision_config - self.initializer_range = initializer_range - if isinstance(text_config, dict) and "model_type" in text_config: - text_config = AriaTextConfig(**text_config) - elif text_config is None: - text_config = AriaTextConfig() + if isinstance(self.vision_config, dict): + self.vision_config["model_type"] = "idefics3_vision" + self.vision_config = CONFIG_MAPPING[self.vision_config["model_type"]](**self.vision_config) + elif self.vision_config is None: + self.vision_config = CONFIG_MAPPING["idefics3_vision"]() - self.text_config = text_config - self.tie_word_embeddings = tie_word_embeddings + if isinstance(self.text_config, dict) and "model_type" in 
self.text_config: + self.text_config = AriaTextConfig(**self.text_config) + elif self.text_config is None: + self.text_config = AriaTextConfig() - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["AriaConfig", "AriaTextConfig"] diff --git a/src/transformers/models/aria/modeling_aria.py b/src/transformers/models/aria/modeling_aria.py index 669cd83d5d69..0abd943314bc 100644 --- a/src/transformers/models/aria/modeling_aria.py +++ b/src/transformers/models/aria/modeling_aria.py @@ -911,7 +911,7 @@ def get_image_features( self, pixel_values: torch.FloatTensor, pixel_mask: torch.FloatTensor | None = None, - vision_feature_layer: int = -1, + vision_feature_layer: int | list[int] = -1, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple | BaseModelOutputWithPooling: @@ -1059,7 +1059,7 @@ def get_image_features( self, pixel_values: torch.FloatTensor, pixel_mask: torch.FloatTensor | None = None, - vision_feature_layer: int = -1, + vision_feature_layer: int | list[int] = -1, **kwargs: Unpack[TransformersKwargs], ) -> tuple | BaseModelOutputWithPooling: return self.model.get_image_features( diff --git a/src/transformers/models/aria/modular_aria.py b/src/transformers/models/aria/modular_aria.py index 877e14c7af3e..c9452debd6cf 100644 --- a/src/transformers/models/aria/modular_aria.py +++ b/src/transformers/models/aria/modular_aria.py @@ -15,6 +15,7 @@ import numpy as np import torch +from huggingface_hub.dataclasses import strict from torch import nn from ... import initialization as init @@ -96,6 +97,7 @@ def sequential_experts_gemm(token_states, expert_weights, tokens_per_expert): @auto_docstring(checkpoint="rhymes-ai/Aria") +@strict(accept_kwargs=True) class AriaTextConfig(LlamaConfig): r""" moe_num_experts (`int`, *optional*, defaults to 8): @@ -118,23 +120,15 @@ class AriaTextConfig(LlamaConfig): "layers.*.mlp.shared_experts.down_proj": "rowwise", } - def __init__( - self, - intermediate_size: int = 4096, - moe_num_experts: int = 8, - moe_topk: int = 2, - moe_num_shared_experts: int = 2, - pad_token_id=2, - **super_kwargs, - ): - self.intermediate_size = intermediate_size - self.moe_num_experts = moe_num_experts - self.moe_topk = moe_topk - self.moe_num_shared_experts = moe_num_shared_experts - super().__init__(pad_token_id=pad_token_id, **super_kwargs) + intermediate_size: int = 4096 + moe_num_experts: int = 8 + moe_topk: int = 2 + moe_num_shared_experts: int = 2 + pad_token_id: int | None = 2 @auto_docstring(checkpoint="rhymes-ai/Aria") +@strict(accept_kwargs=True) class AriaConfig(PreTrainedConfig): r""" projector_patch_to_query_dict (`dict`, *optional*): @@ -147,47 +141,37 @@ class AriaConfig(PreTrainedConfig): } sub_configs = {"text_config": AriaTextConfig, "vision_config": AutoConfig} - def __init__( - self, - vision_config=None, - vision_feature_layer: int = -1, - text_config: AriaTextConfig = None, - projector_patch_to_query_dict: dict | None = None, - image_token_index: int | None = 9, - initializer_range: float | None = 0.02, - tie_word_embeddings: bool | None = False, - **kwargs, - ): - self.image_token_index = image_token_index + vision_config: dict | PreTrainedConfig | None = None + text_config: dict | AriaTextConfig | None = None + vision_feature_layer: int | list[int] = -1 + projector_patch_to_query_dict: dict | None = None + image_token_index: int = 9 + initializer_range: float = 0.02 + tie_word_embeddings: bool = False + def __post_init__(self, **kwargs): # Convert the keys and values of projector_patch_to_query_dict to 
integers # This ensures consistency even if they were provided as strings - if projector_patch_to_query_dict is None: - projector_patch_to_query_dict = { + if self.projector_patch_to_query_dict is None: + self.projector_patch_to_query_dict = { 1225: 128, 4900: 256, } - self.projector_patch_to_query_dict = {int(k): int(v) for k, v in projector_patch_to_query_dict.items()} + self.projector_patch_to_query_dict = {int(k): int(v) for k, v in self.projector_patch_to_query_dict.items()} self.max_value_projector_patch_to_query_dict = max(self.projector_patch_to_query_dict.values()) - self.vision_feature_layer = vision_feature_layer - if isinstance(vision_config, dict): - vision_config["model_type"] = "idefics3_vision" - vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) - elif vision_config is None: - vision_config = CONFIG_MAPPING["idefics3_vision"]() - self.vision_config = vision_config - self.initializer_range = initializer_range + if isinstance(self.vision_config, dict): + self.vision_config["model_type"] = "idefics3_vision" + self.vision_config = CONFIG_MAPPING[self.vision_config["model_type"]](**self.vision_config) + elif self.vision_config is None: + self.vision_config = CONFIG_MAPPING["idefics3_vision"]() - if isinstance(text_config, dict) and "model_type" in text_config: - text_config = AriaTextConfig(**text_config) - elif text_config is None: - text_config = AriaTextConfig() + if isinstance(self.text_config, dict) and "model_type" in self.text_config: + self.text_config = AriaTextConfig(**self.text_config) + elif self.text_config is None: + self.text_config = AriaTextConfig() - self.text_config = text_config - self.tie_word_embeddings = tie_word_embeddings - - super().__init__(**kwargs) + super().__post_init__(**kwargs) class AriaTextRMSNorm(LlamaRMSNorm): @@ -1182,7 +1166,7 @@ def get_image_features( self, pixel_values: torch.FloatTensor, pixel_mask: torch.FloatTensor | None = None, - vision_feature_layer: int = -1, + vision_feature_layer: int | list[int] = -1, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple | BaseModelOutputWithPooling: @@ -1267,7 +1251,7 @@ def get_image_features( self, pixel_values: torch.FloatTensor, pixel_mask: torch.FloatTensor | None = None, - vision_feature_layer: int = -1, + vision_feature_layer: int | list[int] = -1, **kwargs: Unpack[TransformersKwargs], ) -> tuple | BaseModelOutputWithPooling: return self.model.get_image_features( diff --git a/src/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py index e7a73026a731..aa04d4b057dc 100644 --- a/src/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py +++ b/src/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py @@ -13,16 +13,14 @@ # limitations under the License. 
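As a quick sanity sketch of the `validate_architecture` hooks introduced above (Arcee, Aria text): assuming `@strict(accept_kwargs=True)` invokes these `validate_*` methods when the config is built, as the docstrings state, an inconsistent shape now fails at construction time instead of deep inside the model. Hypothetical usage, not part of the diff:

```python
from transformers import ArceeConfig

# Defaults from the converted config above; __post_init__ fills the derived fields.
cfg = ArceeConfig()
assert cfg.head_dim == cfg.hidden_size // cfg.num_attention_heads  # 2560 // 32 == 80
assert cfg.num_key_value_heads == cfg.num_attention_heads

# validate_architecture rejects shapes where hidden_size % num_attention_heads != 0.
try:
    ArceeConfig(hidden_size=100, num_attention_heads=32)
except ValueError as err:
    print(err)  # hidden size (100) is not a multiple of the number of attention heads (32)
```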
"""Audio Spectogram Transformer (AST) model configuration""" -from typing import Any +from huggingface_hub.dataclasses import strict from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - - -logger = logging.get_logger(__name__) +from ...utils import auto_docstring @auto_docstring(checkpoint="MIT/ast-finetuned-audioset-10-10-0.4593") +@strict(accept_kwargs=True) class ASTConfig(PreTrainedConfig): r""" frequency_stride (`int`, *optional*, defaults to 10): @@ -49,48 +47,21 @@ class ASTConfig(PreTrainedConfig): model_type = "audio-spectrogram-transformer" - def __init__( - self, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - initializer_range=0.02, - layer_norm_eps=1e-12, - patch_size=16, - qkv_bias=True, - frequency_stride=10, - time_stride=10, - max_length=1024, - num_mel_bins=128, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.patch_size = patch_size - self.qkv_bias = qkv_bias - self.frequency_stride = frequency_stride - self.time_stride = time_stride - self.max_length = max_length - self.num_mel_bins = num_mel_bins - - # Overwritten from the parent class: AST is not compatible with `generate`, but has a config parameter sharing the - # same name (`max_length`). Sharing the same name triggers checks regarding the config -> generation_config - # generative parameters deprecation cycle, overwriting this function prevents this from happening. - def _get_non_default_generation_parameters(self) -> dict[str, Any]: - return {} + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.0 + attention_probs_dropout_prob: float = 0.0 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + patch_size: int | list[int] | tuple[int, int] = 16 + qkv_bias: bool = True + frequency_stride: int = 10 + time_stride: int = 10 + max_length: int = 1024 + num_mel_bins: int = 128 __all__ = ["ASTConfig"] diff --git a/src/transformers/models/audioflamingo3/configuration_audioflamingo3.py b/src/transformers/models/audioflamingo3/configuration_audioflamingo3.py index 7ccdf589b829..1bb396c5dda7 100644 --- a/src/transformers/models/audioflamingo3/configuration_audioflamingo3.py +++ b/src/transformers/models/audioflamingo3/configuration_audioflamingo3.py @@ -13,16 +13,17 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from ...configuration_utils import PretrainedConfig -from ...utils import auto_docstring, logging -from ..auto import CONFIG_MAPPING, AutoConfig +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring +from ..auto import CONFIG_MAPPING, AutoConfig @auto_docstring(checkpoint="nvidia/audio-flamingo-3-hf") -class AudioFlamingo3EncoderConfig(PretrainedConfig): +@strict(accept_kwargs=True) +class AudioFlamingo3EncoderConfig(PreTrainedConfig): r""" max_source_positions (`int`, *optional*, defaults to 1500): The maximum sequence length of log-mel filter-bank features that this model might ever be used with. @@ -52,43 +53,24 @@ class AudioFlamingo3EncoderConfig(PretrainedConfig): "encoder_layerdrop": "layerdrop", } - def __init__( - self, - num_mel_bins=128, - num_hidden_layers=32, - num_attention_heads=20, - intermediate_size=5120, - layerdrop=0.0, - activation_function="gelu", - hidden_size=1280, - dropout=0.0, - attention_dropout=0.0, - activation_dropout=0.0, - initializer_range=0.02, - scale_embedding=False, - max_source_positions=1500, - **kwargs, - ): - super().__init__(**kwargs) - - self.num_mel_bins = num_mel_bins - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.dropout = dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.activation_function = activation_function - self.initializer_range = initializer_range - self.layerdrop = layerdrop - self.num_hidden_layers = num_hidden_layers - self.scale_embedding = scale_embedding - self.max_source_positions = max_source_positions + num_mel_bins: int = 128 + num_hidden_layers: int = 32 + num_attention_heads: int = 20 + intermediate_size: int = 5120 + layerdrop: float | int = 0.0 + activation_function: str = "gelu" + hidden_size: int = 1280 + dropout: float | int = 0.0 + attention_dropout: float | int = 0.0 + activation_dropout: float | int = 0.0 + initializer_range: float = 0.02 + scale_embedding: bool = False + max_source_positions: int = 1500 @auto_docstring(checkpoint="nvidia/audio-flamingo-3-hf") -class AudioFlamingo3Config(PretrainedConfig): +@strict(accept_kwargs=True) +class AudioFlamingo3Config(PreTrainedConfig): r""" Example: @@ -116,37 +98,26 @@ class AudioFlamingo3Config(PretrainedConfig): "audio_config": AudioFlamingo3EncoderConfig, "text_config": AutoConfig, } - - def __init__( - self, - audio_config=None, - text_config=None, - audio_token_id=151669, - projector_hidden_act="gelu", - projector_bias=True, - **kwargs, - ): - self.audio_token_id = audio_token_id - - if isinstance(audio_config, dict): - audio_config["model_type"] = audio_config.get("model_type", "audioflamingo3_encoder") - audio_config = CONFIG_MAPPING[audio_config["model_type"]](**audio_config) - elif audio_config is None: - audio_config = CONFIG_MAPPING["audioflamingo3_encoder"]() - - self.audio_config = audio_config - - if isinstance(text_config, dict): - text_config["model_type"] = text_config.get("model_type", "qwen2") - text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) - elif text_config is None: - text_config = CONFIG_MAPPING["qwen2"]() - - self.text_config = text_config - self.projector_hidden_act = projector_hidden_act - self.projector_bias = projector_bias - - super().__init__(**kwargs) + audio_config: dict | PreTrainedConfig | None = None + 
text_config: dict | PreTrainedConfig | None = None + audio_token_id: int = 151669 + projector_hidden_act: str = "gelu" + projector_bias: bool = True + + def __post_init__(self, **kwargs): + if isinstance(self.audio_config, dict): + self.audio_config["model_type"] = self.audio_config.get("model_type", "audioflamingo3_encoder") + self.audio_config = CONFIG_MAPPING[self.audio_config["model_type"]](**self.audio_config) + elif self.audio_config is None: + self.audio_config = CONFIG_MAPPING["audioflamingo3_encoder"]() + + if isinstance(self.text_config, dict): + self.text_config["model_type"] = self.text_config.get("model_type", "qwen2") + self.text_config = CONFIG_MAPPING[self.text_config["model_type"]](**self.text_config) + elif self.text_config is None: + self.text_config = CONFIG_MAPPING["qwen2"]() + + super().__post_init__(**kwargs) __all__ = ["AudioFlamingo3Config", "AudioFlamingo3EncoderConfig"] diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index 35693d4c082e..0f7ed094c1ea 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -655,7 +655,7 @@ def from_pretrained( config = AutoConfig.from_pretrained( pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs ) - except Exception: + except (ValueError, OSError): config = PreTrainedConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) config_model_type = config.model_type diff --git a/src/transformers/models/autoformer/configuration_autoformer.py b/src/transformers/models/autoformer/configuration_autoformer.py index 665e61fa6556..efb311c08f9a 100644 --- a/src/transformers/models/autoformer/configuration_autoformer.py +++ b/src/transformers/models/autoformer/configuration_autoformer.py @@ -13,14 +13,14 @@ # limitations under the License. 
"""Autoformer model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="huggingface/autoformer-tourism-monthly") +@strict(accept_kwargs=True) class AutoformerConfig(PreTrainedConfig): r""" prediction_length (`int`): @@ -87,101 +87,53 @@ class AutoformerConfig(PreTrainedConfig): "num_hidden_layers": "encoder_layers", } - def __init__( - self, - prediction_length: int | None = None, - context_length: int | None = None, - distribution_output: str = "student_t", - loss: str = "nll", - input_size: int = 1, - lags_sequence: list[int] = [1, 2, 3, 4, 5, 6, 7], - scaling: bool = True, - num_time_features: int = 0, - num_dynamic_real_features: int = 0, - num_static_categorical_features: int = 0, - num_static_real_features: int = 0, - cardinality: list[int] | None = None, - embedding_dimension: list[int] | None = None, - d_model: int = 64, - encoder_attention_heads: int = 2, - decoder_attention_heads: int = 2, - encoder_layers: int = 2, - decoder_layers: int = 2, - encoder_ffn_dim: int = 32, - decoder_ffn_dim: int = 32, - activation_function: str = "gelu", - dropout: float = 0.1, - encoder_layerdrop: float = 0.1, - decoder_layerdrop: float = 0.1, - attention_dropout: float = 0.1, - activation_dropout: float = 0.1, - num_parallel_samples: int = 100, - init_std: float = 0.02, - use_cache: bool = True, - is_encoder_decoder=True, - # Autoformer arguments - label_length: int = 10, - moving_average: int = 25, - autocorrelation_factor: int = 3, - **kwargs, - ): - # time series specific configuration - self.prediction_length = prediction_length - self.context_length = context_length if context_length is not None else prediction_length - self.distribution_output = distribution_output - self.loss = loss - self.input_size = input_size - self.num_time_features = num_time_features - self.lags_sequence = lags_sequence - self.scaling = scaling - self.num_dynamic_real_features = num_dynamic_real_features - self.num_static_real_features = num_static_real_features - self.num_static_categorical_features = num_static_categorical_features - if cardinality is not None and num_static_categorical_features > 0: - if len(cardinality) != num_static_categorical_features: - raise ValueError( - "The cardinality should be a list of the same length as `num_static_categorical_features`" - ) - self.cardinality = cardinality - else: + prediction_length: int | None = None + context_length: int | None = None + distribution_output: str = "student_t" + loss: str = "nll" + input_size: int = 1 + lags_sequence: list[int] | tuple[int, ...] 
= (1, 2, 3, 4, 5, 6, 7) + scaling: bool | str = True + num_time_features: int = 0 + num_dynamic_real_features: int = 0 + num_static_categorical_features: int = 0 + num_static_real_features: int = 0 + cardinality: list[int] | None = None + embedding_dimension: list[int] | None = None + d_model: int = 64 + encoder_attention_heads: int = 2 + decoder_attention_heads: int = 2 + encoder_layers: int = 2 + decoder_layers: int = 2 + encoder_ffn_dim: int = 32 + decoder_ffn_dim: int = 32 + activation_function: str = "gelu" + dropout: float | int = 0.1 + encoder_layerdrop: float | int = 0.1 + decoder_layerdrop: float | int = 0.1 + attention_dropout: float | int = 0.1 + activation_dropout: float | int = 0.1 + num_parallel_samples: int = 100 + init_std: float = 0.02 + use_cache: bool = True + is_encoder_decoder: bool = True + label_length: int = 10 + moving_average: int = 25 + autocorrelation_factor: int = 3 + + def __post_init__(self, **kwargs): + self.context_length = self.context_length if self.context_length is not None else self.prediction_length + self.lags_sequence = list(self.lags_sequence) + + if not (self.cardinality is not None and self.num_static_categorical_features > 0): self.cardinality = [0] - if embedding_dimension is not None and num_static_categorical_features > 0: - if len(embedding_dimension) != num_static_categorical_features: - raise ValueError( - "The embedding dimension should be a list of the same length as `num_static_categorical_features`" - ) - self.embedding_dimension = embedding_dimension - else: + + if not (self.embedding_dimension is not None and self.num_static_categorical_features > 0): self.embedding_dimension = [min(50, (cat + 1) // 2) for cat in self.cardinality] - self.num_parallel_samples = num_parallel_samples # Transformer architecture configuration - self.feature_size = input_size * len(self.lags_sequence) + self._number_of_features - self.d_model = d_model - self.encoder_attention_heads = encoder_attention_heads - self.decoder_attention_heads = decoder_attention_heads - self.encoder_ffn_dim = encoder_ffn_dim - self.decoder_ffn_dim = decoder_ffn_dim - self.encoder_layers = encoder_layers - self.decoder_layers = decoder_layers - - self.dropout = dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.encoder_layerdrop = encoder_layerdrop - self.decoder_layerdrop = decoder_layerdrop - - self.activation_function = activation_function - self.init_std = init_std - - self.use_cache = use_cache - - # Autoformer - self.label_length = label_length - self.moving_average = moving_average - self.autocorrelation_factor = autocorrelation_factor - - super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + self.feature_size = self.input_size * len(self.lags_sequence) + self._number_of_features + super().__post_init__(**kwargs) @property def _number_of_features(self) -> int: @@ -193,5 +145,25 @@ def _number_of_features(self) -> int: + self.input_size * 2 # the log1p(abs(loc)) and log(scale) features ) + def validate_architecture(self): + """Part of `@strict`-powered validation. 
Validates the architecture of the config.""" + if ( + self.cardinality is not None + and self.num_static_categorical_features > 0 + and len(self.cardinality) != self.num_static_categorical_features + ): + raise ValueError( + "The cardinality should be a list of the same length as `num_static_categorical_features`" + ) + + if ( + self.embedding_dimension is not None + and self.num_static_categorical_features > 0 + and len(self.embedding_dimension) != self.num_static_categorical_features + ): + raise ValueError( + "The embedding dimension should be a list of the same length as `num_static_categorical_features`" + ) + __all__ = ["AutoformerConfig"] diff --git a/src/transformers/models/aya_vision/configuration_aya_vision.py b/src/transformers/models/aya_vision/configuration_aya_vision.py index c6f211abef38..723ba45e7ab4 100644 --- a/src/transformers/models/aya_vision/configuration_aya_vision.py +++ b/src/transformers/models/aya_vision/configuration_aya_vision.py @@ -13,15 +13,15 @@ # limitations under the License. """AyaVision model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging +from ...utils import auto_docstring from ..auto import CONFIG_MAPPING, AutoConfig -logger = logging.get_logger(__name__) - - @auto_docstring(checkpoint="CohereForAI/aya-vision-8b") +@strict(accept_kwargs=True) class AyaVisionConfig(PreTrainedConfig): r""" downsample_factor (`int`, *optional*, defaults to 2): @@ -36,36 +36,21 @@ class AyaVisionConfig(PreTrainedConfig): } sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} - def __init__( - self, - vision_config=None, - text_config=None, - vision_feature_select_strategy="full", - vision_feature_layer=-1, - downsample_factor=2, - adapter_layer_norm_eps=1e-6, - image_token_index=255036, - tie_word_embeddings=True, - **kwargs, - ): - self.image_token_index = image_token_index - self.downsample_factor = downsample_factor - self.adapter_layer_norm_eps = adapter_layer_norm_eps - self.tie_word_embeddings = tie_word_embeddings - if vision_feature_select_strategy not in ["default", "full"]: - raise ValueError( - "vision_feature_select_strategy should be one of 'default', 'full'." 
- f"Got: {vision_feature_select_strategy}" - ) + vision_config: dict | PreTrainedConfig | None = None + text_config: dict | PreTrainedConfig | None = None + vision_feature_select_strategy: str = "full" + vision_feature_layer: int | list[int] = -1 + downsample_factor: int = 2 + adapter_layer_norm_eps: float = 1e-6 + image_token_index: int = 255036 + tie_word_embeddings: bool = True - self.vision_feature_select_strategy = vision_feature_select_strategy - self.vision_feature_layer = vision_feature_layer - - if isinstance(vision_config, dict): - vision_config["model_type"] = vision_config.get("model_type", "siglip_vision_model") - vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) - elif vision_config is None: - vision_config = CONFIG_MAPPING["siglip_vision_model"]( + def __post_init__(self, **kwargs): + if isinstance(self.vision_config, dict): + self.vision_config["model_type"] = self.vision_config.get("model_type", "siglip_vision_model") + self.vision_config = CONFIG_MAPPING[self.vision_config["model_type"]](**self.vision_config) + elif self.vision_config is None: + self.vision_config = CONFIG_MAPPING["siglip_vision_model"]( hidden_size=1152, intermediate_size=4304, patch_size=14, @@ -75,17 +60,21 @@ def __init__( vision_use_head=False, ) - self.vision_config = vision_config - - if isinstance(text_config, dict): - text_config["model_type"] = text_config.get("model_type", "cohere2") - text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) - elif text_config is None: - text_config = CONFIG_MAPPING["cohere2"]() + if isinstance(self.text_config, dict): + self.text_config["model_type"] = self.text_config.get("model_type", "cohere2") + self.text_config = CONFIG_MAPPING[self.text_config["model_type"]](**self.text_config) + elif self.text_config is None: + self.text_config = CONFIG_MAPPING["cohere2"]() - self.text_config = text_config + super().__post_init__(**kwargs) - super().__init__(**kwargs) + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + if self.vision_feature_select_strategy not in ["default", "full"]: + raise ValueError( + "vision_feature_select_strategy should be one of 'default', 'full'." 
+ f"Got: {self.vision_feature_select_strategy}" + ) __all__ = ["AyaVisionConfig"] diff --git a/src/transformers/models/aya_vision/modeling_aya_vision.py b/src/transformers/models/aya_vision/modeling_aya_vision.py index 3df6f78c2f1d..b4ce654782b7 100644 --- a/src/transformers/models/aya_vision/modeling_aya_vision.py +++ b/src/transformers/models/aya_vision/modeling_aya_vision.py @@ -183,7 +183,7 @@ def set_input_embeddings(self, value): def get_image_features( self, pixel_values: torch.FloatTensor, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | list[int] | None = None, vision_feature_select_strategy: str | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], @@ -248,7 +248,7 @@ def forward( position_ids: torch.LongTensor | None = None, past_key_values: Cache | None = None, inputs_embeds: torch.FloatTensor | None = None, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | list[int] | None = None, vision_feature_select_strategy: str | None = None, use_cache: bool | None = None, **kwargs: Unpack[TransformersKwargs], @@ -323,7 +323,7 @@ def get_output_embeddings(self) -> nn.Module: def get_image_features( self, pixel_values: torch.FloatTensor, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | list[int] | None = None, vision_feature_select_strategy: str | None = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple | BaseModelOutputWithPooling: @@ -344,7 +344,7 @@ def forward( position_ids: torch.LongTensor | None = None, past_key_values: Cache | None = None, inputs_embeds: torch.FloatTensor | None = None, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | list[int] | None = None, vision_feature_select_strategy: str | None = None, labels: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, diff --git a/src/transformers/models/aya_vision/modular_aya_vision.py b/src/transformers/models/aya_vision/modular_aya_vision.py index 75b9e5754207..d1ab6548b4fa 100644 --- a/src/transformers/models/aya_vision/modular_aya_vision.py +++ b/src/transformers/models/aya_vision/modular_aya_vision.py @@ -108,7 +108,7 @@ class AyaVisionModel(LlavaModel): def get_image_features( self, pixel_values: torch.FloatTensor, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | list[int] | None = None, vision_feature_select_strategy: str | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], @@ -149,7 +149,7 @@ def forward( position_ids: torch.LongTensor | None = None, past_key_values: Cache | None = None, inputs_embeds: torch.FloatTensor | None = None, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | list[int] | None = None, vision_feature_select_strategy: str | None = None, use_cache: bool | None = None, **kwargs: Unpack[TransformersKwargs], @@ -200,7 +200,7 @@ def forward( position_ids: torch.LongTensor | None = None, past_key_values: Cache | None = None, inputs_embeds: torch.FloatTensor | None = None, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | list[int] | None = None, vision_feature_select_strategy: str | None = None, labels: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, diff --git a/src/transformers/models/bamba/configuration_bamba.py 
b/src/transformers/models/bamba/configuration_bamba.py index 99225ae411f2..d9b88cb8c95d 100644 --- a/src/transformers/models/bamba/configuration_bamba.py +++ b/src/transformers/models/bamba/configuration_bamba.py @@ -13,14 +13,14 @@ # limitations under the License. """Bamba model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters -from ...utils import auto_docstring, logging - - -logger = logging.get_logger(__name__) +from ...utils import auto_docstring +@strict(accept_kwargs=True) @auto_docstring( custom_intro=""" The BambaModel is a hybrid [mamba2](https://github.com/state-spaces/mamba) architecture with SwiGLU. @@ -45,99 +45,54 @@ class BambaConfig(PreTrainedConfig): model_type = "bamba" keys_to_ignore_at_inference = ["past_key_values"] - def __init__( - self, - vocab_size: int | None = 128000, - tie_word_embeddings: bool | None = False, - hidden_size: int | None = 4096, - intermediate_size: int | None = 14336, - num_hidden_layers: int | None = 32, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = 8, - hidden_act: str | None = "silu", - initializer_range: float | None = 0.02, - rms_norm_eps: float | None = 1e-5, - use_cache: bool | None = True, - num_logits_to_keep: int | None = 1, - pad_token_id: int | None = 0, - bos_token_id: int | None = 1, - eos_token_id: int | None = 2, - max_position_embeddings: int | None = 262144, - attention_dropout: float | None = 0.0, - attn_layer_indices: list[int] | None = None, - mamba_n_heads: int | None = 128, - mamba_d_head: str | None = "auto", - mamba_n_groups: int | None = 1, - mamba_d_state: int | None = 256, - mamba_d_conv: int | None = 4, - mamba_expand: int | None = 2, - mamba_chunk_size: int | None = 256, - mamba_conv_bias: bool | None = True, - mamba_proj_bias: bool | None = False, - time_step_min: float | None = 0.001, - time_step_max: float | None = 0.1, - time_step_limit: tuple[float, float] | None = (0.0, float("inf")), - z_loss_coefficient: float | None = 0.0, - rope_parameters: RopeParameters | None = None, - **kwargs, - ): - self.vocab_size = vocab_size - self.tie_word_embeddings = tie_word_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.max_position_embeddings = max_position_embeddings - self.attention_dropout = attention_dropout - self.attention_bias = False - self.mlp_bias = False - + vocab_size: int = 128000 + tie_word_embeddings: bool = False + hidden_size: int = 4096 + intermediate_size: int = 14336 + num_hidden_layers: int = 32 + num_attention_heads: int = 32 + num_key_value_heads: int | None = 8 + hidden_act: str = "silu" + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-5 + use_cache: bool = True + num_logits_to_keep: int | None = 1 + pad_token_id: int | None = 0 + bos_token_id: int | None = 1 + eos_token_id: int | list[int] | None = 2 + max_position_embeddings: int = 262144 + attention_dropout: float | int | None = 0.0 + attn_layer_indices: list[int] | None = None + mamba_n_heads: int | None = 128 + mamba_d_head: str | int | None = "auto" + mamba_n_groups: int | None = 1 + mamba_d_state: int | None = 256 + mamba_d_conv: int | None = 4 + mamba_expand: int | None = 2 + mamba_chunk_size: int | None = 256 + mamba_conv_bias: bool | None = True + mamba_proj_bias: bool | None = False + time_step_min: float | None = 0.001 + time_step_max: 
float | None = 0.1 + time_step_limit: list[float] | tuple[float, float] | None = (0.0, float("inf")) + z_loss_coefficient: float | None = 0.0 + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + mlp_bias: bool = False + + def __post_init__(self, **kwargs): # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - - self.use_cache = use_cache - self.num_logits_to_keep = num_logits_to_keep - - self.attn_layer_indices = attn_layer_indices - mamba_intermediate = mamba_expand * hidden_size - - if mamba_intermediate % mamba_n_heads != 0: - raise ValueError("mamba_n_heads must divide mamba_expand * hidden_size") + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads # for the mamba_v2, must satisfy the following - if mamba_d_head == "auto": - mamba_d_head = mamba_intermediate // mamba_n_heads - - if mamba_d_head * mamba_n_heads != mamba_intermediate: - raise ValueError("The dimensions for the Mamba head state do not match the model intermediate_size") + if self.mamba_d_head == "auto": + self.mamba_d_head = self.mamba_expand * self.hidden_size // self.mamba_n_heads - self.mamba_n_heads = mamba_n_heads - self.mamba_d_head = mamba_d_head - self.mamba_n_groups = mamba_n_groups - self.mamba_d_state = mamba_d_state - self.mamba_d_conv = mamba_d_conv - self.mamba_expand = mamba_expand - self.mamba_chunk_size = mamba_chunk_size - self.mamba_conv_bias = mamba_conv_bias - self.mamba_proj_bias = mamba_proj_bias - self.time_step_min = time_step_min - self.time_step_max = time_step_max - self.time_step_limit = tuple(time_step_limit) if time_step_limit is not None else None - self.z_loss_coefficient = z_loss_coefficient - self.rope_parameters = rope_parameters + self.time_step_limit = tuple(self.time_step_limit) if self.time_step_limit is not None else None kwargs["partial_rotary_factor"] = 0.5 # hardcode for BC - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - super().__init__(**kwargs) + super().__post_init__(**kwargs) @property def layers_block_type(self): @@ -146,5 +101,14 @@ def layers_block_type(self): for i in range(self.num_hidden_layers) ] + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + mamba_intermediate = self.mamba_expand * self.hidden_size + if mamba_intermediate % self.mamba_n_heads != 0: + raise ValueError("mamba_n_heads must divide mamba_expand * hidden_size") + + if self.mamba_d_head * self.mamba_n_heads != mamba_intermediate: + raise ValueError("The dimensions for the Mamba head state do not match the model intermediate_size") + __all__ = ["BambaConfig"] diff --git a/src/transformers/models/bark/configuration_bark.py b/src/transformers/models/bark/configuration_bark.py index b3981be008c5..d769d322393f 100644 --- a/src/transformers/models/bark/configuration_bark.py +++ b/src/transformers/models/bark/configuration_bark.py @@ -13,6 +13,8 @@ # limitations under the License. 
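The Bamba `__post_init__`/`validate_architecture` split above preserves the old `mamba_d_head="auto"` arithmetic; a worked check with the defaults shown in this diff, for illustration only:

```python
# Defaults from the converted BambaConfig above.
mamba_expand, hidden_size, mamba_n_heads = 2, 4096, 128

mamba_intermediate = mamba_expand * hidden_size      # 8192
assert mamba_intermediate % mamba_n_heads == 0       # checked in validate_architecture
mamba_d_head = mamba_intermediate // mamba_n_heads   # "auto" resolves to 64
assert mamba_d_head * mamba_n_heads == mamba_intermediate
print(mamba_d_head)  # 64
```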
"""BARK model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring, logging from ..auto import CONFIG_MAPPING, AutoConfig @@ -22,6 +24,7 @@ @auto_docstring(checkpoint="suno/bark") +@strict(accept_kwargs=True) class BarkSubModelConfig(PreTrainedConfig): r""" block_size (`int`, *optional*, defaults to 1024): @@ -48,37 +51,36 @@ class BarkSubModelConfig(PreTrainedConfig): "window_size": "block_size", } - def __init__( - self, - block_size=1024, - input_vocab_size=10_048, - output_vocab_size=10_048, - num_layers=12, - num_heads=12, - hidden_size=768, - dropout=0.0, - bias=True, # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster - initializer_range=0.02, - use_cache=True, - **kwargs, - ): - self.block_size = block_size - self.input_vocab_size = input_vocab_size - self.output_vocab_size = output_vocab_size - self.num_layers = num_layers - self.num_heads = num_heads - self.hidden_size = hidden_size - self.dropout = dropout - self.bias = bias - self.use_cache = use_cache - self.initializer_range = initializer_range - - super().__init__(**kwargs) + block_size: int = 1024 + input_vocab_size: int = 10_048 + output_vocab_size: int = 10_048 + num_layers: int = 12 + num_heads: int = 12 + hidden_size: int = 768 + dropout: float | int = 0.0 + bias: bool = True + initializer_range: float = 0.02 + use_cache: bool = True @auto_docstring(checkpoint="suno/bark") +@strict(accept_kwargs=True) class BarkSemanticConfig(BarkSubModelConfig): r""" + block_size (`int`, *optional*, defaults to 1024): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + input_vocab_size (`int`, *optional*, defaults to 10_048): + Vocabulary size of a Bark sub-model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`{model}`]. Defaults to 10_048 but should be carefully thought with + regards to the chosen sub-model. + output_vocab_size (`int`, *optional*, defaults to 10_048): + Output vocabulary size of a Bark sub-model. Defines the number of different tokens that can be represented + by the: `output_ids` when passing forward a [`{model}`]. Defaults to 10_048 but should be carefully thought + with regards to the chosen sub-model. + bias (`bool`, *optional*, defaults to `True`): + Whether or not to use bias in the linear layers and layer norm layers + Example: ```python @@ -99,8 +101,23 @@ class BarkSemanticConfig(BarkSubModelConfig): @auto_docstring(checkpoint="suno/bark") +@strict(accept_kwargs=True) class BarkCoarseConfig(BarkSubModelConfig): r""" + block_size (`int`, *optional*, defaults to 1024): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + input_vocab_size (`int`, *optional*, defaults to 10_048): + Vocabulary size of a Bark sub-model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`{model}`]. Defaults to 10_048 but should be carefully thought with + regards to the chosen sub-model. + output_vocab_size (`int`, *optional*, defaults to 10_048): + Output vocabulary size of a Bark sub-model. Defines the number of different tokens that can be represented + by the: `output_ids` when passing forward a [`{model}`]. 
Defaults to 10_048 but should be carefully thought + with regards to the chosen sub-model. + bias (`bool`, *optional*, defaults to `True`): + Whether or not to use bias in the linear layers and layer norm layers + Example: ```python @@ -121,8 +138,22 @@ class BarkCoarseConfig(BarkSubModelConfig): @auto_docstring(checkpoint="suno/bark") +@strict(accept_kwargs=True) class BarkFineConfig(BarkSubModelConfig): r""" + block_size (`int`, *optional*, defaults to 1024): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + input_vocab_size (`int`, *optional*, defaults to 10_048): + Vocabulary size of a Bark sub-model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`{model}`]. Defaults to 10_048 but should be carefully thought with + regards to the chosen sub-model. + output_vocab_size (`int`, *optional*, defaults to 10_048): + Output vocabulary size of a Bark sub-model. Defines the number of different tokens that can be represented + by the: `output_ids` when passing forward a [`{model}`]. Defaults to 10_048 but should be carefully thought + with regards to the chosen sub-model. + bias (`bool`, *optional*, defaults to `True`): + Whether or not to use bias in the linear layers and layer norm layers n_codes_total (`int`, *optional*, defaults to 8): The total number of audio codebooks predicted. Used in the fine acoustics sub-model. n_codes_given (`int`, *optional*, defaults to 1): @@ -147,15 +178,13 @@ class BarkFineConfig(BarkSubModelConfig): model_type = "fine_acoustics" base_config_key = "fine_acoustics_config" - def __init__(self, tie_word_embeddings=True, n_codes_total=8, n_codes_given=1, **kwargs): - self.n_codes_total = n_codes_total - self.n_codes_given = n_codes_given - - self.tie_word_embeddings = tie_word_embeddings - super().__init__(**kwargs) + tie_word_embeddings: bool = True + n_codes_total: int = 8 + n_codes_given: int = 1 @auto_docstring(checkpoint="suno/bark") +@strict(accept_kwargs=True) class BarkConfig(PreTrainedConfig): r""" semantic_config ([`BarkSemanticConfig`], *optional*): @@ -206,51 +235,41 @@ class BarkConfig(PreTrainedConfig): "fine_acoustics_config": BarkFineConfig, "codec_config": AutoConfig, } - - def __init__( - self, - semantic_config: dict | None = None, - coarse_acoustics_config: dict | None = None, - fine_acoustics_config: dict | None = None, - codec_config: dict | None = None, - initializer_range=0.02, - **kwargs, - ): - if semantic_config is None: - semantic_config = BarkSemanticConfig() + semantic_config: dict | PreTrainedConfig | None = None + coarse_acoustics_config: dict | PreTrainedConfig | None = None + fine_acoustics_config: dict | PreTrainedConfig | None = None + codec_config: dict | PreTrainedConfig | None = None + initializer_range: float = 0.02 + + def __post_init__(self, **kwargs): + if self.semantic_config is None: + self.semantic_config = BarkSemanticConfig() logger.info("`semantic_config` is `None`. 
Initializing the `BarkSemanticConfig` with default values.") - elif isinstance(semantic_config, dict): - semantic_config = BarkSemanticConfig(**semantic_config) + elif isinstance(self.semantic_config, dict): + self.semantic_config = BarkSemanticConfig(**self.semantic_config) - if coarse_acoustics_config is None: - coarse_acoustics_config = BarkCoarseConfig() + if self.coarse_acoustics_config is None: + self.coarse_acoustics_config = BarkCoarseConfig() logger.info( "`coarse_acoustics_config` is `None`. Initializing the `BarkCoarseConfig` with default values." ) - elif isinstance(coarse_acoustics_config, dict): - coarse_acoustics_config = BarkCoarseConfig(**coarse_acoustics_config) + elif isinstance(self.coarse_acoustics_config, dict): + self.coarse_acoustics_config = BarkCoarseConfig(**self.coarse_acoustics_config) - if fine_acoustics_config is None: - fine_acoustics_config = BarkFineConfig() + if self.fine_acoustics_config is None: + self.fine_acoustics_config = BarkFineConfig() logger.info("`fine_acoustics_config` is `None`. Initializing the `BarkFineConfig` with default values.") - elif isinstance(fine_acoustics_config, dict): - fine_acoustics_config = BarkFineConfig(**fine_acoustics_config) + elif isinstance(self.fine_acoustics_config, dict): + self.fine_acoustics_config = BarkFineConfig(**self.fine_acoustics_config) - if codec_config is None: - codec_config = CONFIG_MAPPING["encodec"]() + if self.codec_config is None: + self.codec_config = CONFIG_MAPPING["encodec"]() logger.info("`codec_config` is `None`. Initializing the `codec_config` with default values.") - elif isinstance(codec_config, dict): - codec_model_type = codec_config.get("model_type", "encodec") - codec_config = CONFIG_MAPPING[codec_model_type](**codec_config) - - self.semantic_config = semantic_config - self.coarse_acoustics_config = coarse_acoustics_config - self.fine_acoustics_config = fine_acoustics_config - self.codec_config = codec_config - - self.initializer_range = initializer_range + elif isinstance(self.codec_config, dict): + codec_model_type = self.codec_config.get("model_type", "encodec") + self.codec_config = CONFIG_MAPPING[codec_model_type](**self.codec_config) - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["BarkCoarseConfig", "BarkConfig", "BarkFineConfig", "BarkSemanticConfig"] diff --git a/src/transformers/models/bark/modeling_bark.py b/src/transformers/models/bark/modeling_bark.py index a04f5324933d..e688affe2cdd 100644 --- a/src/transformers/models/bark/modeling_bark.py +++ b/src/transformers/models/bark/modeling_bark.py @@ -413,7 +413,7 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict loss = None if labels is not None: @@ -1015,7 +1015,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict loss = None if labels is not None: diff --git a/src/transformers/models/bart/configuration_bart.py b/src/transformers/models/bart/configuration_bart.py index c7b97e7422dd..a445de9c6929 100644 --- 
a/src/transformers/models/bart/configuration_bart.py +++ b/src/transformers/models/bart/configuration_bart.py @@ -13,14 +13,14 @@ # limitations under the License. """BART model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="facebook/bart-large") +@strict(accept_kwargs=True) class BartConfig(PreTrainedConfig): r""" Example: @@ -40,72 +40,47 @@ class BartConfig(PreTrainedConfig): model_type = "bart" keys_to_ignore_at_inference = ["past_key_values"] - attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"} + attribute_map = { + "num_attention_heads": "encoder_attention_heads", + "hidden_size": "d_model", + "num_hidden_layers": "encoder_layers", + } + + vocab_size: int = 50265 + max_position_embeddings: int = 1024 + encoder_layers: int | None = 12 + encoder_ffn_dim: int | None = 4096 + encoder_attention_heads: int | None = 16 + decoder_layers: int | None = 12 + decoder_ffn_dim: int | None = 4096 + decoder_attention_heads: int | None = 16 + encoder_layerdrop: float | None = 0.0 + decoder_layerdrop: float | None = 0.0 + activation_function: str | None = "gelu" + d_model: int | None = 1024 + dropout: float | int | None = 0.1 + attention_dropout: float | int | None = 0.0 + activation_dropout: float | int | None = 0.0 + init_std: float | None = 0.02 + classifier_dropout: float | int | None = 0.0 + scale_embedding: bool | None = False + use_cache: bool = True + pad_token_id: int | None = 1 + bos_token_id: int | None = 0 + eos_token_id: int | list[int] | None = 2 + is_encoder_decoder: bool | None = True + decoder_start_token_id: int | None = 2 + forced_eos_token_id: int | list[int] | None = 2 + is_decoder: bool | None = False + tie_word_embeddings: bool = True - def __init__( - self, - vocab_size=50265, - max_position_embeddings=1024, - encoder_layers=12, - encoder_ffn_dim=4096, - encoder_attention_heads=16, - decoder_layers=12, - decoder_ffn_dim=4096, - decoder_attention_heads=16, - encoder_layerdrop=0.0, - decoder_layerdrop=0.0, - activation_function="gelu", - d_model=1024, - dropout=0.1, - attention_dropout=0.0, - activation_dropout=0.0, - init_std=0.02, - classifier_dropout=0.0, - scale_embedding=False, - use_cache=True, - num_labels=3, - pad_token_id=1, - bos_token_id=0, - eos_token_id=2, - is_encoder_decoder=True, - decoder_start_token_id=2, - forced_eos_token_id=2, - is_decoder=False, - tie_word_embeddings=True, - **kwargs, - ): - self.is_decoder = is_decoder - self.tie_word_embeddings = tie_word_embeddings - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.d_model = d_model - self.encoder_ffn_dim = encoder_ffn_dim - self.encoder_layers = encoder_layers - self.encoder_attention_heads = encoder_attention_heads - self.decoder_ffn_dim = decoder_ffn_dim - self.decoder_layers = decoder_layers - self.decoder_attention_heads = decoder_attention_heads - self.dropout = dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.activation_function = activation_function - self.init_std = init_std - self.encoder_layerdrop = encoder_layerdrop - self.decoder_layerdrop = decoder_layerdrop - self.classifier_dropout = classifier_dropout - self.use_cache = use_cache - self.num_hidden_layers = encoder_layers - 
self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True + def __post_init__(self, **kwargs): + # Set the default `num_labels` only if `id2label` is not + # yet set, i.e. the user didn't pass `id2label/label2id` in kwargs + if self.id2label is None: + self.num_labels = kwargs.pop("num_labels", 3) - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.decoder_start_token_id = decoder_start_token_id - super().__init__( - num_labels=num_labels, - is_encoder_decoder=is_encoder_decoder, - **kwargs, - ) + super().__post_init__(**kwargs) __all__ = ["BartConfig"] diff --git a/src/transformers/models/beit/configuration_beit.py b/src/transformers/models/beit/configuration_beit.py index 2917a5e3929e..579789875fd5 100644 --- a/src/transformers/models/beit/configuration_beit.py +++ b/src/transformers/models/beit/configuration_beit.py @@ -13,12 +13,15 @@ # limitations under the License. """BEiT model configuration""" +from huggingface_hub.dataclasses import strict + from ...backbone_utils import BackboneConfigMixin from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring @auto_docstring(checkpoint="microsoft/beit-base-patch16-224-pt22k") +@strict(accept_kwargs=True) class BeitConfig(BackboneConfigMixin, PreTrainedConfig): r""" use_mask_token (`bool`, *optional*, defaults to `False`): @@ -66,81 +69,49 @@ class BeitConfig(BackboneConfigMixin, PreTrainedConfig): model_type = "beit" - def __init__( - self, - vocab_size=8192, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - initializer_range=0.02, - layer_norm_eps=1e-12, - image_size=224, - patch_size=16, - num_channels=3, - use_mask_token=False, - use_absolute_position_embeddings=False, - use_relative_position_bias=False, - use_shared_relative_position_bias=False, - layer_scale_init_value=0.1, - drop_path_rate=0.1, - use_mean_pooling=True, - pool_scales=[1, 2, 3, 6], - use_auxiliary_head=True, - auxiliary_loss_weight=0.4, - auxiliary_channels=256, - auxiliary_num_convs=1, - auxiliary_concat_input=False, - semantic_loss_ignore_index=255, - out_features=None, - out_indices=None, - add_fpn=False, - reshape_hidden_states=True, - **kwargs, - ): - if "segmentation_indices" in kwargs and out_indices is None: - out_indices = kwargs.pop("segmentation_indices") - super().__init__(**kwargs) - - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.use_mask_token = use_mask_token - self.use_absolute_position_embeddings = use_absolute_position_embeddings - self.use_relative_position_bias = use_relative_position_bias - self.use_shared_relative_position_bias = use_shared_relative_position_bias - self.layer_scale_init_value = layer_scale_init_value - self.drop_path_rate = drop_path_rate - self.use_mean_pooling = use_mean_pooling - # decode head attributes (semantic segmentation) - self.pool_scales = pool_scales - # auxiliary head attributes (semantic
segmentation) - self.use_auxiliary_head = use_auxiliary_head - self.auxiliary_loss_weight = auxiliary_loss_weight - self.auxiliary_channels = auxiliary_channels - self.auxiliary_num_convs = auxiliary_num_convs - self.auxiliary_concat_input = auxiliary_concat_input - self.semantic_loss_ignore_index = semantic_loss_ignore_index + vocab_size: int = 8192 + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.0 + attention_probs_dropout_prob: float = 0.0 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + image_size: int | list[int] | tuple[int, int] = 224 + patch_size: int | list[int] | tuple[int, int] = 16 + num_channels: int = 3 + use_mask_token: bool = False + use_absolute_position_embeddings: bool = False + use_relative_position_bias: bool = False + use_shared_relative_position_bias: bool = False + layer_scale_init_value: float = 0.1 + drop_path_rate: float = 0.1 + use_mean_pooling: bool = True + pool_scales: list[int] | tuple[int, ...] = (1, 2, 3, 6) + use_auxiliary_head: bool = True + auxiliary_loss_weight: float = 0.4 + auxiliary_channels: int = 256 + auxiliary_num_convs: int = 1 + auxiliary_concat_input: bool = False + semantic_loss_ignore_index: int = 255 + _out_features: list[str] | None = None + _out_indices: list[int] | None = None + add_fpn: bool = False + reshape_hidden_states: bool = True + + def __post_init__(self, **kwargs): + if "segmentation_indices" in kwargs and kwargs.get("out_indices") is None: + kwargs["out_indices"] = kwargs.pop("segmentation_indices") # backbone attributes self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, self.num_hidden_layers + 1)] - self.set_output_features_output_indices(out_indices=out_indices, out_features=out_features) - self.add_fpn = add_fpn - self.reshape_hidden_states = reshape_hidden_states + self.set_output_features_output_indices( + out_indices=kwargs.pop("out_indices", None), out_features=kwargs.pop("out_features", None) + ) + + super().__post_init__(**kwargs) __all__ = ["BeitConfig"] diff --git a/src/transformers/models/beit/modeling_beit.py b/src/transformers/models/beit/modeling_beit.py index a2b40dc42556..0a169bf55a1d 100755 --- a/src/transformers/models/beit/modeling_beit.py +++ b/src/transformers/models/beit/modeling_beit.py @@ -735,7 +735,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict embedding_output, _ = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos) resolution = pixel_values.shape[2:] @@ -854,7 +854,7 @@ def forward( >>> list(logits.shape) [1, 196, 8192] ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.beit( pixel_values, @@ -922,7 +922,7 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.beit( pixel_values, output_attentions=output_attentions, @@ -1275,7 +1275,7 @@ def forward( >>> # logits are of shape (batch_size, num_labels, height, width) >>> logits = outputs.logits ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) @@ -1407,7 +1407,7 @@ def forward( >>> list(feature_maps[-1].shape) [1, 768, 14, 14] ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) diff --git a/src/transformers/models/bert/configuration_bert.py b/src/transformers/models/bert/configuration_bert.py index 33f277d1dccd..979b9618da8f 100644 --- a/src/transformers/models/bert/configuration_bert.py +++ b/src/transformers/models/bert/configuration_bert.py @@ -14,14 +14,14 @@ # limitations under the License. """BERT model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="google-bert/bert-base-uncased") +@strict(accept_kwargs=True) class BertConfig(PreTrainedConfig): r""" Examples: @@ -41,52 +41,26 @@ class BertConfig(PreTrainedConfig): model_type = "bert" - def __init__( - self, - vocab_size=30522, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=2, - initializer_range=0.02, - layer_norm_eps=1e-12, - pad_token_id=0, - use_cache=True, - classifier_dropout=None, - is_decoder=False, - add_cross_attention=False, - bos_token_id=None, - eos_token_id=None, - tie_word_embeddings=True, - **kwargs, - ): - super().__init__(**kwargs) - self.pad_token_id = pad_token_id - self.is_decoder = is_decoder - self.add_cross_attention = add_cross_attention - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.use_cache = use_cache - self.classifier_dropout = classifier_dropout + vocab_size: int = 30522 + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.1 + attention_probs_dropout_prob: float 
= 0.1 + max_position_embeddings: int = 512 + type_vocab_size: int = 2 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + pad_token_id: int | None = 0 + use_cache: bool = True + classifier_dropout: float | int | None = None + is_decoder: bool = False + add_cross_attention: bool = False + bos_token_id: int | None = None + eos_token_id: int | list[int] | None = None + tie_word_embeddings: bool = True __all__ = ["BertConfig"] diff --git a/src/transformers/models/bert_generation/configuration_bert_generation.py b/src/transformers/models/bert_generation/configuration_bert_generation.py index 58d13c1e708a..ce0481db12fa 100644 --- a/src/transformers/models/bert_generation/configuration_bert_generation.py +++ b/src/transformers/models/bert_generation/configuration_bert_generation.py @@ -13,11 +13,14 @@ # limitations under the License. """BertGeneration model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring @auto_docstring(checkpoint="google/bert_for_seq_generation_L-24_bbc_encoder") +@strict(accept_kwargs=True) class BertGenerationConfig(PreTrainedConfig): r""" Examples: @@ -37,48 +40,24 @@ class BertGenerationConfig(PreTrainedConfig): model_type = "bert-generation" - def __init__( - self, - vocab_size=50358, - hidden_size=1024, - num_hidden_layers=24, - num_attention_heads=16, - intermediate_size=4096, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - initializer_range=0.02, - layer_norm_eps=1e-12, - pad_token_id=0, - bos_token_id=2, - eos_token_id=1, - use_cache=True, - is_decoder=False, - add_cross_attention=False, - tie_word_embeddings=True, - **kwargs, - ): - super().__init__(**kwargs) - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - - self.is_decoder = is_decoder - self.add_cross_attention = add_cross_attention - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.use_cache = use_cache + vocab_size: int = 50358 + hidden_size: int = 1024 + num_hidden_layers: int = 24 + num_attention_heads: int = 16 + intermediate_size: int = 4096 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.1 + attention_probs_dropout_prob: float = 0.1 + max_position_embeddings: int = 512 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + pad_token_id: int | None = 0 + bos_token_id: int | None = 2 + eos_token_id: int | list[int] | None = 1 + use_cache: bool = True + is_decoder: bool = False + add_cross_attention: bool = False + tie_word_embeddings: bool = True __all__ = ["BertGenerationConfig"] diff --git a/src/transformers/models/big_bird/configuration_big_bird.py b/src/transformers/models/big_bird/configuration_big_bird.py index 820656b81b50..e91c8c8b3ba0 100644 --- a/src/transformers/models/big_bird/configuration_big_bird.py +++ b/src/transformers/models/big_bird/configuration_big_bird.py @@ -13,14 +13,14 @@ # limitations under the License. 
"""BigBird model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="google/bigbird-roberta-base") +@strict(accept_kwargs=True) class BigBirdConfig(PreTrainedConfig): r""" attention_type (`str`, *optional*, defaults to `"block_sparse"`): @@ -53,65 +53,32 @@ class BigBirdConfig(PreTrainedConfig): model_type = "big_bird" - def __init__( - self, - vocab_size=50358, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu_new", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=4096, - type_vocab_size=2, - initializer_range=0.02, - layer_norm_eps=1e-12, - use_cache=True, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - sep_token_id=66, - attention_type="block_sparse", - use_bias=True, - rescale_embeddings=False, - block_size=64, - num_random_blocks=3, - classifier_dropout=None, - is_decoder=False, - add_cross_attention=False, - tie_word_embeddings=True, - **kwargs, - ): - super().__init__(**kwargs) - - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.sep_token_id = sep_token_id - self.tie_word_embeddings = tie_word_embeddings - self.is_decoder = is_decoder - self.add_cross_attention = add_cross_attention - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.initializer_range = initializer_range - self.type_vocab_size = type_vocab_size - self.layer_norm_eps = layer_norm_eps - self.use_cache = use_cache - - self.rescale_embeddings = rescale_embeddings - self.attention_type = attention_type - self.use_bias = use_bias - self.block_size = block_size - self.num_random_blocks = num_random_blocks - self.classifier_dropout = classifier_dropout + vocab_size: int = 50358 + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu_new" + hidden_dropout_prob: float = 0.1 + attention_probs_dropout_prob: float = 0.1 + max_position_embeddings: int = 4096 + type_vocab_size: int = 2 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + use_cache: int = True + pad_token_id: int | None = 0 + bos_token_id: int | None = 1 + eos_token_id: int | list[int] | None = 2 + sep_token_id: int | None = 66 + attention_type: str = "block_sparse" + use_bias: bool = True + rescale_embeddings: bool = False + block_size: int = 64 + num_random_blocks: int = 3 + classifier_dropout: float | int | None = None + is_decoder: bool = False + add_cross_attention: bool = False + tie_word_embeddings: bool = True __all__ = ["BigBirdConfig"] diff --git a/src/transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py b/src/transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py index ab52993e8e15..6d97217bf6fa 100644 --- a/src/transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +++ 
b/src/transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py @@ -13,14 +13,14 @@ # limitations under the License. """BigBirdPegasus model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="google/bigbird-pegasus-large-arxiv") +@strict(accept_kwargs=True) class BigBirdPegasusConfig(PreTrainedConfig): r""" attention_type (`str`, *optional*, defaults to `"block_sparse"`): @@ -57,76 +57,39 @@ class BigBirdPegasusConfig(PreTrainedConfig): "num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model", "attention_probs_dropout_prob": "attention_dropout", + "num_hidden_layers": "encoder_layers", } - def __init__( - self, - vocab_size=96103, - max_position_embeddings=4096, - encoder_layers=16, - encoder_ffn_dim=4096, - encoder_attention_heads=16, - decoder_layers=16, - decoder_ffn_dim=4096, - decoder_attention_heads=16, - encoder_layerdrop=0.0, - decoder_layerdrop=0.0, - use_cache=True, - is_encoder_decoder=True, - activation_function="gelu_new", - d_model=1024, - dropout=0.1, - attention_dropout=0.0, - activation_dropout=0.0, - init_std=0.02, - decoder_start_token_id=2, - classifier_dropout=0.0, - scale_embedding=True, - pad_token_id=0, - bos_token_id=2, - eos_token_id=1, - attention_type="block_sparse", # only for encoder - block_size=64, - num_random_blocks=3, - use_bias=False, - is_decoder=False, - tie_word_embeddings=True, - **kwargs, - ): - self.is_decoder = is_decoder - self.tie_word_embeddings = tie_word_embeddings - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.d_model = d_model - self.encoder_ffn_dim = encoder_ffn_dim - self.encoder_layers = encoder_layers - self.encoder_attention_heads = encoder_attention_heads - self.decoder_ffn_dim = decoder_ffn_dim - self.decoder_layers = decoder_layers - self.decoder_attention_heads = decoder_attention_heads - self.dropout = dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.activation_function = activation_function - self.init_std = init_std - self.encoder_layerdrop = encoder_layerdrop - self.decoder_layerdrop = decoder_layerdrop - self.classifier_dropout = classifier_dropout - self.use_cache = use_cache - self.num_hidden_layers = encoder_layers - self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True - - # extra config - self.attention_type = attention_type - self.block_size = block_size - self.num_random_blocks = num_random_blocks - self.use_bias = use_bias - - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.decoder_start_token_id = decoder_start_token_id - super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + vocab_size: int = 96103 + max_position_embeddings: int = 4096 + encoder_layers: int = 16 + encoder_ffn_dim: int = 4096 + encoder_attention_heads: int = 16 + decoder_layers: int = 16 + decoder_ffn_dim: int = 4096 + decoder_attention_heads: int = 16 + encoder_layerdrop: float | int = 0.0 + decoder_layerdrop: float | int = 0.0 + use_cache: bool = True + is_encoder_decoder: bool = True + activation_function: str = "gelu_new" + d_model: int = 1024 + dropout: float | int = 0.1 + attention_dropout: float | int = 0.0 + activation_dropout: float | int = 
0.0 + init_std: float = 0.02 + decoder_start_token_id: int = 2 + classifier_dropout: float | int = 0.0 + scale_embedding: bool = True + pad_token_id: int | None = 0 + bos_token_id: int | None = 2 + eos_token_id: int | list[int] | None = 1 + attention_type: str = "block_sparse" # only for encoder + block_size: int = 64 + num_random_blocks: int = 3 + use_bias: bool = False + is_decoder: bool = False + tie_word_embeddings: bool = True __all__ = ["BigBirdPegasusConfig"] diff --git a/src/transformers/models/biogpt/configuration_biogpt.py b/src/transformers/models/biogpt/configuration_biogpt.py index 7f8193e0b2b5..2c92e6e76647 100644 --- a/src/transformers/models/biogpt/configuration_biogpt.py +++ b/src/transformers/models/biogpt/configuration_biogpt.py @@ -13,14 +13,14 @@ # limitations under the License. """BioGPT model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="microsoft/biogpt") +@strict(accept_kwargs=True) class BioGptConfig(PreTrainedConfig): r""" Example: @@ -40,49 +40,25 @@ class BioGptConfig(PreTrainedConfig): model_type = "biogpt" - def __init__( - self, - vocab_size=42384, - hidden_size=1024, - num_hidden_layers=24, - num_attention_heads=16, - intermediate_size=4096, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=1024, - initializer_range=0.02, - layer_norm_eps=1e-12, - scale_embedding=True, - use_cache=True, - layerdrop=0.0, - activation_dropout=0.0, - pad_token_id=1, - bos_token_id=0, - eos_token_id=2, - tie_word_embeddings=True, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.scale_embedding = scale_embedding - self.use_cache = use_cache - self.layerdrop = layerdrop - self.activation_dropout = activation_dropout - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - super().__init__(**kwargs) + vocab_size: int = 42384 + hidden_size: int = 1024 + num_hidden_layers: int = 24 + num_attention_heads: int = 16 + intermediate_size: int = 4096 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.1 + attention_probs_dropout_prob: float = 0.1 + max_position_embeddings: int = 1024 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + scale_embedding: bool = True + use_cache: bool = True + layerdrop: float | int = 0.0 + activation_dropout: float | int = 0.0 + pad_token_id: int | None = 1 + bos_token_id: int | None = 0 + eos_token_id: int | list[int] | None = 2 + tie_word_embeddings: bool = True __all__ = ["BioGptConfig"] diff --git a/src/transformers/models/bit/configuration_bit.py b/src/transformers/models/bit/configuration_bit.py index cc1dfed8beb3..8409199a1f18 100644 --- a/src/transformers/models/bit/configuration_bit.py +++ b/src/transformers/models/bit/configuration_bit.py 
@@ -13,15 +13,15 @@ # limitations under the License. """BiT model configuration""" +from huggingface_hub.dataclasses import strict + from ...backbone_utils import BackboneConfigMixin from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - - -logger = logging.get_logger(__name__) +from ...utils import auto_docstring @auto_docstring(checkpoint="google/bit-50") +@strict(accept_kwargs=True) class BitConfig(BackboneConfigMixin, PreTrainedConfig): r""" layer_type (`str`, *optional*, defaults to `"preactivation"`): @@ -52,49 +52,44 @@ class BitConfig(BackboneConfigMixin, PreTrainedConfig): model_type = "bit" layer_types = ["preactivation", "bottleneck"] - supported_padding = ["SAME", "VALID"] - - def __init__( - self, - num_channels=3, - embedding_size=64, - hidden_sizes=[256, 512, 1024, 2048], - depths=[3, 4, 6, 3], - layer_type="preactivation", - hidden_act="relu", - global_padding=None, - num_groups=32, - drop_path_rate=0.0, - embedding_dynamic_padding=False, - output_stride=32, - width_factor=1, - out_features=None, - out_indices=None, - **kwargs, - ): - super().__init__(**kwargs) - if layer_type not in self.layer_types: - raise ValueError(f"layer_type={layer_type} is not one of {','.join(self.layer_types)}") - if global_padding is not None: - if global_padding.upper() in self.supported_padding: - global_padding = global_padding.upper() - else: - raise ValueError(f"Padding strategy {global_padding} not supported") - self.num_channels = num_channels - self.embedding_size = embedding_size - self.hidden_sizes = hidden_sizes - self.depths = depths - self.layer_type = layer_type - self.hidden_act = hidden_act - self.global_padding = global_padding - self.num_groups = num_groups - self.drop_path_rate = drop_path_rate - self.embedding_dynamic_padding = embedding_dynamic_padding - self.output_stride = output_stride - self.width_factor = width_factor - - self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)] - self.set_output_features_output_indices(out_indices=out_indices, out_features=out_features) + supported_padding = [None, "SAME", "VALID"] + + num_channels: int = 3 + embedding_size: int = 64 + hidden_sizes: list[int] | tuple[int, ...] = (256, 512, 1024, 2048) + depths: list[int] | tuple[int, ...] = (3, 4, 6, 3) + layer_type: str = "preactivation" + hidden_act: str = "relu" + global_padding: str | None = None + num_groups: int = 32 + drop_path_rate: float = 0.0 + embedding_dynamic_padding: bool = False + output_stride: int = 32 + width_factor: int = 1 + _out_features: list[str] | None = None + _out_indices: list[int] | None = None + + def __post_init__(self, **kwargs): + self.hidden_sizes = list(self.hidden_sizes) + self.depths = list(self.depths) + + if self.global_padding is not None: + self.global_padding = self.global_padding.upper() + + self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(self.depths) + 1)] + self.set_output_features_output_indices( + out_indices=kwargs.pop("out_indices", None), out_features=kwargs.pop("out_features", None) + ) + + super().__post_init__(**kwargs) + + def validate_architecture(self): + """Part of `@strict`-powered validation. 
Validates the architecture of the config.""" + if self.layer_type not in self.layer_types: + raise ValueError(f"layer_type={self.layer_type} is not one of {','.join(self.layer_types)}") + + if self.global_padding not in self.supported_padding: + raise ValueError(f"Padding strategy {self.global_padding} not supported") __all__ = ["BitConfig"] diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py index 66b5a62234a5..e2056f057f4a 100644 --- a/src/transformers/models/bit/modeling_bit.py +++ b/src/transformers/models/bit/modeling_bit.py @@ -678,7 +678,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict embedding_output = self.embedder(pixel_values) @@ -735,7 +735,7 @@ def forward( Labels for computing the image classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.bit(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict) @@ -802,7 +802,7 @@ def forward( >>> inputs = processor(image, return_tensors="pt") >>> outputs = model(**inputs) ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) diff --git a/src/transformers/models/bitnet/configuration_bitnet.py b/src/transformers/models/bitnet/configuration_bitnet.py index 1dfc9998c38b..7a77138cb056 100644 --- a/src/transformers/models/bitnet/configuration_bitnet.py +++ b/src/transformers/models/bitnet/configuration_bitnet.py @@ -12,15 +12,15 @@ # See the License for the specific language governing permissions and """BitNet model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters -from ...utils import auto_docstring, logging - - -logger = logging.get_logger(__name__) +from ...utils import auto_docstring @auto_docstring(checkpoint="microsoft/bitnet-b1.58-2B-4T") +@strict(accept_kwargs=True) class BitNetConfig(PreTrainedConfig): r""" ```python @@ -40,53 +40,31 @@ class BitNetConfig(PreTrainedConfig): keys_to_ignore_at_inference = ["past_key_values"] default_theta = 500000.0 - def __init__( - self, - vocab_size: int | None = 128256, - hidden_size: int | None = 2560, - intermediate_size: int | None = 6912, - num_hidden_layers: int | None = 30, - num_attention_heads: int | None = 20, - num_key_value_heads: int | None = 5, - hidden_act: str | None = "relu2", - max_position_embeddings: int | None = 2048, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-5, - use_cache: bool | None = True, - pad_token_id: int | None = None, - bos_token_id: int | None = 128000, - eos_token_id: int | None = 128001, - tie_word_embeddings: bool | None = False, - attention_bias: bool | None = False, - attention_dropout: str | None = 0.0, - rope_parameters: RopeParameters | 
dict[str, RopeParameters] | None = None, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads + vocab_size: int = 128256 + hidden_size: int = 2560 + intermediate_size: int = 6912 + num_hidden_layers: int = 30 + num_attention_heads: int = 20 + num_key_value_heads: int | None = 5 + hidden_act: str = "relu2" + max_position_embeddings: int = 2048 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-5 + use_cache: bool = True + pad_token_id: int | None = None + bos_token_id: int | None = 128000 + eos_token_id: int | list[int] | None = 128001 + tie_word_embeddings: bool = False + attention_bias: bool = False + attention_dropout: float | int | None = 0.0 + rope_parameters: RopeParameters | dict | None = None + def __post_init__(self, **kwargs): # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.rope_parameters = rope_parameters + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["BitNetConfig"] diff --git a/src/transformers/models/blenderbot/configuration_blenderbot.py b/src/transformers/models/blenderbot/configuration_blenderbot.py index 9c350ff0f274..2d97047e45f3 100644 --- a/src/transformers/models/blenderbot/configuration_blenderbot.py +++ b/src/transformers/models/blenderbot/configuration_blenderbot.py @@ -13,14 +13,14 @@ # limitations under the License. 
"""Blenderbot model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="facebook/blenderbot-3B") +@strict(accept_kwargs=True) class BlenderbotConfig(PreTrainedConfig): r""" encoder_no_repeat_ngram_size (`int`, *optional*, defaults to 3): @@ -43,71 +43,39 @@ class BlenderbotConfig(PreTrainedConfig): model_type = "blenderbot" keys_to_ignore_at_inference = ["past_key_values"] - attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"} - - def __init__( - self, - vocab_size=8008, - max_position_embeddings=128, - encoder_layers=2, - encoder_ffn_dim=10240, - encoder_attention_heads=32, - decoder_layers=24, - decoder_ffn_dim=10240, - decoder_attention_heads=32, - encoder_layerdrop=0.0, - decoder_layerdrop=0.0, - use_cache=True, - is_encoder_decoder=True, - activation_function="gelu", - d_model=2560, - dropout=0.1, - attention_dropout=0.0, - activation_dropout=0.0, - init_std=0.02, - decoder_start_token_id=1, - scale_embedding=False, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - encoder_no_repeat_ngram_size=3, - forced_eos_token_id=2, - is_decoder=False, - tie_word_embeddings=True, - **kwargs, - ): - self.is_decoder = is_decoder - self.tie_word_embeddings = tie_word_embeddings - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.d_model = d_model - self.encoder_ffn_dim = encoder_ffn_dim - self.encoder_layers = encoder_layers - self.encoder_attention_heads = encoder_attention_heads - self.decoder_ffn_dim = decoder_ffn_dim - self.decoder_layers = decoder_layers - self.decoder_attention_heads = decoder_attention_heads - self.dropout = dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.activation_function = activation_function - self.init_std = init_std - self.encoder_layerdrop = encoder_layerdrop - self.decoder_layerdrop = decoder_layerdrop - self.use_cache = use_cache - self.num_hidden_layers = encoder_layers - self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True + attribute_map = { + "num_attention_heads": "encoder_attention_heads", + "hidden_size": "d_model", + "num_hidden_layers": "encoder_layers", + } - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.decoder_start_token_id = decoder_start_token_id - super().__init__( - is_encoder_decoder=is_encoder_decoder, - encoder_no_repeat_ngram_size=encoder_no_repeat_ngram_size, - forced_eos_token_id=forced_eos_token_id, - **kwargs, - ) + vocab_size: int = 8008 + max_position_embeddings: int = 128 + encoder_layers: int = 2 + encoder_ffn_dim: int = 10240 + encoder_attention_heads: int = 32 + decoder_layers: int = 24 + decoder_ffn_dim: int = 10240 + decoder_attention_heads: int = 32 + encoder_layerdrop: float | int = 0.0 + decoder_layerdrop: float | int = 0.0 + use_cache: bool = True + is_encoder_decoder: bool = True + activation_function: str = "gelu" + d_model: int = 2560 + dropout: float | int = 0.1 + attention_dropout: float | int = 0.0 + activation_dropout: float | int = 0.0 + init_std: float = 0.02 + decoder_start_token_id: int = 1 + scale_embedding: bool = False + pad_token_id: int | None = 0 + bos_token_id: int | None = 1 + eos_token_id: int | list[int] 
| None = 2 + encoder_no_repeat_ngram_size: int = 3 + forced_eos_token_id: int | None = 2 + is_decoder: bool = False + tie_word_embeddings: bool = True __all__ = ["BlenderbotConfig"] diff --git a/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py b/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py index 71463fcd63ee..b8a0525a8d5d 100644 --- a/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py @@ -13,14 +13,14 @@ # limitations under the License. """BlenderbotSmall model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="facebook/blenderbot_small-90M") +@strict(accept_kwargs=True) class BlenderbotSmallConfig(PreTrainedConfig): r""" Example: @@ -40,69 +40,38 @@ class BlenderbotSmallConfig(PreTrainedConfig): model_type = "blenderbot-small" keys_to_ignore_at_inference = ["past_key_values"] - attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"} - - def __init__( - self, - vocab_size=50265, - max_position_embeddings=512, - encoder_layers=8, - encoder_ffn_dim=2048, - encoder_attention_heads=16, - decoder_layers=8, - decoder_ffn_dim=2048, - decoder_attention_heads=16, - encoder_layerdrop=0.0, - decoder_layerdrop=0.0, - use_cache=True, - is_encoder_decoder=True, - activation_function="gelu", - d_model=512, - dropout=0.1, - attention_dropout=0.0, - activation_dropout=0.0, - init_std=0.02, - decoder_start_token_id=1, - scale_embedding=False, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - forced_eos_token_id=2, - is_decoder=False, - tie_word_embeddings=True, - **kwargs, - ): - self.is_decoder = is_decoder - self.tie_word_embeddings = tie_word_embeddings - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.d_model = d_model - self.encoder_ffn_dim = encoder_ffn_dim - self.encoder_layers = encoder_layers - self.encoder_attention_heads = encoder_attention_heads - self.decoder_ffn_dim = decoder_ffn_dim - self.decoder_layers = decoder_layers - self.decoder_attention_heads = decoder_attention_heads - self.dropout = dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.activation_function = activation_function - self.init_std = init_std - self.encoder_layerdrop = encoder_layerdrop - self.decoder_layerdrop = decoder_layerdrop - self.use_cache = use_cache - self.num_hidden_layers = encoder_layers - self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True + attribute_map = { + "num_attention_heads": "encoder_attention_heads", + "hidden_size": "d_model", + "num_hidden_layers": "encoder_layers", + } - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.decoder_start_token_id = decoder_start_token_id - super().__init__( - is_encoder_decoder=is_encoder_decoder, - forced_eos_token_id=forced_eos_token_id, - **kwargs, - ) + vocab_size: int = 50265 + max_position_embeddings: int = 512 + encoder_layers: int = 8 + encoder_ffn_dim: int = 2048 + encoder_attention_heads: int = 16 + decoder_layers: int = 8 + decoder_ffn_dim: int = 2048 + decoder_attention_heads: int = 16 + 
encoder_layerdrop: float | int = 0.0 + decoder_layerdrop: float | int = 0.0 + use_cache: bool = True + is_encoder_decoder: bool = True + activation_function: str = "gelu" + d_model: int = 512 + dropout: float | int = 0.1 + attention_dropout: float | int = 0.0 + activation_dropout: float | int = 0.0 + init_std: float = 0.02 + decoder_start_token_id: int | None = 1 + scale_embedding: bool = False + pad_token_id: int | None = 0 + bos_token_id: int | None = 1 + eos_token_id: int | list[int] | None = 2 + forced_eos_token_id: int | None = 2 + is_decoder: bool = False + tie_word_embeddings: bool = True __all__ = ["BlenderbotSmallConfig"] diff --git a/src/transformers/models/blip/configuration_blip.py b/src/transformers/models/blip/configuration_blip.py index 9cf2f3c673c7..b24a935723fc 100644 --- a/src/transformers/models/blip/configuration_blip.py +++ b/src/transformers/models/blip/configuration_blip.py @@ -13,6 +13,8 @@ # limitations under the License. """Blip model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring, logging @@ -21,6 +23,7 @@ @auto_docstring(checkpoint="Salesforce/blip-vqa-base") +@strict(accept_kwargs=True) class BlipTextConfig(PreTrainedConfig): r""" label_smoothing (float, *optional*): @@ -46,55 +49,30 @@ class BlipTextConfig(PreTrainedConfig): model_type = "blip_text_model" base_config_key = "text_config" - def __init__( - self, - vocab_size=30524, - hidden_size=768, - encoder_hidden_size=768, - intermediate_size=3072, - projection_dim=768, - num_hidden_layers=12, - num_attention_heads=8, - max_position_embeddings=512, - hidden_act="gelu", - layer_norm_eps=1e-12, - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - initializer_range=0.02, - bos_token_id=30522, - eos_token_id=2, - pad_token_id=0, - sep_token_id=102, - is_decoder=True, - use_cache=True, - label_smoothing=0.0, - **kwargs, - ): - super().__init__(**kwargs) - - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.sep_token_id = sep_token_id - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.encoder_hidden_size = encoder_hidden_size - self.intermediate_size = intermediate_size - self.projection_dim = projection_dim - self.hidden_dropout_prob = hidden_dropout_prob - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.max_position_embeddings = max_position_embeddings - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.is_decoder = is_decoder - self.use_cache = use_cache - self.label_smoothing = label_smoothing + vocab_size: int = 30524 + hidden_size: int = 768 + encoder_hidden_size: int = 768 + intermediate_size: int = 3072 + projection_dim: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 8 + max_position_embeddings: int = 512 + hidden_act: str = "gelu" + layer_norm_eps: float = 1e-12 + hidden_dropout_prob: float = 0.0 + attention_probs_dropout_prob: float = 0.0 + initializer_range: float = 0.02 + bos_token_id: int | None = 30522 + eos_token_id: int | list[int] | None = 2 + pad_token_id: int | None = 0 + sep_token_id: int | None = 102 + is_decoder: bool = True + use_cache: bool = True + label_smoothing: float = 0.0 @auto_docstring(checkpoint="Salesforce/blip-vqa-base") +@strict(accept_kwargs=True) class 
BlipVisionConfig(PreTrainedConfig): r""" Example: @@ -115,37 +93,21 @@ class BlipVisionConfig(PreTrainedConfig): model_type = "blip_vision_model" base_config_key = "vision_config" - def __init__( - self, - hidden_size=768, - intermediate_size=3072, - projection_dim=512, - num_hidden_layers=12, - num_attention_heads=12, - image_size=384, - patch_size=16, - hidden_act="gelu", - layer_norm_eps=1e-5, - attention_dropout=0.0, - initializer_range=1e-10, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.projection_dim = projection_dim - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.patch_size = patch_size - self.image_size = image_size - self.initializer_range = initializer_range - self.attention_dropout = attention_dropout - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act + hidden_size: int = 768 + intermediate_size: int = 3072 + projection_dim: int = 512 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + image_size: int | list[int] | tuple[int, int] = 384 + patch_size: int | list[int] | tuple[int, int] = 16 + hidden_act: str = "gelu" + layer_norm_eps: float = 1e-5 + attention_dropout: float | int = 0.0 + initializer_range: float = 1e-10 @auto_docstring(checkpoint="Salesforce/blip-vqa-base") +@strict(accept_kwargs=True) class BlipConfig(PreTrainedConfig): r""" label_smoothing (float, *optional*): @@ -181,42 +143,32 @@ class BlipConfig(PreTrainedConfig): model_type = "blip" sub_configs = {"text_config": BlipTextConfig, "vision_config": BlipVisionConfig} - def __init__( - self, - text_config=None, - vision_config=None, - projection_dim=512, - logit_scale_init_value=2.6592, - image_text_hidden_size=256, - label_smoothing=0.0, - tie_word_embeddings=True, - **kwargs, - ): - if text_config is None: - text_config = BlipTextConfig() + text_config: dict | PreTrainedConfig | None = None + vision_config: dict | PreTrainedConfig | None = None + projection_dim: int = 512 + logit_scale_init_value: float = 2.6592 + image_text_hidden_size: int = 256 + label_smoothing: float = 0.0 + tie_word_embeddings: bool = True + initializer_factor: float = 1.0 + initializer_range: float = 0.02 + + def __post_init__(self, **kwargs): + if self.text_config is None: + self.text_config = BlipTextConfig() logger.info("`text_config` is `None`. Initializing the `BlipTextConfig` with default values.") - elif isinstance(text_config, dict): - text_config = BlipTextConfig(**text_config) + elif isinstance(self.text_config, dict): + self.text_config = BlipTextConfig(**self.text_config) - if vision_config is None: - vision_config = BlipVisionConfig() + if self.vision_config is None: + self.vision_config = BlipVisionConfig() logger.info("`vision_config` is `None`. 
initializing the `BlipVisionConfig` with default values.") - elif isinstance(vision_config, dict): - vision_config = BlipVisionConfig(**vision_config) - - self.text_config = text_config - self.vision_config = vision_config + elif isinstance(self.vision_config, dict): + self.vision_config = BlipVisionConfig(**self.vision_config) self.text_config.encoder_hidden_size = self.vision_config.hidden_size - self.projection_dim = projection_dim - self.logit_scale_init_value = logit_scale_init_value - self.initializer_factor = 1.0 - self.initializer_range = 0.02 - self.image_text_hidden_size = image_text_hidden_size - self.label_smoothing = label_smoothing - self.tie_word_embeddings = tie_word_embeddings - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["BlipConfig", "BlipTextConfig", "BlipVisionConfig"] diff --git a/src/transformers/models/blip_2/configuration_blip_2.py b/src/transformers/models/blip_2/configuration_blip_2.py index 8e7777df00aa..7fc00254cb4d 100644 --- a/src/transformers/models/blip_2/configuration_blip_2.py +++ b/src/transformers/models/blip_2/configuration_blip_2.py @@ -13,6 +13,8 @@ # limitations under the License. """BLIP-2 model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES from ...utils import auto_docstring, logging @@ -23,6 +25,7 @@ @auto_docstring(checkpoint="Salesforce/blip2-opt-2.7b") +@strict(accept_kwargs=True) class Blip2VisionConfig(PreTrainedConfig): r""" Example: @@ -43,37 +46,21 @@ class Blip2VisionConfig(PreTrainedConfig): model_type = "blip_2_vision_model" base_config_key = "vision_config" - def __init__( - self, - hidden_size=1408, - intermediate_size=6144, - num_hidden_layers=39, - num_attention_heads=16, - image_size=224, - patch_size=14, - hidden_act="gelu", - layer_norm_eps=1e-6, - attention_dropout=0.0, - initializer_range=1e-10, - qkv_bias=True, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.patch_size = patch_size - self.image_size = image_size - self.initializer_range = initializer_range - self.attention_dropout = attention_dropout - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act - self.qkv_bias = qkv_bias + hidden_size: int = 1408 + intermediate_size: int = 6144 + num_hidden_layers: int = 39 + num_attention_heads: int = 16 + image_size: int | list[int] | tuple[int, int] = 224 + patch_size: int | list[int] | tuple[int, int] = 14 + hidden_act: str = "gelu" + layer_norm_eps: float = 1e-6 + attention_dropout: float | int = 0.0 + initializer_range: float = 1e-10 + qkv_bias: bool = True @auto_docstring(checkpoint="Salesforce/blip2-opt-2.7b") +@strict(accept_kwargs=True) class Blip2QFormerConfig(PreTrainedConfig): r""" cross_attention_frequency (`int`, *optional*, defaults to 2): @@ -98,45 +85,25 @@ class Blip2QFormerConfig(PreTrainedConfig): model_type = "blip_2_qformer" base_config_key = "qformer_config" - def __init__( - self, - vocab_size=30522, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - initializer_range=0.02, - layer_norm_eps=1e-12, - pad_token_id=0, - cross_attention_frequency=2, - encoder_hidden_size=1408, - 
use_qformer_text_input=False, - **kwargs, - ): - super().__init__(**kwargs) - self.pad_token_id = pad_token_id - - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.cross_attention_frequency = cross_attention_frequency - self.encoder_hidden_size = encoder_hidden_size - self.use_qformer_text_input = use_qformer_text_input + vocab_size: int = 30522 + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.1 + attention_probs_dropout_prob: float = 0.1 + max_position_embeddings: int = 512 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + pad_token_id: int | None = 0 + cross_attention_frequency: int = 2 + encoder_hidden_size: int = 1408 + use_qformer_text_input: bool = False @auto_docstring(checkpoint="Salesforce/blip2-opt-2.7b") +@strict(accept_kwargs=True) class Blip2Config(PreTrainedConfig): r""" qformer_config (`dict`, *optional*): @@ -182,49 +149,39 @@ class Blip2Config(PreTrainedConfig): } sub_configs = {"text_config": AutoConfig, "qformer_config": Blip2QFormerConfig, "vision_config": Blip2VisionConfig} - def __init__( - self, - vision_config=None, - qformer_config=None, - text_config=None, - num_query_tokens=32, - image_text_hidden_size=256, - image_token_index=None, - **kwargs, - ): - if text_config is None: - text_config = CONFIG_MAPPING["opt"]() + vision_config: dict | PreTrainedConfig | None = None + qformer_config: dict | PreTrainedConfig | None = None + text_config: dict | PreTrainedConfig | None = None + num_query_tokens: int = 32 + image_text_hidden_size: int = 256 + image_token_index: int | None = None + initializer_factor: float = 1.0 + initializer_range: float = 0.02 + + def __post_init__(self, **kwargs): + if self.text_config is None: + self.text_config = CONFIG_MAPPING["opt"]() logger.info("text_config is None. Initializing the text config with default values (`OPTConfig`).") - elif isinstance(text_config, dict): - text_model_type = text_config.get("model_type", "opt") - text_config = CONFIG_MAPPING[text_model_type](**text_config) + elif isinstance(self.text_config, dict): + text_model_type = self.text_config.get("model_type", "opt") + self.text_config = CONFIG_MAPPING[text_model_type](**self.text_config) - if qformer_config is None: - qformer_config = Blip2QFormerConfig() + if self.qformer_config is None: + self.qformer_config = Blip2QFormerConfig() logger.info("qformer_config is None. Initializing the Blip2QFormerConfig with default values.") - elif isinstance(qformer_config, dict): - qformer_config = Blip2QFormerConfig(**qformer_config) + elif isinstance(self.qformer_config, dict): + self.qformer_config = Blip2QFormerConfig(**self.qformer_config) - if vision_config is None: - vision_config = Blip2VisionConfig() + if self.vision_config is None: + self.vision_config = Blip2VisionConfig() logger.info("`vision_config` is `None`. 
initializing the `Blip2VisionConfig` with default values.") - elif isinstance(vision_config, dict): - vision_config = Blip2VisionConfig(**vision_config) - - self.text_config = text_config - self.vision_config = vision_config - self.qformer_config = qformer_config + elif isinstance(self.vision_config, dict): + self.vision_config = Blip2VisionConfig(**self.vision_config) - self.num_query_tokens = num_query_tokens - self.image_text_hidden_size = image_text_hidden_size - self.image_token_index = image_token_index self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size self.use_decoder_only_language_model = self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES - self.initializer_factor = 1.0 - self.initializer_range = 0.02 - kwargs["is_encoder_decoder"] = self.text_config.is_encoder_decoder - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["Blip2Config", "Blip2QFormerConfig", "Blip2VisionConfig"] diff --git a/src/transformers/models/blip_2/modeling_blip_2.py b/src/transformers/models/blip_2/modeling_blip_2.py index 135000bfbe5f..c5c022d39066 100644 --- a/src/transformers/models/blip_2/modeling_blip_2.py +++ b/src/transformers/models/blip_2/modeling_blip_2.py @@ -2036,7 +2036,7 @@ def forward( 44.7% that image 0 is 'a photo of a dog' ``` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states diff --git a/src/transformers/models/bloom/configuration_bloom.py b/src/transformers/models/bloom/configuration_bloom.py index cbc362dd7af8..1469bbca3eef 100644 --- a/src/transformers/models/bloom/configuration_bloom.py +++ b/src/transformers/models/bloom/configuration_bloom.py @@ -13,14 +13,14 @@ # limitations under the License. 
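A minimal usage sketch of the converted Blip2Config (illustrative only, not part of the patch), relying solely on the `__post_init__` behaviour visible in the hunks above: dict sub-configs are promoted to config objects, the Q-Former's `encoder_hidden_size` is re-tied to the vision hidden size, and `use_decoder_only_language_model` is derived from the default OPT text config.

from transformers import Blip2Config

# vision config passed as a plain dict; qformer/text left as None so defaults are built
config = Blip2Config(vision_config={"hidden_size": 1408}, qformer_config=None, text_config=None)
assert config.qformer_config.encoder_hidden_size == config.vision_config.hidden_size  # re-tied in __post_init__
assert config.use_decoder_only_language_model  # default text config is OPT, a decoder-only LM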
"""Bloom configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="bigscience/bloom") +@strict(accept_kwargs=True) class BloomConfig(PreTrainedConfig): r""" apply_residual_connection_post_layernorm (`bool`, *optional*, defaults to `False`): @@ -55,47 +55,28 @@ class BloomConfig(PreTrainedConfig): "num_attention_heads": "n_head", } - def __init__( - self, - vocab_size=250880, - hidden_size=64, - n_layer=2, - n_head=8, - layer_norm_epsilon=1e-5, - initializer_range=0.02, - use_cache=True, - bos_token_id=1, - eos_token_id=2, - pad_token_id=None, - apply_residual_connection_post_layernorm=False, - hidden_dropout=0.0, - attention_dropout=0.0, - pretraining_tp=1, # TP rank used when training with megatron - slow_but_exact=False, - tie_word_embeddings=True, - **kwargs, - ): - self.vocab_size = vocab_size + vocab_size: int = 250880 + hidden_size: int = 64 + n_layer: int = 2 + n_head: int = 8 + layer_norm_epsilon: float = 1e-5 + initializer_range: float = 0.02 + use_cache: bool = True + bos_token_id: int | None = 1 + eos_token_id: int | list[int] | None = 2 + pad_token_id: int | None = None + apply_residual_connection_post_layernorm: bool = False + hidden_dropout: float | int = 0.0 + attention_dropout: float | int = 0.0 + pretraining_tp: int = 1 # TP rank used when training with megatro + slow_but_exact: bool = False + tie_word_embeddings: bool = True + + def __post_init__(self, **kwargs): # Backward compatibility with n_embed kwarg n_embed = kwargs.pop("n_embed", None) - self.hidden_size = hidden_size if n_embed is None else n_embed - self.n_layer = n_layer - self.n_head = n_head - self.layer_norm_epsilon = layer_norm_epsilon - self.initializer_range = initializer_range - self.use_cache = use_cache - self.pretraining_tp = pretraining_tp - self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm - self.hidden_dropout = hidden_dropout - self.attention_dropout = attention_dropout - - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.slow_but_exact = slow_but_exact - self.tie_word_embeddings = tie_word_embeddings - - super().__init__(**kwargs) + self.hidden_size = self.hidden_size if n_embed is None else n_embed + super().__post_init__(**kwargs) __all__ = ["BloomConfig"] diff --git a/src/transformers/models/bloom/modeling_bloom.py b/src/transformers/models/bloom/modeling_bloom.py index 65ced8ef5676..dad3dfd0ac13 100644 --- a/src/transformers/models/bloom/modeling_bloom.py +++ b/src/transformers/models/bloom/modeling_bloom.py @@ -472,7 +472,7 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -635,7 +635,7 @@ def forward( `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict transformer_outputs = self.transformer( input_ids, @@ -730,7 +730,7 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict transformer_outputs = self.transformer( input_ids, @@ -854,7 +854,7 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict transformer_outputs = self.transformer( input_ids, @@ -929,7 +929,7 @@ def forward( [What are input IDs?](../glossary#input-ids) """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.transformer( input_ids, diff --git a/src/transformers/models/blt/configuration_blt.py b/src/transformers/models/blt/configuration_blt.py index 20891a0a905a..90e338082371 100644 --- a/src/transformers/models/blt/configuration_blt.py +++ b/src/transformers/models/blt/configuration_blt.py @@ -13,6 +13,8 @@ # limitations under the License. """Blt model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring, logging @@ -22,6 +24,7 @@ @auto_docstring(checkpoint="itazap/blt-1b-hf") +@strict(accept_kwargs=True) class BltLocalEncoderConfig(PreTrainedConfig): r""" cross_attn_k (`int`, *optional*, defaults to 2): @@ -35,48 +38,31 @@ class BltLocalEncoderConfig(PreTrainedConfig): model_type = "blt_local_encoder" default_theta = 500000.0 - def __init__( - self, - vocab_size: int | None = 260, - cross_attn_all_layers: bool | None = False, - cross_attn_k: int | None = 2, - hidden_size_global: int | None = 2048, - hidden_size: int | None = 1024, - num_attention_heads: int | None = 16, - num_key_value_heads: int | None = None, - num_hidden_layers: int | None = 1, - rms_norm_eps: float | None = 1e-5, - dropout: float | None = 0.0, - max_position_embeddings: int | None = 24576, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - hidden_act: str | None = "silu", - intermediate_size: int | None = 2816, - initializer_range: float | None = 0.02, - **kwargs, - ): - self.vocab_size = vocab_size - self.cross_attn_all_layers = cross_attn_all_layers - self.cross_attn_k = cross_attn_k - self.hidden_size_global = hidden_size_global - self.hidden_size = hidden_size - self.num_attention_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads or num_attention_heads - self.head_dim = hidden_size // num_attention_heads - self.intermediate_size = intermediate_size or int(8 * hidden_size / 3) - self.num_hidden_layers = num_hidden_layers - self.rms_norm_eps = rms_norm_eps - self.dropout = dropout - self.max_position_embeddings = max_position_embeddings - self.hidden_act = 
hidden_act - self.initializer_range = initializer_range - self.rope_parameters = rope_parameters - - # Remove tie_word_embeddings from kwargs to avoid duplicate parameter error - kwargs.pop("tie_word_embeddings", None) - super().__init__(**kwargs, tie_word_embeddings=False) + vocab_size: int = 260 + cross_attn_all_layers: bool | None = False + cross_attn_k: int | None = 2 + hidden_size_global: int | None = 2048 + hidden_size: int = 1024 + num_attention_heads: int = 16 + num_key_value_heads: int | None = None + num_hidden_layers: int = 1 + rms_norm_eps: float = 1e-5 + dropout: float | int | None = 0.0 + max_position_embeddings: int = 24576 + rope_parameters: RopeParameters | dict | None = None + hidden_act: str = "silu" + intermediate_size: int | None = None + initializer_range: float = 0.02 + + def __post_init__(self, **kwargs): + self.num_key_value_heads = self.num_key_value_heads or self.num_attention_heads + self.intermediate_size = self.intermediate_size or int(8 * self.hidden_size / 3) + self.tie_word_embeddings = False + super().__post_init__(**kwargs) @auto_docstring(checkpoint="itazap/blt-1b-hf") +@strict(accept_kwargs=True) class BltLocalDecoderConfig(PreTrainedConfig): r""" cross_attn_k (`int`, *optional*, defaults to 2): @@ -90,130 +76,92 @@ class BltLocalDecoderConfig(PreTrainedConfig): model_type = "blt_local_decoder" default_theta = 500000.0 - def __init__( - self, - vocab_size: int | None = 260, - cross_attn_all_layers: bool | None = True, - cross_attn_k: int | None = 2, - hidden_size_global: int | None = 2048, - hidden_size: int | None = 1024, - num_attention_heads: int | None = 16, - num_key_value_heads: int | None = None, - num_hidden_layers: int | None = 9, - rms_norm_eps: float | None = 1e-5, - dropout: float | None = 0.0, - max_position_embeddings: int | None = 24576, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - hidden_act: str | None = "silu", - intermediate_size: int | None = 2816, - initializer_range: float | None = 0.02, - pad_token_id: int | None = None, - bos_token_id: int | None = None, - eos_token_id: int | None = None, - tie_word_embeddings: bool | None = False, - **kwargs, - ): - self.vocab_size = vocab_size - self.cross_attn_all_layers = cross_attn_all_layers - self.cross_attn_k = cross_attn_k - self.hidden_size_global = hidden_size_global - self.hidden_size = hidden_size - self.num_attention_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads or num_attention_heads - self.head_dim = hidden_size // num_attention_heads - self.intermediate_size = intermediate_size or int(8 * hidden_size / 3) - self.num_hidden_layers = num_hidden_layers - self.rms_norm_eps = rms_norm_eps - self.dropout = dropout - self.max_position_embeddings = max_position_embeddings - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id + vocab_size: int = 260 + cross_attn_all_layers: bool | None = True + cross_attn_k: int | None = 2 + hidden_size_global: int | None = 2048 + hidden_size: int = 1024 + num_attention_heads: int = 16 + num_key_value_heads: int | None = None + num_hidden_layers: int = 9 + rms_norm_eps: float = 1e-5 + dropout: float | int | None = 0.0 + max_position_embeddings: int = 24576 + rope_parameters: RopeParameters | dict | None = None + hidden_act: str = "silu" + intermediate_size: int = 2816 + initializer_range: float = 0.02 + pad_token_id: int | None = None + bos_token_id: int | None = 
None + eos_token_id: int | list[int] | None = None + tie_word_embeddings: bool = False + + def __post_init__(self, **kwargs): + self.num_key_value_heads = self.num_key_value_heads or self.num_attention_heads + self.head_dim = self.hidden_size // self.num_attention_heads + self.intermediate_size = self.intermediate_size or int(8 * self.hidden_size / 3) self.tie_word_embeddings = False # Force-set to False for BC - self.rope_parameters = rope_parameters - - super().__init__(**kwargs) + super().__post_init__(**kwargs) @auto_docstring(checkpoint="itazap/blt-1b-hf") +@strict(accept_kwargs=True) class BltGlobalTransformerConfig(PreTrainedConfig): model_type = "blt_global_transformer" default_theta = 500000.0 - def __init__( - self, - hidden_size: int | None = 2048, - num_attention_heads: int | None = 16, - num_key_value_heads: int | None = None, - num_hidden_layers: int | None = 25, - rms_norm_eps: float | None = 1e-5, - dropout: float | None = 0.0, - max_position_embeddings: int | None = 4096, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - hidden_act: str | None = "silu", - intermediate_size: int | None = 5632, - initializer_range: float | None = 0.02, - tie_word_embeddings: bool | None = False, - **kwargs, - ): - self.hidden_size = hidden_size - self.num_attention_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads or num_attention_heads - self.head_dim = hidden_size // num_attention_heads - self.intermediate_size = intermediate_size or int(8 * hidden_size / 3) - self.num_hidden_layers = num_hidden_layers - self.rms_norm_eps = rms_norm_eps - self.dropout = dropout - self.max_position_embeddings = max_position_embeddings - self.hidden_act = hidden_act - self.initializer_range = initializer_range + hidden_size: int = 2048 + num_attention_heads: int = 16 + num_key_value_heads: int | None = None + num_hidden_layers: int = 25 + rms_norm_eps: float = 1e-5 + dropout: float | int | None = 0.0 + max_position_embeddings: int = 4096 + rope_parameters: RopeParameters | dict | None = None + hidden_act: str = "silu" + intermediate_size: int = 5632 + initializer_range: float = 0.02 + tie_word_embeddings: bool = False + + def __post_init__(self, **kwargs): + self.num_key_value_heads = self.num_key_value_heads or self.num_attention_heads + self.head_dim = self.hidden_size // self.num_attention_heads + self.intermediate_size = self.intermediate_size or int(8 * self.hidden_size / 3) self.tie_word_embeddings = False - self.rope_parameters = rope_parameters - super().__init__(**kwargs) + super().__post_init__(**kwargs) @auto_docstring(checkpoint="itazap/blt-1b-hf") +@strict(accept_kwargs=True) class BltPatcherConfig(PreTrainedConfig): model_type = "blt_patcher" - def __init__( - self, - vocab_size: int | None = 260, - hidden_size: int | None = 768, - num_hidden_layers: int | None = 14, - num_attention_heads: int | None = 12, - num_key_value_heads: int | None = None, - max_position_embeddings: int | None = 8192, - rms_norm_eps: float | None = 1e-5, - dropout: float | None = 0.0, - intermediate_size: int | None = 2048, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - initializer_range: float | None = 0.02, - tie_word_embeddings: bool | None = False, - **kwargs, - ): - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.head_dim = hidden_size // num_attention_heads - self.num_key_value_heads = num_key_value_heads if 
num_key_value_heads is not None else num_attention_heads - self.max_position_embeddings = max_position_embeddings - self.rms_norm_eps = rms_norm_eps - self.dropout = dropout + vocab_size: int = 260 + hidden_size: int = 768 + num_hidden_layers: int = 14 + num_attention_heads: int = 12 + num_key_value_heads: int | None = None + max_position_embeddings: int = 8192 + rms_norm_eps: float = 1e-5 + dropout: float | int | None = 0.0 + intermediate_size: int = 2048 + rope_parameters: RopeParameters | dict | None = None + initializer_range: float = 0.02 + tie_word_embeddings: bool = False + + def __post_init__(self, **kwargs): + self.num_key_value_heads = self.num_key_value_heads or self.num_attention_heads + self.head_dim = self.hidden_size // self.num_attention_heads + self.intermediate_size = self.intermediate_size or int(8 * self.hidden_size / 3) + self.tie_word_embeddings = False self.hidden_act = "silu" # Blt uses silu activation - self.intermediate_size = intermediate_size or int(8 * self.hidden_size / 3) - self.initializer_range = initializer_range - self.rope_parameters = rope_parameters - self.tie_word_embeddings = False - super().__init__(**kwargs) + super().__post_init__(**kwargs) @auto_docstring(checkpoint="itazap/blt-1b-hf") +@strict(accept_kwargs=True) class BltConfig(PreTrainedConfig): r""" patch_in_forward (`bool`, *optional*, defaults to `True`): @@ -265,93 +213,60 @@ class BltConfig(PreTrainedConfig): "global_config": BltGlobalTransformerConfig, } - def __init__( - self, - vocab_size: int | None = 260, - max_position_embeddings: int | None = 4096, - patch_in_forward: bool | None = True, - patch_size: int | None = 4, - patching_mode: str | None = "entropy", - patching_threshold: float | None = 1.335442066192627, - patching_batch_size: int | None = 1, - max_patch_length: int | None = None, - cross_attn_k: int | None = 2, - encoder_hash_byte_group_size: int | None = None, - encoder_hash_byte_group_vocab: int | None = 500002, - encoder_hash_byte_group_nb_functions: int | None = 1, - patcher_config: dict | None = None, - encoder_config: dict | None = None, - decoder_config: dict | None = None, - global_config: dict | None = None, - tie_word_embeddings: bool | None = False, - pad_token_id: int | None = None, - bos_token_id: int | None = None, - eos_token_id: int | None = None, - initializer_range: float | None = 0.02, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - **kwargs, - ): - # Basic model configuration - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.initializer_range = initializer_range - - # Patching configuration - self.patch_in_forward = patch_in_forward - self.patch_size = patch_size - self.patching_mode = patching_mode - self.patching_threshold = patching_threshold - self.patching_batch_size = patching_batch_size - self.max_patch_length = max_patch_length - self.patching_device = kwargs.get("patching_device", "cuda") - self.realtime_patching = kwargs.get("realtime_patching", True) - self.patching_threshold_add = kwargs.get("patching_threshold_add") - self.monotonicity = kwargs.get("monotonicity", False) - - # Cross attention configurations - self.cross_attn_k = cross_attn_k - - # Encoder configurations - self.encoder_hash_byte_group_size = encoder_hash_byte_group_size or [3, 4, 5, 6, 7, 8] - self.encoder_hash_byte_group_vocab = encoder_hash_byte_group_vocab - self.encoder_hash_byte_group_nb_functions = encoder_hash_byte_group_nb_functions + vocab_size: int = 260 + max_position_embeddings: int = 
4096 + patch_in_forward: bool | None = True + patch_size: int | None = 4 + patching_mode: str | None = "entropy" + patching_threshold: float | None = 1.335442066192627 + patching_batch_size: int | None = 1 + max_patch_length: int | None = None + cross_attn_k: int | None = 2 + encoder_hash_byte_group_size: list[int] | None = None + encoder_hash_byte_group_vocab: int | None = 500002 + encoder_hash_byte_group_nb_functions: int | None = 1 + patcher_config: dict | PreTrainedConfig | None = None + encoder_config: dict | PreTrainedConfig | None = None + decoder_config: dict | PreTrainedConfig | None = None + global_config: dict | PreTrainedConfig | None = None + tie_word_embeddings: bool = False + pad_token_id: int | None = None + bos_token_id: int | None = None + eos_token_id: int | list[int] | None = None + initializer_range: float = 0.02 + rope_parameters: RopeParameters | dict | None = None + + def __post_init__(self, **kwargs): + self.encoder_hash_byte_group_size = self.encoder_hash_byte_group_size or [3, 4, 5, 6, 7, 8] # Initialize component configurations - if patcher_config is None: - self.patcher_config = BltPatcherConfig(initializer_range=initializer_range) + if self.patcher_config is None: + self.patcher_config = BltPatcherConfig(initializer_range=self.initializer_range) logger.info("patcher_config is None, using default Blt patcher config") - elif isinstance(patcher_config, dict): - patcher_config.setdefault("initializer_range", initializer_range) - self.patcher_config = BltPatcherConfig(**patcher_config) - elif isinstance(patcher_config, BltPatcherConfig): - self.patcher_config = patcher_config - - if encoder_config is None: - self.encoder_config = BltLocalEncoderConfig(initializer_range=initializer_range) + elif isinstance(self.patcher_config, dict): + self.patcher_config.setdefault("initializer_range", self.initializer_range) + self.patcher_config = BltPatcherConfig(**self.patcher_config) + + if self.encoder_config is None: + self.encoder_config = BltLocalEncoderConfig(initializer_range=self.initializer_range) logger.info("encoder_config is None, using default Blt encoder config") - elif isinstance(encoder_config, dict): - encoder_config.setdefault("initializer_range", initializer_range) - self.encoder_config = BltLocalEncoderConfig(**encoder_config) - elif isinstance(encoder_config, BltLocalEncoderConfig): - self.encoder_config = encoder_config - - if decoder_config is None: - self.decoder_config = BltLocalDecoderConfig(initializer_range=initializer_range) + elif isinstance(self.encoder_config, dict): + self.encoder_config.setdefault("initializer_range", self.initializer_range) + self.encoder_config = BltLocalEncoderConfig(**self.encoder_config) + + if self.decoder_config is None: + self.decoder_config = BltLocalDecoderConfig(initializer_range=self.initializer_range) logger.info("decoder_config is None, using default Blt decoder config") - elif isinstance(decoder_config, dict): - decoder_config.setdefault("initializer_range", initializer_range) - self.decoder_config = BltLocalDecoderConfig(**decoder_config) - elif isinstance(decoder_config, BltLocalDecoderConfig): - self.decoder_config = decoder_config - - if global_config is None: - self.global_config = BltGlobalTransformerConfig(initializer_range=initializer_range) + elif isinstance(self.decoder_config, dict): + self.decoder_config.setdefault("initializer_range", self.initializer_range) + self.decoder_config = BltLocalDecoderConfig(**self.decoder_config) + + if self.global_config is None: + self.global_config = 
BltGlobalTransformerConfig(initializer_range=self.initializer_range) logger.info("global_config is None, using default Blt global config") - elif isinstance(global_config, dict): - global_config.setdefault("initializer_range", initializer_range) - self.global_config = BltGlobalTransformerConfig(**global_config) - elif isinstance(global_config, BltGlobalTransformerConfig): - self.global_config = global_config + elif isinstance(self.global_config, dict): + self.global_config.setdefault("initializer_range", self.initializer_range) + self.global_config = BltGlobalTransformerConfig(**self.global_config) # Determine if token embedding projection is needed based on dimension mismatch (7b) encoder_cross_output_size = self.encoder_config.hidden_size * self.cross_attn_k @@ -359,13 +274,7 @@ def __init__( encoder_cross_output_size if encoder_cross_output_size != self.global_config.hidden_size else None ) - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - self.rope_parameters = rope_parameters - - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = [ diff --git a/src/transformers/models/bridgetower/configuration_bridgetower.py b/src/transformers/models/bridgetower/configuration_bridgetower.py index 0a3fb33090d3..a181df45947c 100644 --- a/src/transformers/models/bridgetower/configuration_bridgetower.py +++ b/src/transformers/models/bridgetower/configuration_bridgetower.py @@ -13,6 +13,8 @@ # limitations under the License. """BridgeTower model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring, logging @@ -21,6 +23,7 @@ @auto_docstring(checkpoint="BridgeTower/bridgetower-base") +@strict(accept_kwargs=True) class BridgeTowerVisionConfig(PreTrainedConfig): r""" stop_gradient (`bool`, *optional*, defaults to `False`): @@ -45,34 +48,20 @@ class BridgeTowerVisionConfig(PreTrainedConfig): model_type = "bridgetower_vision_model" base_config_key = "vision_config" - def __init__( - self, - hidden_size=768, - num_hidden_layers=12, - num_channels=3, - patch_size=16, - image_size=288, - initializer_factor=1, - layer_norm_eps=1e-05, - stop_gradient=False, - share_layernorm=True, - remove_last_layer=False, - **kwargs, - ): - super().__init__(**kwargs) - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_channels = num_channels - self.patch_size = patch_size - self.image_size = image_size - self.initializer_factor = initializer_factor - self.layer_norm_eps = layer_norm_eps - self.stop_gradient = stop_gradient - self.share_layernorm = share_layernorm - self.remove_last_layer = remove_last_layer + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_channels: int = 3 + patch_size: int | list[int] | tuple[int, int] = 16 + image_size: int | list[int] | tuple[int, int] = 288 + initializer_factor: float | int = 1 + layer_norm_eps: float = 1e-05 + stop_gradient: bool = False + share_layernorm: bool = True + remove_last_layer: bool = False @auto_docstring(checkpoint="BridgeTower/bridgetower-base") +@strict(accept_kwargs=True) class BridgeTowerTextConfig(PreTrainedConfig): r""" Example: @@ -90,51 +79,28 @@ class BridgeTowerTextConfig(PreTrainedConfig): model_type = "bridgetower_text_model" base_config_key = "text_config" - def __init__( - self, - vocab_size=50265, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - initializer_factor=1, - 
intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=514, - type_vocab_size=1, - layer_norm_eps=1e-05, - pad_token_id=1, - bos_token_id=0, - eos_token_id=2, - use_cache=True, - is_decoder=False, - add_cross_attention=False, - **kwargs, - ): - super().__init__(**kwargs) - - self.is_decoder = is_decoder - self.add_cross_attention = add_cross_attention - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.initializer_factor = initializer_factor - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.layer_norm_eps = layer_norm_eps - self.use_cache = use_cache - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id + vocab_size: int = 50265 + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + initializer_factor: float | int = 1 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.1 + attention_probs_dropout_prob: float = 0.1 + max_position_embeddings: int = 514 + type_vocab_size: int = 1 + layer_norm_eps: float = 1e-05 + pad_token_id: int | None = 1 + bos_token_id: int | None = 0 + eos_token_id: int | list[int] | None = 2 + use_cache: bool = True + is_decoder: bool = False + add_cross_attention: bool = False @auto_docstring(checkpoint="BridgeTower/bridgetower-base") +@strict(accept_kwargs=True) class BridgeTowerConfig(PreTrainedConfig): r""" share_cross_modal_transformer_layers (`bool`, *optional*, defaults to `True`): @@ -164,55 +130,38 @@ class BridgeTowerConfig(PreTrainedConfig): model_type = "bridgetower" sub_configs = {"text_config": BridgeTowerTextConfig, "vision_config": BridgeTowerVisionConfig} - def __init__( - self, - share_cross_modal_transformer_layers=True, - hidden_act="gelu", - hidden_size=768, - initializer_factor=1, - layer_norm_eps=1e-05, - share_link_tower_layers=False, - link_tower_type="add", - num_attention_heads=12, - num_hidden_layers=6, - tie_word_embeddings=False, - init_layernorm_from_vision_encoder=False, - text_config=None, - vision_config=None, - **kwargs, - ): + share_cross_modal_transformer_layers: bool = True + hidden_act: str = "gelu" + hidden_size: int = 768 + initializer_factor: float | int = 1 + layer_norm_eps: float = 1e-05 + share_link_tower_layers: bool = False + link_tower_type: str = "add" + num_attention_heads: int = 12 + num_hidden_layers: int = 6 + tie_word_embeddings: bool = False + init_layernorm_from_vision_encoder: bool = False + text_config: dict | PreTrainedConfig | None = None + vision_config: dict | PreTrainedConfig | None = None + + def __post_init__(self, **kwargs): # TODO: remove this once the Hub files are updated. 
_ = kwargs.pop("text_config_dict", None) _ = kwargs.pop("vision_config_dict", None) - self.share_cross_modal_transformer_layers = share_cross_modal_transformer_layers - self.hidden_act = hidden_act - self.hidden_size = hidden_size - self.initializer_factor = initializer_factor - self.layer_norm_eps = layer_norm_eps - self.share_link_tower_layers = share_link_tower_layers - self.link_tower_type = link_tower_type - self.num_attention_heads = num_attention_heads - self.num_hidden_layers = num_hidden_layers - self.tie_word_embeddings = tie_word_embeddings - self.init_layernorm_from_vision_encoder = init_layernorm_from_vision_encoder - - if text_config is None: - text_config = BridgeTowerTextConfig() + if self.text_config is None: + self.text_config = BridgeTowerTextConfig() logger.info("`text_config` is `None`. initializing the `BridgeTowerTextConfig` with default values.") - elif isinstance(text_config, dict): - text_config = BridgeTowerTextConfig(**text_config) + elif isinstance(self.text_config, dict): + self.text_config = BridgeTowerTextConfig(**self.text_config) - if vision_config is None: - vision_config = BridgeTowerVisionConfig() + if self.vision_config is None: + self.vision_config = BridgeTowerVisionConfig() logger.info("`vision_config` is `None`. initializing the `BridgeTowerVisionConfig` with default values.") - elif isinstance(vision_config, dict): - vision_config = BridgeTowerVisionConfig(**vision_config) + elif isinstance(self.vision_config, dict): + self.vision_config = BridgeTowerVisionConfig(**self.vision_config) - self.text_config = text_config - self.vision_config = vision_config - self.tie_word_embeddings = tie_word_embeddings - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["BridgeTowerConfig", "BridgeTowerTextConfig", "BridgeTowerVisionConfig"] diff --git a/src/transformers/models/bros/configuration_bros.py b/src/transformers/models/bros/configuration_bros.py index eaff488e157e..3f5dc5a5c58d 100644 --- a/src/transformers/models/bros/configuration_bros.py +++ b/src/transformers/models/bros/configuration_bros.py @@ -13,14 +13,14 @@ # limitations under the License. 
"""Bros model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="jinho8345/bros-base-uncased") +@strict(accept_kwargs=True) class BrosConfig(PreTrainedConfig): r""" dim_bbox (`int`, *optional*, defaults to 8): @@ -47,53 +47,31 @@ class BrosConfig(PreTrainedConfig): model_type = "bros" - def __init__( - self, - vocab_size=30522, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=2, - initializer_range=0.02, - layer_norm_eps=1e-12, - pad_token_id=0, - dim_bbox=8, - bbox_scale=100.0, - n_relations=1, - classifier_dropout_prob=0.1, - is_decoder=False, - add_cross_attention=False, - **kwargs, - ): - super().__init__(**kwargs) + vocab_size: int = 30522 + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.1 + attention_probs_dropout_prob: float = 0.1 + max_position_embeddings: int = 512 + type_vocab_size: int = 2 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + pad_token_id: int | None = 0 + dim_bbox: int = 8 + bbox_scale: float = 100.0 + n_relations: int = 1 + classifier_dropout_prob: float = 0.1 + is_decoder: bool = False + add_cross_attention: bool = False - self.is_decoder = is_decoder - self.add_cross_attention = add_cross_attention - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.pad_token_id = pad_token_id - self.dim_bbox = dim_bbox - self.bbox_scale = bbox_scale - self.n_relations = n_relations + def __post_init__(self, **kwargs): self.dim_bbox_sinusoid_emb_2d = self.hidden_size // 4 self.dim_bbox_sinusoid_emb_1d = self.dim_bbox_sinusoid_emb_2d // self.dim_bbox self.dim_bbox_projection = self.hidden_size // self.num_attention_heads - self.classifier_dropout_prob = classifier_dropout_prob + super().__post_init__(**kwargs) __all__ = ["BrosConfig"] diff --git a/src/transformers/models/camembert/configuration_camembert.py b/src/transformers/models/camembert/configuration_camembert.py index c303c5864f2a..4e3eadf26e04 100644 --- a/src/transformers/models/camembert/configuration_camembert.py +++ b/src/transformers/models/camembert/configuration_camembert.py @@ -14,14 +14,14 @@ # limitations under the License. 
"""CamemBERT configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="almanach/camembert-base") +@strict(accept_kwargs=True) class CamembertConfig(PreTrainedConfig): r""" Example: @@ -41,50 +41,25 @@ class CamembertConfig(PreTrainedConfig): model_type = "camembert" - def __init__( - self, - vocab_size=30522, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=2, - initializer_range=0.02, - layer_norm_eps=1e-12, - pad_token_id=1, - bos_token_id=0, - eos_token_id=2, - use_cache=True, - classifier_dropout=None, - is_decoder=False, - add_cross_attention=False, - **kwargs, - ): - super().__init__(**kwargs) - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - - self.is_decoder = is_decoder - self.add_cross_attention = add_cross_attention - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.use_cache = use_cache - self.classifier_dropout = classifier_dropout + vocab_size: int = 30522 + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.1 + attention_probs_dropout_prob: float = 0.1 + max_position_embeddings: int = 512 + type_vocab_size: int = 2 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + pad_token_id: int | None = 1 + bos_token_id: int | None = 0 + eos_token_id: int | list[int] | None = 2 + use_cache: bool = True + classifier_dropout: float | int | None = None + is_decoder: bool = False + add_cross_attention: bool = False __all__ = ["CamembertConfig"] diff --git a/src/transformers/models/canine/configuration_canine.py b/src/transformers/models/canine/configuration_canine.py index 891915aa9e60..185eba28d3f0 100644 --- a/src/transformers/models/canine/configuration_canine.py +++ b/src/transformers/models/canine/configuration_canine.py @@ -13,14 +13,14 @@ # limitations under the License. 
"""CANINE model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="google/canine-s") +@strict(accept_kwargs=True) class CanineConfig(PreTrainedConfig): r""" downsampling_rate (`int`, *optional*, defaults to 4): @@ -54,52 +54,25 @@ class CanineConfig(PreTrainedConfig): model_type = "canine" - def __init__( - self, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=16384, - type_vocab_size=16, - initializer_range=0.02, - layer_norm_eps=1e-12, - pad_token_id=0, - bos_token_id=0xE000, - eos_token_id=0xE001, - downsampling_rate=4, - upsampling_kernel_size=4, - num_hash_functions=8, - num_hash_buckets=16384, - local_transformer_stride=128, # Good TPU/XLA memory alignment. - **kwargs, - ): - super().__init__(**kwargs) - - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.initializer_range = initializer_range - self.type_vocab_size = type_vocab_size - self.layer_norm_eps = layer_norm_eps - - # Character config: - self.downsampling_rate = downsampling_rate - self.upsampling_kernel_size = upsampling_kernel_size - self.num_hash_functions = num_hash_functions - self.num_hash_buckets = num_hash_buckets - self.local_transformer_stride = local_transformer_stride + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.1 + attention_probs_dropout_prob: float = 0.1 + max_position_embeddings: int = 16384 + type_vocab_size: int = 16 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + pad_token_id: int | None = 0 + bos_token_id: int | None = 0xE000 + eos_token_id: int | list[int] | None = 0xE001 + downsampling_rate: int = 4 + upsampling_kernel_size: int = 4 + num_hash_functions: int = 8 + num_hash_buckets: int = 16384 + local_transformer_stride: int = 128 # Good TPU/XLA memory alignment __all__ = ["CanineConfig"] diff --git a/src/transformers/models/canine/modeling_canine.py b/src/transformers/models/canine/modeling_canine.py index 1aed9699846c..01c251796eeb 100644 --- a/src/transformers/models/canine/modeling_canine.py +++ b/src/transformers/models/canine/modeling_canine.py @@ -848,7 +848,7 @@ def forward( ) all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -1019,7 +1019,7 @@ def forward( config.num_labels - 1]`. 
If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.canine( input_ids, @@ -1127,7 +1127,7 @@ def forward( num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above) """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None @@ -1235,7 +1235,7 @@ def forward( >>> loss = model(**inputs, labels=labels).loss >>> round(loss.item(), 2) # doctest: +SKIP ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.canine( input_ids, @@ -1297,7 +1297,7 @@ def forward( return_dict: bool | None = None, **kwargs, ) -> tuple | QuestionAnsweringModelOutput: - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.canine( input_ids, diff --git a/src/transformers/models/chameleon/configuration_chameleon.py b/src/transformers/models/chameleon/configuration_chameleon.py index 05e27f73d8e5..e8f0750d7387 100644 --- a/src/transformers/models/chameleon/configuration_chameleon.py +++ b/src/transformers/models/chameleon/configuration_chameleon.py @@ -13,6 +13,8 @@ # limitations under the License. 
"""chameleon model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring, logging @@ -22,6 +24,7 @@ @auto_docstring(checkpoint="facebook/chameleon-7b") +@strict(accept_kwargs=True) class ChameleonVQVAEConfig(PreTrainedConfig): r""" base_channels (`int`, *optional*, defaults to 128): @@ -43,40 +46,23 @@ class ChameleonVQVAEConfig(PreTrainedConfig): model_type = "chameleon_vqgan" base_config_key = "vq_config" - def __init__( - self, - embed_dim: int = 256, - num_embeddings: int = 8192, - double_latent: bool = False, - latent_channels: int = 256, - resolution: int = 512, - in_channels: int = 3, - base_channels: int = 128, - channel_multiplier: list[int] = [1, 1, 2, 2, 4], - num_res_blocks: int = 2, - attn_resolutions: list[int] | None = None, - dropout: float = 0.0, - attn_type: str = "vanilla", - initializer_range=0.02, - **kwargs, - ): - super().__init__(**kwargs) - self.embed_dim = embed_dim - self.num_embeddings = num_embeddings - self.double_latent = double_latent - self.latent_channels = latent_channels - self.resolution = resolution - self.in_channels = in_channels - self.base_channels = base_channels - self.channel_multiplier = channel_multiplier - self.num_res_blocks = num_res_blocks - self.attn_resolutions = attn_resolutions - self.dropout = dropout - self.attn_type = attn_type - self.initializer_range = initializer_range + embed_dim: int = 256 + num_embeddings: int = 8192 + double_latent: bool = False + latent_channels: int = 256 + resolution: int = 512 + in_channels: int = 3 + base_channels: int = 128 + channel_multiplier: list[int] | tuple[int, ...] = (1, 1, 2, 2, 4) + num_res_blocks: int = 2 + attn_resolutions: list[int] | None = None + dropout: float | int = 0.0 + attn_type: str = "vanilla" + initializer_range = 0.02 @auto_docstring(checkpoint="facebook/chameleon-7b") +@strict(accept_kwargs=True) class ChameleonConfig(PreTrainedConfig): r""" model_parallel_size (`int`, *optional*, defaults to 1): @@ -104,66 +90,40 @@ class ChameleonConfig(PreTrainedConfig): sub_configs = {"vq_config": ChameleonVQVAEConfig} keys_to_ignore_at_inference = ["past_key_values"] - def __init__( - self, - vocab_size: int | None = 65536, - hidden_size: int | None = 4096, - intermediate_size: int | None = 11008, - num_hidden_layers: int | None = 32, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = 32, - hidden_act: int | None = "silu", - max_position_embeddings: int | None = 4096, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-05, - use_cache: bool | None = True, - pad_token_id: int | None = None, - bos_token_id: int | None = 1, - eos_token_id: int | None = 2, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: int | None = False, - attention_dropout: float | None = 0.0, - model_parallel_size: int | None = 1, - swin_norm: bool | None = False, - vq_config: dict | None = None, - vocabulary_map: dict | None = None, - mlp_bias: bool | None = False, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.mlp_bias = mlp_bias - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = 
hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.model_parallel_size = model_parallel_size - self.swin_norm = swin_norm - self.rope_parameters = rope_parameters - - if vq_config is None: - vq_config = {} + vocab_size: int = 65536 + hidden_size: int = 4096 + intermediate_size: int = 11008 + num_hidden_layers: int = 32 + num_attention_heads: int = 32 + num_key_value_heads: int | None = 32 + hidden_act: str = "silu" + max_position_embeddings: int = 4096 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-05 + use_cache: bool = True + pad_token_id: int | None = None + bos_token_id: int | None = 1 + eos_token_id: int | list[int] | None = 2 + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + attention_bias: int | None = False + attention_dropout: float | int | None = 0.0 + model_parallel_size: int | None = 1 + swin_norm: bool | None = False + vq_config: dict | PreTrainedConfig | None = None + vocabulary_map: dict | None = None + mlp_bias: bool = False + + def __post_init__(self, **kwargs): + if self.vq_config is None: logger.info("vq_config is None. initializing the ChameleonVQConfig with default values.") + self.vq_config = ChameleonVQVAEConfig() + elif isinstance(self.vq_config, dict): + self.vq_config = ChameleonVQVAEConfig(**self.vq_config) - self.vq_config = ChameleonVQVAEConfig(**vq_config) - - self.vocabulary_map = vocabulary_map - self.image_token_id = vocabulary_map.get("") if vocabulary_map is not None else None + self.image_token_id = self.vocabulary_map.get("") if self.vocabulary_map is not None else None - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["ChameleonConfig", "ChameleonVQVAEConfig"] diff --git a/src/transformers/models/chinese_clip/configuration_chinese_clip.py b/src/transformers/models/chinese_clip/configuration_chinese_clip.py index 875fc242e355..448167f07be0 100644 --- a/src/transformers/models/chinese_clip/configuration_chinese_clip.py +++ b/src/transformers/models/chinese_clip/configuration_chinese_clip.py @@ -13,6 +13,8 @@ # limitations under the License. 
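A minimal usage sketch of the converted ChameleonConfig (illustrative only, not part of the patch; the "<image>" vocabulary key and the token id 8711 are assumed example values, not taken from the diff): a dict `vq_config` is promoted to a ChameleonVQVAEConfig and `image_token_id` is looked up from `vocabulary_map` in `__post_init__`.

from transformers import ChameleonConfig, ChameleonVQVAEConfig

config = ChameleonConfig(vq_config={"embed_dim": 256}, vocabulary_map={"<image>": 8711})
assert isinstance(config.vq_config, ChameleonVQVAEConfig)  # dict promoted in __post_init__
assert config.image_token_id == 8711                       # looked up from vocabulary_map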
"""Chinese-CLIP model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring, logging @@ -21,6 +23,7 @@ @auto_docstring(checkpoint="OFA-Sys/chinese-clip-vit-base-patch16") +@strict(accept_kwargs=True) class ChineseCLIPTextConfig(PreTrainedConfig): r""" type_vocab_size (`int`, *optional*, defaults to 2): @@ -44,46 +47,26 @@ class ChineseCLIPTextConfig(PreTrainedConfig): model_type = "chinese_clip_text_model" base_config_key = "text_config" - def __init__( - self, - vocab_size=30522, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=2, - initializer_range=0.02, - initializer_factor=1.0, - layer_norm_eps=1e-12, - pad_token_id=0, - bos_token_id=0, - eos_token_id=None, - **kwargs, - ): - super().__init__(**kwargs) - self.bos_token_id = bos_token_id - self.pad_token_id = pad_token_id - self.eos_token_id = eos_token_id - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range - self.initializer_factor = initializer_factor - self.layer_norm_eps = layer_norm_eps + vocab_size: int = 30522 + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.1 + attention_probs_dropout_prob: float = 0.1 + max_position_embeddings: int = 512 + type_vocab_size: int = 2 + initializer_range: float = 0.02 + initializer_factor: float = 1.0 + layer_norm_eps: float = 1e-12 + pad_token_id: int | None = 0 + bos_token_id: int | None = 0 + eos_token_id: int | list[int] | None = None @auto_docstring(checkpoint="OFA-Sys/chinese-clip-vit-base-patch16") +@strict(accept_kwargs=True) class ChineseCLIPVisionConfig(PreTrainedConfig): r""" Example: @@ -103,41 +86,23 @@ class ChineseCLIPVisionConfig(PreTrainedConfig): model_type = "chinese_clip_vision_model" base_config_key = "vision_config" - def __init__( - self, - hidden_size=768, - intermediate_size=3072, - projection_dim=512, - num_hidden_layers=12, - num_attention_heads=12, - num_channels=3, - image_size=224, - patch_size=32, - hidden_act="quick_gelu", - layer_norm_eps=1e-5, - attention_dropout=0.0, - initializer_range=0.02, - initializer_factor=1.0, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.projection_dim = projection_dim - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_channels = num_channels - self.patch_size = patch_size - self.image_size = image_size - self.initializer_range = initializer_range - self.initializer_factor = initializer_factor - self.attention_dropout = attention_dropout - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act + hidden_size: int = 768 + intermediate_size: int = 3072 + projection_dim: int = 512 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + 
num_channels: int = 3 + image_size: int | list[int] | tuple[int, int] = 224 + patch_size: int | list[int] | tuple[int, int] = 32 + hidden_act: str = "quick_gelu" + layer_norm_eps: float = 1e-5 + attention_dropout: float | int = 0.0 + initializer_range: float = 0.02 + initializer_factor: float = 1.0 @auto_docstring(checkpoint="OFA-Sys/chinese-clip-vit-base-patch16") +@strict(accept_kwargs=True) class ChineseCLIPConfig(PreTrainedConfig): r""" Example: @@ -166,22 +131,38 @@ class ChineseCLIPConfig(PreTrainedConfig): model_type = "chinese_clip" sub_configs = {"text_config": ChineseCLIPTextConfig, "vision_config": ChineseCLIPVisionConfig} - def __init__( - self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs - ): - # If `_config_dict` exist, we use them for the backward compatibility. - # We pop out these 2 attributes before calling `super().__init__` to avoid them being saved (which causes a lot - # of confusion!). - text_config_dict = kwargs.pop("text_config_dict", None) - vision_config_dict = kwargs.pop("vision_config_dict", None) + text_config: dict | ChineseCLIPTextConfig | None = None + vision_config: dict | ChineseCLIPVisionConfig | None = None + projection_dim: int = 512 + logit_scale_init_value: float | int = 2.6592 + initializer_factor: float = 1.0 + initializer_range: float = 0.02 + + def __post_init__(self, **kwargs): + if self.text_config is None: + text_config = {} + logger.info("`text_config` is `None`. Initializing the `ChineseCLIPTextConfig` with default values.") + elif isinstance(self.text_config, ChineseCLIPTextConfig): + text_config = self.text_config.to_dict() + else: + text_config = self.text_config + + if self.vision_config is None: + vision_config = {} + logger.info("`vision_config` is `None`. initializing the `ChineseCLIPVisionConfig` with default values.") + elif isinstance(self.vision_config, ChineseCLIPVisionConfig): + vision_config = self.vision_config.to_dict() + else: + vision_config = self.vision_config + # For backward compatibility check keyword args # Instead of simply assigning `[text|vision]_config_dict` to `[text|vision]_config`, we use the values in # `[text|vision]_config_dict` to update the values in `[text|vision]_config`. The values should be same in most # cases, but we don't want to break anything regarding `_config_dict` that existed before commit `8827e1b2`. - if text_config_dict is not None: - if text_config is None: - text_config = {} + text_config_dict = kwargs.pop("text_config_dict", None) + vision_config_dict = kwargs.pop("vision_config_dict", None) + if text_config_dict is not None: # This is the complete result when using `text_config_dict`. _text_config_dict = ChineseCLIPTextConfig(**text_config_dict).to_dict() @@ -197,8 +178,8 @@ def __init__( # If inferred from default argument values (just to be super careful) else: message = ( - f"`text_config_dict` is provided which will be used to initialize `ChineseCLIPTextConfig`. " - f'The value `text_config["{key}"]` will be overridden.' + f"`text_config_dict` is provided which will be used to initialize `ChineseCLIPTextConfig`. The " + f'value `text_config["{key}"]` will be overridden.' ) logger.info(message) @@ -206,9 +187,6 @@ def __init__( text_config.update(_text_config_dict) if vision_config_dict is not None: - if vision_config is None: - vision_config = {} - # This is the complete result when using `vision_config_dict`. 
_vision_config_dict = ChineseCLIPVisionConfig(**vision_config_dict).to_dict() # convert keys to string instead of integer @@ -229,34 +207,19 @@ def __init__( # If inferred from default argument values (just to be super careful) else: message = ( - f"`vision_config_dict` is provided which will be used to initialize " - f'`ChineseCLIPVisionConfig`. The value `vision_config["{key}"]` will be overridden.' + f"`vision_config_dict` is provided which will be used to initialize `ChineseCLIPVisionConfig`. " + f'The value `vision_config["{key}"]` will be overridden.' ) logger.info(message) # Update all values in `vision_config` with the ones in `_vision_config_dict`. vision_config.update(_vision_config_dict) - if text_config is None: - text_config = ChineseCLIPTextConfig() - logger.info("`text_config` is `None`. initializing the `ChineseCLIPTextConfig` with default values.") - elif isinstance(text_config, dict): - text_config = ChineseCLIPTextConfig(**text_config) - - if vision_config is None: - vision_config = ChineseCLIPVisionConfig() - logger.info("`vision_config` is `None`. initializing the `ChineseCLIPVisionConfig` with default values.") - elif isinstance(vision_config, dict): - vision_config = ChineseCLIPVisionConfig(**vision_config) - - self.text_config = text_config - self.vision_config = vision_config + # Finally we can convert back our unified text/vision configs to `PretrainedConfig` + self.text_config = ChineseCLIPTextConfig(**text_config) + self.vision_config = ChineseCLIPVisionConfig(**vision_config) - self.projection_dim = projection_dim - self.logit_scale_init_value = logit_scale_init_value - self.initializer_factor = 1.0 - self.initializer_range = 0.02 - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["ChineseCLIPConfig", "ChineseCLIPTextConfig", "ChineseCLIPVisionConfig"] diff --git a/src/transformers/models/chmv2/configuration_chmv2.py b/src/transformers/models/chmv2/configuration_chmv2.py index 89cf37422074..7ee1c7c563d1 100644 --- a/src/transformers/models/chmv2/configuration_chmv2.py +++ b/src/transformers/models/chmv2/configuration_chmv2.py @@ -18,6 +18,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
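[Editor's sketch of the conversion recipe these hunks follow (Chameleon, Chinese-CLIP, and the files below): `ToyConfig` and its fields are invented for illustration; the decorator, the annotated class-level fields, and the `__post_init__` forwarding mirror what the diff does for each model config.]

# Hypothetical example of the pattern applied throughout this diff.
from huggingface_hub.dataclasses import strict

from transformers.configuration_utils import PreTrainedConfig


@strict(accept_kwargs=True)
class ToyConfig(PreTrainedConfig):
    model_type = "toy"

    # Former __init__ arguments become typed, defaulted class attributes.
    hidden_size: int = 64
    num_attention_heads: int = 4
    reassemble_factors: list[float] | None = None

    def __post_init__(self, **kwargs):
        # Post-processing that used to live in __init__ moves here.
        if self.reassemble_factors is None:
            self.reassemble_factors = [4, 2, 1, 0.5]
        super().__post_init__(**kwargs)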
+from typing import Literal + +from huggingface_hub.dataclasses import strict + from ...backbone_utils import consolidate_backbone_kwargs_to_config from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring @@ -25,6 +29,7 @@ @auto_docstring(checkpoint="facebook/dinov3-vitl16-chmv2-dpt-head") +@strict(accept_kwargs=True) class CHMv2Config(PreTrainedConfig): r""" backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*): @@ -66,27 +71,25 @@ class CHMv2Config(PreTrainedConfig): model_type = "chmv2" sub_configs = {"backbone_config": AutoConfig} - def __init__( - self, - backbone_config: dict | None = None, - patch_size: int | None = 16, - initializer_range: float | None = 0.02, - reassemble_factors: list[float] | None = None, - post_process_channels: list[int] | None = None, - fusion_hidden_size: int | None = 256, - head_hidden_size: int | None = 128, - number_output_channels: int | None = 256, - readout_type: str | None = "project", - min_depth: float | None = 0.001, - max_depth: float | None = 96.0, - bins_strategy: str | None = "chmv2_mixlog", - norm_strategy: str | None = "chmv2_mixlog", - **kwargs, - ): - if reassemble_factors is None: - reassemble_factors = [4, 2, 1, 0.5] - if post_process_channels is None: - post_process_channels = [128, 256, 512, 1024] + backbone_config: dict | PreTrainedConfig | None = None + patch_size: int = 16 + initializer_range: float = 0.02 + reassemble_factors: list[float | int] | None = None + post_process_channels: list[int] | None = None + fusion_hidden_size: int = 256 + head_hidden_size: int = 128 + number_output_channels: int = 256 + readout_type: str = "project" + min_depth: float = 0.001 + max_depth: float = 96.0 + bins_strategy: Literal["linear", "log", "chmv2_mixlog"] = "chmv2_mixlog" + norm_strategy: Literal["linear", "softmax", "sigmoid", "chmv2_mixlog"] = "chmv2_mixlog" + + def __post_init__(self, **kwargs): + if self.reassemble_factors is None: + self.reassemble_factors = [4, 2, 1, 0.5] + if self.post_process_channels is None: + self.post_process_channels = [128, 256, 512, 1024] default_config_kwargs = { "image_size": 416, @@ -103,34 +106,14 @@ def __init__( "return_class_token": True, } - backbone_config, kwargs = consolidate_backbone_kwargs_to_config( - backbone_config=backbone_config, + self.backbone_config, kwargs = consolidate_backbone_kwargs_to_config( + backbone_config=self.backbone_config, default_config_type="dinov3_vit", default_config_kwargs=default_config_kwargs, **kwargs, ) - self.backbone_config = backbone_config - self.patch_size = patch_size - self.initializer_range = initializer_range - self.reassemble_factors = reassemble_factors - self.post_process_channels = post_process_channels - self.fusion_hidden_size = fusion_hidden_size - self.head_hidden_size = head_hidden_size - self.number_output_channels = number_output_channels - self.readout_type = readout_type - - if bins_strategy not in ["linear", "log", "chmv2_mixlog"]: - raise ValueError("bins_strategy must be one of ['linear', 'log', 'chmv2_mixlog']") - if norm_strategy not in ["linear", "softmax", "sigmoid", "chmv2_mixlog"]: - raise ValueError("norm_strategy must be one of ['linear', 'softmax', 'sigmoid', 'chmv2_mixlog']") - - self.min_depth = min_depth - self.max_depth = max_depth - self.bins_strategy = bins_strategy - self.norm_strategy = norm_strategy - - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["CHMv2Config"] diff --git a/src/transformers/models/chmv2/modular_chmv2.py 
b/src/transformers/models/chmv2/modular_chmv2.py index 3b1f08a3f960..1b8e1bd31d23 100644 --- a/src/transformers/models/chmv2/modular_chmv2.py +++ b/src/transformers/models/chmv2/modular_chmv2.py @@ -13,7 +13,10 @@ # limitations under the License. """CHMv2 model — Canopy Height Model v2, adapted from DPT.""" +from typing import Literal + import torch +from huggingface_hub.dataclasses import strict from torch import nn from ... import initialization as init @@ -32,6 +35,7 @@ @auto_docstring(checkpoint="facebook/dinov3-vitl16-chmv2-dpt-head") +@strict(accept_kwargs=True) class CHMv2Config(PreTrainedConfig): r""" backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*): @@ -73,27 +77,25 @@ class CHMv2Config(PreTrainedConfig): model_type = "chmv2" sub_configs = {"backbone_config": AutoConfig} - def __init__( - self, - backbone_config: dict | None = None, - patch_size: int | None = 16, - initializer_range: float | None = 0.02, - reassemble_factors: list[float] | None = None, - post_process_channels: list[int] | None = None, - fusion_hidden_size: int | None = 256, - head_hidden_size: int | None = 128, - number_output_channels: int | None = 256, - readout_type: str | None = "project", - min_depth: float | None = 0.001, - max_depth: float | None = 96.0, - bins_strategy: str | None = "chmv2_mixlog", - norm_strategy: str | None = "chmv2_mixlog", - **kwargs, - ): - if reassemble_factors is None: - reassemble_factors = [4, 2, 1, 0.5] - if post_process_channels is None: - post_process_channels = [128, 256, 512, 1024] + backbone_config: dict | PreTrainedConfig | None = None + patch_size: int = 16 + initializer_range: float = 0.02 + reassemble_factors: list[float | int] | None = None + post_process_channels: list[int] | None = None + fusion_hidden_size: int = 256 + head_hidden_size: int = 128 + number_output_channels: int = 256 + readout_type: str = "project" + min_depth: float = 0.001 + max_depth: float = 96.0 + bins_strategy: Literal["linear", "log", "chmv2_mixlog"] = "chmv2_mixlog" + norm_strategy: Literal["linear", "softmax", "sigmoid", "chmv2_mixlog"] = "chmv2_mixlog" + + def __post_init__(self, **kwargs): + if self.reassemble_factors is None: + self.reassemble_factors = [4, 2, 1, 0.5] + if self.post_process_channels is None: + self.post_process_channels = [128, 256, 512, 1024] default_config_kwargs = { "image_size": 416, @@ -110,34 +112,14 @@ def __init__( "return_class_token": True, } - backbone_config, kwargs = consolidate_backbone_kwargs_to_config( - backbone_config=backbone_config, + self.backbone_config, kwargs = consolidate_backbone_kwargs_to_config( + backbone_config=self.backbone_config, default_config_type="dinov3_vit", default_config_kwargs=default_config_kwargs, **kwargs, ) - self.backbone_config = backbone_config - self.patch_size = patch_size - self.initializer_range = initializer_range - self.reassemble_factors = reassemble_factors - self.post_process_channels = post_process_channels - self.fusion_hidden_size = fusion_hidden_size - self.head_hidden_size = head_hidden_size - self.number_output_channels = number_output_channels - self.readout_type = readout_type - - if bins_strategy not in ["linear", "log", "chmv2_mixlog"]: - raise ValueError("bins_strategy must be one of ['linear', 'log', 'chmv2_mixlog']") - if norm_strategy not in ["linear", "softmax", "sigmoid", "chmv2_mixlog"]: - raise ValueError("norm_strategy must be one of ['linear', 'softmax', 'sigmoid', 'chmv2_mixlog']") - - self.min_depth = min_depth - self.max_depth = max_depth - self.bins_strategy = bins_strategy - 
self.norm_strategy = norm_strategy - - super().__init__(**kwargs) + super().__post_init__(**kwargs) class CHMv2ImageProcessorKwargs(ImagesKwargs, total=False): diff --git a/src/transformers/models/clap/configuration_clap.py b/src/transformers/models/clap/configuration_clap.py index 20d4bc1f8ba7..10f1f8506f3e 100644 --- a/src/transformers/models/clap/configuration_clap.py +++ b/src/transformers/models/clap/configuration_clap.py @@ -13,6 +13,8 @@ # limitations under the License. """CLAP model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring, logging @@ -21,6 +23,7 @@ @auto_docstring(checkpoint="laion/clap-htsat-fused") +@strict(accept_kwargs=True) class ClapTextConfig(PreTrainedConfig): r""" @@ -42,49 +45,27 @@ class ClapTextConfig(PreTrainedConfig): model_type = "clap_text_model" base_config_key = "text_config" - def __init__( - self, - vocab_size=50265, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=514, - type_vocab_size=1, - initializer_factor=1.0, - layer_norm_eps=1e-12, - projection_dim=512, - pad_token_id=1, - bos_token_id=0, - eos_token_id=2, - projection_hidden_act="relu", - **kwargs, - ): - super().__init__(**kwargs) - - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_factor = initializer_factor - self.layer_norm_eps = layer_norm_eps - self.projection_hidden_act = projection_hidden_act - self.projection_dim = projection_dim + vocab_size: int = 50265 + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.1 + attention_probs_dropout_prob: float = 0.1 + max_position_embeddings: int = 514 + type_vocab_size: int = 1 + initializer_factor: float = 1.0 + layer_norm_eps: float = 1e-12 + projection_dim: int = 512 + pad_token_id: int | None = 1 + bos_token_id: int | None = 0 + eos_token_id: int | list[int] | None = 2 + projection_hidden_act: str = "relu" @auto_docstring(checkpoint="laion/clap-htsat-fused") +@strict(accept_kwargs=True) class ClapAudioConfig(PreTrainedConfig): r""" window_size (`int`, *optional*, defaults to 8): @@ -131,69 +112,37 @@ class ClapAudioConfig(PreTrainedConfig): model_type = "clap_audio_model" base_config_key = "audio_config" - def __init__( - self, - window_size=8, - num_mel_bins=64, - spec_size=256, - hidden_act="gelu", - patch_size=4, - patch_stride=[4, 4], - num_classes=527, - hidden_size=768, - projection_dim=512, - depths=[2, 2, 6, 2], - num_attention_heads=[4, 8, 16, 32], - enable_fusion=False, - hidden_dropout_prob=0.1, - fusion_type=None, - patch_embed_input_channels=1, - flatten_patch_embeds=True, - patch_embeds_hidden_size=96, - enable_patch_layer_norm=True, - drop_path_rate=0.0, - attention_probs_dropout_prob=0.0, - qkv_bias=True, - mlp_ratio=4.0, - 
aff_block_r=4, - num_hidden_layers=4, - projection_hidden_act="relu", - layer_norm_eps=1e-5, - initializer_factor=1.0, - **kwargs, - ): - super().__init__(**kwargs) - self.window_size = window_size - self.num_mel_bins = num_mel_bins - self.spec_size = spec_size - self.patch_size = patch_size - self.patch_stride = patch_stride - self.num_classes = num_classes - self.hidden_size = hidden_size - self.depths = depths - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.window_size = window_size - self.enable_fusion = enable_fusion - self.fusion_type = fusion_type - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.projection_dim = projection_dim - self.flatten_patch_embeds = flatten_patch_embeds - self.patch_embeds_hidden_size = patch_embeds_hidden_size - self.enable_patch_layer_norm = enable_patch_layer_norm - self.drop_path_rate = drop_path_rate - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.qkv_bias = qkv_bias - self.mlp_ratio = mlp_ratio - self.patch_embed_input_channels = patch_embed_input_channels - self.aff_block_r = aff_block_r - self.layer_norm_eps = layer_norm_eps - self.initializer_factor = initializer_factor - self.projection_hidden_act = projection_hidden_act + window_size: int = 8 + num_mel_bins: int = 64 + spec_size: int = 256 + hidden_act: str = "gelu" + patch_size: int | list[int] | tuple[int, int] = 4 + patch_stride: int | list[int] | tuple[int, ...] = (4, 4) + num_classes: int = 527 + hidden_size: int = 768 + projection_dim: int = 512 + depths: list[int] | tuple[int, ...] = (2, 2, 6, 2) + num_attention_heads: list[int] | tuple[int, ...] = (4, 8, 16, 32) + enable_fusion: bool = False + hidden_dropout_prob: float = 0.1 + fusion_type: str | None = None + patch_embed_input_channels: int = 1 + flatten_patch_embeds: bool = True + patch_embeds_hidden_size: int = 96 + enable_patch_layer_norm: bool = True + drop_path_rate: float = 0.0 + attention_probs_dropout_prob: float = 0.0 + qkv_bias: bool = True + mlp_ratio: float = 4.0 + aff_block_r: int = 4 + num_hidden_layers: int = 4 + projection_hidden_act: str = "relu" + layer_norm_eps: float = 1e-5 + initializer_factor: float = 1.0 @auto_docstring(checkpoint="laion/clap-htsat-fused") +@strict(accept_kwargs=True) class ClapConfig(PreTrainedConfig): r""" Example: @@ -223,45 +172,34 @@ class ClapConfig(PreTrainedConfig): model_type = "clap" sub_configs = {"text_config": ClapTextConfig, "audio_config": ClapAudioConfig} - def __init__( - self, - text_config=None, - audio_config=None, - logit_scale_init_value=(1 / 0.07), - projection_dim=512, - projection_hidden_act="relu", - initializer_factor=1.0, - **kwargs, - ): - if text_config is None: - text_config = ClapTextConfig() + text_config: dict | PreTrainedConfig | None = None + audio_config: dict | PreTrainedConfig | None = None + logit_scale_init_value: float = 1 / 0.07 + projection_dim: int = 512 + projection_hidden_act: str = "relu" + initializer_factor: float = 1.0 + + def __post_init__(self, **kwargs): + if self.text_config is None: + self.text_config = ClapTextConfig() logger.info("`text_config` is `None`. 
initializing the `ClapTextConfig` with default values.") - elif isinstance(text_config, dict): - text_config = ClapTextConfig(**text_config) + elif isinstance(self.text_config, dict): + self.text_config = ClapTextConfig(**self.text_config) - if audio_config is None: - audio_config = ClapAudioConfig() + if self.audio_config is None: + self.audio_config = ClapAudioConfig() logger.info("`audio_config` is `None`. initializing the `ClapAudioConfig` with default values.") - elif isinstance(audio_config, dict): - audio_config = ClapAudioConfig(**audio_config) - - self.text_config = text_config - self.audio_config = audio_config + elif isinstance(self.audio_config, dict): + self.audio_config = ClapAudioConfig(**self.audio_config) - self.text_config.projection_dim = projection_dim - self.audio_config.projection_dim = projection_dim + self.text_config.projection_dim = self.projection_dim + self.audio_config.projection_dim = self.projection_dim - self.text_config.projection_hidden_act = projection_hidden_act - self.audio_config.projection_hidden_act = projection_hidden_act - - self.projection_dim = projection_dim - self.projection_hidden_act = projection_hidden_act + self.text_config.projection_hidden_act = self.projection_hidden_act + self.audio_config.projection_hidden_act = self.projection_hidden_act self.hidden_size = self.text_config.hidden_size - - self.logit_scale_init_value = logit_scale_init_value - self.initializer_factor = initializer_factor self.num_hidden_layers = self.text_config.num_hidden_layers + len(self.audio_config.depths) - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["ClapAudioConfig", "ClapConfig", "ClapTextConfig"] diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index fb0da2360d68..96c540a3424f 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -272,7 +272,7 @@ def __init__(self, config: ClapAudioConfig): padding = ((patch_size[0] - patch_stride[0]) // 2, (patch_size[1] - patch_stride[1]) // 2) - scale_factor = 4 if (self.enable_fusion) and (config.fusion_type == "channel_map") else 1 + scale_factor = 4 if self.enable_fusion and config.fusion_type == "channel_map" else 1 self.proj = nn.Conv2d( config.patch_embed_input_channels * scale_factor, diff --git a/src/transformers/models/clip/configuration_clip.py b/src/transformers/models/clip/configuration_clip.py index 2a2d8688605a..107a1e9d3b25 100644 --- a/src/transformers/models/clip/configuration_clip.py +++ b/src/transformers/models/clip/configuration_clip.py @@ -13,6 +13,8 @@ # limitations under the License. 
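[Editor's usage sketch for the ClapConfig hunk above, assuming this PR's behavior: the projection settings are pushed into both sub-configs, while `hidden_size` and `num_hidden_layers` are derived from them in `__post_init__` instead of being constructor arguments.]

from transformers import ClapConfig

config = ClapConfig(projection_dim=256)

# Propagated into both sub-configs in __post_init__:
assert config.text_config.projection_dim == 256
assert config.audio_config.projection_dim == 256

# Derived from the sub-configs rather than passed in:
assert config.hidden_size == config.text_config.hidden_size
assert config.num_hidden_layers == (
    config.text_config.num_hidden_layers + len(config.audio_config.depths)
)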
"""CLIP model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring, logging @@ -21,6 +23,7 @@ @auto_docstring(checkpoint="openai/clip-vit-base-patch32") +@strict(accept_kwargs=True) class CLIPTextConfig(PreTrainedConfig): r""" Example: @@ -41,47 +44,36 @@ class CLIPTextConfig(PreTrainedConfig): model_type = "clip_text_model" base_config_key = "text_config" - def __init__( - self, - vocab_size=49408, - hidden_size=512, - intermediate_size=2048, - projection_dim=512, - num_hidden_layers=12, - num_attention_heads=8, - max_position_embeddings=77, - hidden_act="quick_gelu", - layer_norm_eps=1e-5, - attention_dropout=0.0, - initializer_range=0.02, - initializer_factor=1.0, - # This differs from `CLIPTokenizer`'s default and from openai/clip - # See https://github.com/huggingface/transformers/pull/24773#issuecomment-1632287538 - pad_token_id=1, - bos_token_id=49406, - eos_token_id=49407, - **kwargs, - ): - super().__init__(**kwargs) - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.projection_dim = projection_dim - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.max_position_embeddings = max_position_embeddings - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.initializer_factor = initializer_factor - self.attention_dropout = attention_dropout + vocab_size: int = 49408 + hidden_size: int = 512 + intermediate_size: int = 2048 + projection_dim: int | None = 512 + num_hidden_layers: int = 12 + num_attention_heads: int = 8 + max_position_embeddings: int = 77 + hidden_act: str = "quick_gelu" + layer_norm_eps: float | None = 1e-5 + attention_dropout: int | float | None = 0.0 + initializer_range: float = 0.02 + initializer_factor: float | None = 1.0 + + # This differs from `CLIPTokenizer`'s default and from openai/clip + # See https://github.com/huggingface/transformers/pull/24773#issuecomment-1632287538 + pad_token_id: int | None = 1 + bos_token_id: int | None = 49406 + eos_token_id: int | list[int] | None = 49407 + + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + if self.hidden_size % self.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({self.hidden_size}) is not a multiple of the number of attention " + f"heads ({self.num_attention_heads})." 
+ ) @auto_docstring(checkpoint="openai/clip-vit-base-patch32") +@strict(accept_kwargs=True) class CLIPVisionConfig(PreTrainedConfig): r""" Example: @@ -102,43 +94,53 @@ class CLIPVisionConfig(PreTrainedConfig): model_type = "clip_vision_model" base_config_key = "vision_config" - def __init__( - self, - hidden_size=768, - intermediate_size=3072, - projection_dim=512, - num_hidden_layers=12, - num_attention_heads=12, - num_channels=3, - image_size=224, - patch_size=32, - hidden_act="quick_gelu", - layer_norm_eps=1e-5, - attention_dropout=0.0, - initializer_range=0.02, - initializer_factor=1.0, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.projection_dim = projection_dim - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_channels = num_channels - self.patch_size = patch_size - self.image_size = image_size - self.initializer_range = initializer_range - self.initializer_factor = initializer_factor - self.attention_dropout = attention_dropout - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act + hidden_size: int = 768 + intermediate_size: int = 3072 + projection_dim: int | None = 512 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + num_channels: int | None = 3 + image_size: int | None = 224 + patch_size: int | None = 32 + hidden_act: str = "quick_gelu" + layer_norm_eps: float | None = 1e-5 + attention_dropout: int | float | None = 0.0 + initializer_range: float = 0.02 + initializer_factor: float | None = 1.0 + + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + if self.hidden_size % self.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({self.hidden_size}) is not a multiple of the number of attention " + f"heads ({self.num_attention_heads})." + ) @auto_docstring(checkpoint="openai/clip-vit-base-patch32") +@strict(accept_kwargs=True) class CLIPConfig(PreTrainedConfig): r""" + [`CLIPConfig`] is the configuration class to store the configuration of a [`CLIPModel`]. It is used to instantiate + a CLIP model according to the specified arguments, defining the text model and vision model configs. Instantiating + a configuration with the defaults will yield a similar configuration to that of the CLIP + [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture. + + Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PreTrainedConfig`] for more information. + + Args: + text_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`CLIPTextConfig`]. + vision_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`CLIPVisionConfig`]. + projection_dim (`int`, *optional*, defaults to 512): + Dimensionality of text and vision projection layers. + logit_scale_init_value (`float | int`, *optional*, defaults to 2.6592): + The initial value of the *logit_scale* parameter. Default is used as per the original CLIP implementation. + kwargs (*optional*): + Dictionary of keyword arguments. 
+ Example: ```python @@ -166,22 +168,37 @@ class CLIPConfig(PreTrainedConfig): model_type = "clip" sub_configs = {"text_config": CLIPTextConfig, "vision_config": CLIPVisionConfig} - def __init__( - self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs - ): - # If `_config_dict` exist, we use them for the backward compatibility. - # We pop out these 2 attributes before calling `super().__init__` to avoid them being saved (which causes a lot - # of confusion!). - text_config_dict = kwargs.pop("text_config_dict", None) - vision_config_dict = kwargs.pop("vision_config_dict", None) + text_config: dict | CLIPTextConfig | None = None + vision_config: dict | CLIPVisionConfig | None = None + projection_dim: int | None = 512 + logit_scale_init_value: float | int | None = 2.6592 + initializer_factor: float | None = 1.0 + + def __post_init__(self, **kwargs): + if self.text_config is None: + text_config = {} + logger.info("`text_config` is `None`. Initializing the `CLIPTextConfig` with default values.") + elif isinstance(self.text_config, CLIPTextConfig): + text_config = self.text_config.to_dict() + else: + text_config = self.text_config + + if self.vision_config is None: + vision_config = {} + logger.info("`vision_config` is `None`. initializing the `CLIPVisionConfig` with default values.") + elif isinstance(self.vision_config, CLIPVisionConfig): + vision_config = self.vision_config.to_dict() + else: + vision_config = self.vision_config + # For backward compatibility check keyword args # Instead of simply assigning `[text|vision]_config_dict` to `[text|vision]_config`, we use the values in # `[text|vision]_config_dict` to update the values in `[text|vision]_config`. The values should be same in most # cases, but we don't want to break anything regarding `_config_dict` that existed before commit `8827e1b2`. - if text_config_dict is not None: - if text_config is None: - text_config = {} + text_config_dict = kwargs.pop("text_config_dict", None) + vision_config_dict = kwargs.pop("vision_config_dict", None) + if text_config_dict is not None: # This is the complete result when using `text_config_dict`. _text_config_dict = CLIPTextConfig(**text_config_dict).to_dict() @@ -206,9 +223,6 @@ def __init__( text_config.update(_text_config_dict) if vision_config_dict is not None: - if vision_config is None: - vision_config = {} - # This is the complete result when using `vision_config_dict`. _vision_config_dict = CLIPVisionConfig(**vision_config_dict).to_dict() # convert keys to string instead of integer @@ -237,25 +251,11 @@ def __init__( # Update all values in `vision_config` with the ones in `_vision_config_dict`. vision_config.update(_vision_config_dict) - if text_config is None: - text_config = CLIPTextConfig() - logger.info("`text_config` is `None`. initializing the `CLIPTextConfig` with default values.") - elif isinstance(text_config, dict): - text_config = CLIPTextConfig(**text_config) - - if vision_config is None: - vision_config = CLIPVisionConfig() - logger.info("`vision_config` is `None`. 
initializing the `CLIPVisionConfig` with default values.") - elif isinstance(vision_config, dict): - vision_config = CLIPVisionConfig(**vision_config) - - self.text_config = text_config - self.vision_config = vision_config + # Finally we can convert back our unified text/vision configs to `PretrainedConfig` + self.text_config = CLIPTextConfig(**text_config) + self.vision_config = CLIPVisionConfig(**vision_config) - self.projection_dim = projection_dim - self.logit_scale_init_value = logit_scale_init_value - self.initializer_factor = 1.0 - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["CLIPConfig", "CLIPTextConfig", "CLIPVisionConfig"] diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py index 2530281650d5..923c650d2158 100644 --- a/src/transformers/models/clip/modeling_clip.py +++ b/src/transformers/models/clip/modeling_clip.py @@ -288,11 +288,6 @@ def __init__(self, config: CLIPVisionConfig | CLIPTextConfig): self.embed_dim = config.hidden_size self.num_heads = config.num_attention_heads self.head_dim = self.embed_dim // self.num_heads - if self.head_dim * self.num_heads != self.embed_dim: - raise ValueError( - f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" - f" {self.num_heads})." - ) self.scale = self.head_dim**-0.5 self.dropout = config.attention_dropout self.is_causal = False diff --git a/src/transformers/models/clipseg/configuration_clipseg.py b/src/transformers/models/clipseg/configuration_clipseg.py index 6e382145a2e0..a0ce571deff2 100644 --- a/src/transformers/models/clipseg/configuration_clipseg.py +++ b/src/transformers/models/clipseg/configuration_clipseg.py @@ -13,6 +13,8 @@ # limitations under the License. """CLIPSeg model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring, logging @@ -21,6 +23,7 @@ @auto_docstring(checkpoint="CIDAS/clipseg-rd64") +@strict(accept_kwargs=True) class CLIPSegTextConfig(PreTrainedConfig): r""" Example: @@ -41,43 +44,24 @@ class CLIPSegTextConfig(PreTrainedConfig): model_type = "clipseg_text_model" base_config_key = "text_config" - def __init__( - self, - vocab_size=49408, - hidden_size=512, - intermediate_size=2048, - num_hidden_layers=12, - num_attention_heads=8, - max_position_embeddings=77, - hidden_act="quick_gelu", - layer_norm_eps=1e-5, - attention_dropout=0.0, - initializer_range=0.02, - initializer_factor=1.0, - pad_token_id=1, - bos_token_id=49406, - eos_token_id=49407, - **kwargs, - ): - super().__init__(**kwargs) - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.max_position_embeddings = max_position_embeddings - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.initializer_factor = initializer_factor - self.attention_dropout = attention_dropout + vocab_size: int = 49408 + hidden_size: int = 512 + intermediate_size: int = 2048 + num_hidden_layers: int = 12 + num_attention_heads: int = 8 + max_position_embeddings: int = 77 + hidden_act: str = "quick_gelu" + layer_norm_eps: float = 1e-5 + attention_dropout: float | int = 0.0 + initializer_range: float = 0.02 + 
initializer_factor: float = 1.0 + pad_token_id: int | None = 1 + bos_token_id: int | None = 49406 + eos_token_id: int | list[int] | None = 49407 @auto_docstring(checkpoint="CIDAS/clipseg-rd64") +@strict(accept_kwargs=True) class CLIPSegVisionConfig(PreTrainedConfig): r""" Example: @@ -98,39 +82,22 @@ class CLIPSegVisionConfig(PreTrainedConfig): model_type = "clipseg_vision_model" base_config_key = "vision_config" - def __init__( - self, - hidden_size=768, - intermediate_size=3072, - num_hidden_layers=12, - num_attention_heads=12, - num_channels=3, - image_size=224, - patch_size=32, - hidden_act="quick_gelu", - layer_norm_eps=1e-5, - attention_dropout=0.0, - initializer_range=0.02, - initializer_factor=1.0, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_channels = num_channels - self.patch_size = patch_size - self.image_size = image_size - self.initializer_range = initializer_range - self.initializer_factor = initializer_factor - self.attention_dropout = attention_dropout - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act + hidden_size: int = 768 + intermediate_size: int = 3072 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + num_channels: int = 3 + image_size: int | list[int] | tuple[int, int] = 224 + patch_size: int | list[int] | tuple[int, int] = 32 + hidden_act: str = "quick_gelu" + layer_norm_eps: float = 1e-5 + attention_dropout: float | int = 0.0 + initializer_range: float = 0.02 + initializer_factor: float = 1.0 @auto_docstring(checkpoint="CIDAS/clipseg-rd64") +@strict(accept_kwargs=True) class CLIPSegConfig(PreTrainedConfig): r""" extract_layers (`list[int]`, *optional*, defaults to `[3, 6, 9]`): @@ -170,35 +137,45 @@ class CLIPSegConfig(PreTrainedConfig): model_type = "clipseg" sub_configs = {"text_config": CLIPSegTextConfig, "vision_config": CLIPSegVisionConfig} - def __init__( - self, - text_config=None, - vision_config=None, - projection_dim=512, - logit_scale_init_value=2.6592, - extract_layers=[3, 6, 9], - reduce_dim=64, - decoder_num_attention_heads=4, - decoder_attention_dropout=0.0, - decoder_hidden_act="quick_gelu", - decoder_intermediate_size=2048, - conditional_layer=0, - use_complex_transposed_convolution=False, - **kwargs, - ): - # If `_config_dict` exist, we use them for the backward compatibility. - # We pop out these 2 attributes before calling `super().__init__` to avoid them being saved (which causes a lot - # of confusion!). - text_config_dict = kwargs.pop("text_config_dict", None) - vision_config_dict = kwargs.pop("vision_config_dict", None) + text_config: dict | CLIPSegTextConfig | None = None + vision_config: dict | CLIPSegVisionConfig | None = None + projection_dim: int | None = 512 + logit_scale_init_value: float | int | None = 2.6592 + initializer_factor: float | None = 1.0 + extract_layers: list[int] | tuple[int, ...] = (3, 6, 9) + reduce_dim: int = 64 + decoder_num_attention_heads: int = 4 + decoder_attention_dropout: float | int = 0.0 + decoder_hidden_act: str = "quick_gelu" + decoder_intermediate_size: int = 2048 + conditional_layer: int = 0 + use_complex_transposed_convolution: bool = False + + def __post_init__(self, **kwargs): + if self.text_config is None: + text_config = {} + logger.info("`text_config` is `None`. 
Initializing the `CLIPSegTextConfig` with default values.") + elif isinstance(self.text_config, CLIPSegTextConfig): + text_config = self.text_config.to_dict() + else: + text_config = self.text_config + + if self.vision_config is None: + vision_config = {} + logger.info("`vision_config` is `None`. initializing the `CLIPSegVisionConfig` with default values.") + elif isinstance(self.vision_config, CLIPSegVisionConfig): + vision_config = self.vision_config.to_dict() + else: + vision_config = self.vision_config + # For backward compatibility check keyword args # Instead of simply assigning `[text|vision]_config_dict` to `[text|vision]_config`, we use the values in # `[text|vision]_config_dict` to update the values in `[text|vision]_config`. The values should be same in most # cases, but we don't want to break anything regarding `_config_dict` that existed before commit `8827e1b2`. - if text_config_dict is not None: - if text_config is None: - text_config = {} + text_config_dict = kwargs.pop("text_config_dict", None) + vision_config_dict = kwargs.pop("vision_config_dict", None) + if text_config_dict is not None: # This is the complete result when using `text_config_dict`. _text_config_dict = CLIPSegTextConfig(**text_config_dict).to_dict() @@ -223,9 +200,6 @@ def __init__( text_config.update(_text_config_dict) if vision_config_dict is not None: - if vision_config is None: - vision_config = {} - # This is the complete result when using `vision_config_dict`. _vision_config_dict = CLIPSegVisionConfig(**vision_config_dict).to_dict() # convert keys to string instead of integer @@ -254,33 +228,11 @@ def __init__( # Update all values in `vision_config` with the ones in `_vision_config_dict`. vision_config.update(_vision_config_dict) - if text_config is None: - text_config = CLIPSegTextConfig() - logger.info("`text_config` is `None`. initializing the `CLIPSegTextConfig` with default values.") - elif isinstance(text_config, dict): - text_config = CLIPSegTextConfig(**text_config) + # Finally we can convert back our unified text/vision configs to `PretrainedConfig` + self.text_config = CLIPSegTextConfig(**text_config) + self.vision_config = CLIPSegVisionConfig(**vision_config) - if vision_config is None: - vision_config = CLIPSegVisionConfig() - logger.info("`vision_config` is `None`. 
initializing the `CLIPSegVisionConfig` with default values.") - elif isinstance(vision_config, dict): - vision_config = CLIPSegVisionConfig(**vision_config) - - self.text_config = text_config - self.vision_config = vision_config - - self.projection_dim = projection_dim - self.logit_scale_init_value = logit_scale_init_value - self.extract_layers = extract_layers - self.reduce_dim = reduce_dim - self.decoder_num_attention_heads = decoder_num_attention_heads - self.decoder_attention_dropout = decoder_attention_dropout - self.decoder_hidden_act = decoder_hidden_act - self.decoder_intermediate_size = decoder_intermediate_size - self.conditional_layer = conditional_layer - self.initializer_factor = 1.0 - self.use_complex_transposed_convolution = use_complex_transposed_convolution - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["CLIPSegConfig", "CLIPSegTextConfig", "CLIPSegVisionConfig"] diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index 488ac8a7ee83..33039177caad 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -655,7 +655,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if input_ids is None: raise ValueError("You have to specify input_ids") @@ -797,7 +797,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) hidden_states = self.pre_layrnorm(hidden_states) diff --git a/src/transformers/models/clvp/configuration_clvp.py b/src/transformers/models/clvp/configuration_clvp.py index c0ba00ea516b..16beba7a29d2 100644 --- a/src/transformers/models/clvp/configuration_clvp.py +++ b/src/transformers/models/clvp/configuration_clvp.py @@ -15,6 +15,8 @@ import os +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring, logging @@ -23,6 +25,7 @@ @auto_docstring(checkpoint="susnato/clvp_dev") +@strict(accept_kwargs=True) class ClvpEncoderConfig(PreTrainedConfig): r""" use_rotary_embedding (`bool`, *optional*, defaults to `True`): @@ -50,46 +53,23 @@ class ClvpEncoderConfig(PreTrainedConfig): model_type = "clvp_encoder" base_config_key = ["text_config", "speech_config"] - def __init__( - self, - vocab_size=256, - hidden_size=768, - intermediate_size=1536, - projection_dim=768, - num_hidden_layers=20, - num_attention_heads=12, - hidden_act="gelu", - layer_norm_eps=1e-5, - attention_dropout=0.1, - dropout=0.1, - use_rotary_embedding=True, - use_attention_bias=False, - summary_type="mean", - initializer_factor=1.0, - bos_token_id=255, - eos_token_id=0, - pad_token_id=None, - **kwargs, - ): - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.projection_dim = projection_dim - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.layer_norm_eps = 
layer_norm_eps - self.hidden_act = hidden_act - self.initializer_factor = initializer_factor - self.attention_dropout = attention_dropout - self.dropout = dropout - self.use_rotary_embedding = use_rotary_embedding - self.use_attention_bias = use_attention_bias - self.summary_type = summary_type - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - - super().__init__(**kwargs) + vocab_size: int = 256 + hidden_size: int = 768 + intermediate_size: int = 1536 + projection_dim: int = 768 + num_hidden_layers: int = 20 + num_attention_heads: int = 12 + hidden_act: str = "gelu" + layer_norm_eps: float = 1e-5 + attention_dropout: float | int = 0.1 + dropout: float | int = 0.1 + use_rotary_embedding: bool = True + use_attention_bias: bool = False + summary_type: str = "mean" + initializer_factor: float = 1.0 + bos_token_id: int | None = 255 + eos_token_id: int | list[int] | None = 0 + pad_token_id: int | None = None @classmethod def from_pretrained( @@ -118,6 +98,7 @@ def from_pretrained( @auto_docstring(checkpoint="susnato/clvp_dev") +@strict(accept_kwargs=True) class ClvpDecoderConfig(PreTrainedConfig): r""" resid_pdrop (`float`, *optional*, defaults to 0.1): @@ -172,72 +153,38 @@ class ClvpDecoderConfig(PreTrainedConfig): model_type = "clvp_decoder" base_config_key = "decoder_config" - def __init__( - self, - vocab_size=8194, - max_position_embeddings=608, - max_text_tokens=404, - hidden_size=1024, - num_hidden_layers=30, - num_attention_heads=16, - n_inner=None, - num_mel_attn_blocks=6, - activation_function="gelu_new", - resid_pdrop=0.1, - embd_pdrop=0.1, - attention_dropout=0.1, - layer_norm_epsilon=1e-5, - initializer_range=0.02, - summary_type="cls_index", - summary_use_proj=True, - summary_activation=None, - summary_proj_to_labels=True, - summary_first_dropout=0.1, - use_cache=True, - bos_token_id=8192, - eos_token_id=8193, - pad_token_id=None, - feature_size=80, - use_attention_bias=True, - initializer_factor=1.0, - decoder_fixing_codes=[83, 45, 45, 248], - add_cross_attention=False, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.max_text_tokens = max_text_tokens - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.n_inner = n_inner - self.num_mel_attn_blocks = num_mel_attn_blocks - self.activation_function = activation_function - self.resid_pdrop = resid_pdrop - self.embd_pdrop = embd_pdrop - self.attention_dropout = attention_dropout - self.layer_norm_epsilon = layer_norm_epsilon - self.initializer_range = initializer_range - self.summary_type = summary_type - self.summary_use_proj = summary_use_proj - self.summary_activation = summary_activation - self.summary_first_dropout = summary_first_dropout - self.summary_proj_to_labels = summary_proj_to_labels - self.use_cache = use_cache - self.feature_size = feature_size - self.use_attention_bias = use_attention_bias - self.initializer_factor = initializer_factor - self.decoder_fixing_codes = decoder_fixing_codes - - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.add_cross_attention = add_cross_attention - - super().__init__(**kwargs) + vocab_size: int = 8194 + max_position_embeddings: int = 608 + max_text_tokens: int = 404 + hidden_size: int = 1024 + num_hidden_layers: int = 30 + num_attention_heads: int = 16 + n_inner: int | None = None + num_mel_attn_blocks: int = 6 + 
activation_function: str = "gelu_new" + resid_pdrop: float = 0.1 + embd_pdrop: float = 0.1 + attention_dropout: float | int = 0.1 + layer_norm_epsilon: float = 1e-5 + initializer_range: float = 0.02 + summary_type: str = "cls_index" + summary_use_proj: bool = True + summary_activation: str | None = None + summary_proj_to_labels: bool = True + summary_first_dropout: float | int = 0.1 + use_cache: bool = True + bos_token_id: int | None = 8192 + eos_token_id: int | None = 8193 + pad_token_id: int | None = None + feature_size: int = 80 + use_attention_bias: bool = True + initializer_factor: float = 1.0 + decoder_fixing_codes: list[int] | tuple[int, ...] = (83, 45, 45, 248) + add_cross_attention: bool = False @auto_docstring(checkpoint="susnato/clvp_dev") +@strict(accept_kwargs=True) class ClvpConfig(PreTrainedConfig): r""" speech_config (`dict`, *optional*): @@ -277,42 +224,33 @@ class ClvpConfig(PreTrainedConfig): "decoder_config": ClvpDecoderConfig, } - def __init__( - self, - text_config=None, - speech_config=None, - decoder_config=None, - projection_dim=768, - logit_scale_init_value=2.6592, - initializer_factor=1.0, - **kwargs, - ): - if text_config is None: - text_config = ClvpEncoderConfig() + text_config: dict | PreTrainedConfig | None = None + speech_config: dict | PreTrainedConfig | None = None + decoder_config: dict | PreTrainedConfig | None = None + projection_dim: int = 768 + logit_scale_init_value: float = 2.6592 + initializer_factor: float = 1.0 + + def __post_init__(self, **kwargs): + if self.text_config is None: + self.text_config = ClvpEncoderConfig() logger.info("`text_config` is `None`. initializing the `ClvpEncoderConfig` with default values.") - elif isinstance(text_config, dict): - text_config = ClvpEncoderConfig(**text_config) + elif isinstance(self.text_config, dict): + self.text_config = ClvpEncoderConfig(**self.text_config) - if speech_config is None: - speech_config = ClvpEncoderConfig() + if self.speech_config is None: + self.speech_config = ClvpEncoderConfig() logger.info("`speech_config` is `None`. initializing the `ClvpEncoderConfig` with default values.") - elif isinstance(speech_config, dict): - speech_config = ClvpEncoderConfig(**speech_config) + elif isinstance(self.speech_config, dict): + self.speech_config = ClvpEncoderConfig(**self.speech_config) - if decoder_config is None: - decoder_config = ClvpDecoderConfig() + if self.decoder_config is None: + self.decoder_config = ClvpDecoderConfig() logger.info("`image_config` is `None`. initializing the `ClvpDecoderConfig` with default values.") - elif isinstance(decoder_config, dict): - decoder_config = ClvpDecoderConfig(**decoder_config) - - self.text_config = text_config - self.speech_config = speech_config - self.decoder_config = decoder_config + elif isinstance(self.decoder_config, dict): + self.decoder_config = ClvpDecoderConfig(**self.decoder_config) - self.projection_dim = projection_dim - self.logit_scale_init_value = logit_scale_init_value - self.initializer_factor = initializer_factor - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["ClvpConfig", "ClvpDecoderConfig", "ClvpEncoderConfig"] diff --git a/src/transformers/models/codegen/configuration_codegen.py b/src/transformers/models/codegen/configuration_codegen.py index a696d6ddfc1c..e3ecc06c18bc 100644 --- a/src/transformers/models/codegen/configuration_codegen.py +++ b/src/transformers/models/codegen/configuration_codegen.py @@ -13,14 +13,14 @@ # limitations under the License. 
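[Editor's side note, with an invented `_Example` class: several list defaults in these hunks (e.g. `decoder_fixing_codes`, `patch_stride`, `depths`) become tuples or `None` because dataclass fields reject mutable defaults; a minimal sketch of both options follows.]

from dataclasses import dataclass


@dataclass
class _Example:
    codes: tuple[int, ...] = (83, 45, 45, 248)  # immutable default: accepted
    factors: list[float] | None = None          # mutable value: defer to __post_init__

    def __post_init__(self):
        if self.factors is None:
            self.factors = [4.0, 2.0, 1.0, 0.5]


# A bare `codes: list[int] = [83, 45, 45, 248]` would raise ValueError when the
# class is created, which is why the converted configs avoid list literals as defaults.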
"""CodeGen model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="Salesforce/codegen-2B-mono") +@strict(accept_kwargs=True) class CodeGenConfig(PreTrainedConfig): r""" n_ctx (`int`, *optional*, defaults to 2048): @@ -54,51 +54,24 @@ class CodeGenConfig(PreTrainedConfig): "num_hidden_layers": "n_layer", } - def __init__( - self, - vocab_size=50400, - n_positions=2048, - n_ctx=2048, - n_embd=4096, - n_layer=28, - n_head=16, - rotary_dim=64, - n_inner=None, - activation_function="gelu_new", - resid_pdrop=0.0, - embd_pdrop=0.0, - attn_pdrop=0.0, - layer_norm_epsilon=1e-5, - initializer_range=0.02, - use_cache=True, - bos_token_id=50256, - eos_token_id=50256, - tie_word_embeddings=False, - **kwargs, - ): - self.vocab_size = vocab_size - self.n_ctx = n_ctx - self.n_positions = n_positions - self.n_embd = n_embd - self.n_layer = n_layer - self.n_head = n_head - self.n_inner = n_inner - self.rotary_dim = rotary_dim - self.activation_function = activation_function - self.resid_pdrop = resid_pdrop - self.embd_pdrop = embd_pdrop - self.attn_pdrop = attn_pdrop - self.layer_norm_epsilon = layer_norm_epsilon - self.initializer_range = initializer_range - self.use_cache = use_cache - - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - super().__init__(**kwargs) + vocab_size: int = 50400 + n_positions: int = 2048 + n_ctx: int = 2048 + n_embd: int = 4096 + n_layer: int = 28 + n_head: int = 16 + rotary_dim: int = 64 + n_inner: int | None = None + activation_function: str = "gelu_new" + resid_pdrop: float = 0.0 + embd_pdrop: float = 0.0 + attn_pdrop: float = 0.0 + layer_norm_epsilon: float = 1e-5 + initializer_range: float = 0.02 + use_cache: bool = True + bos_token_id: int | None = 50256 + eos_token_id: int | list[int] | None = 50256 + tie_word_embeddings: bool = False __all__ = ["CodeGenConfig"] diff --git a/src/transformers/models/codegen/modeling_codegen.py b/src/transformers/models/codegen/modeling_codegen.py index ccc540207789..14a821d7f4eb 100644 --- a/src/transformers/models/codegen/modeling_codegen.py +++ b/src/transformers/models/codegen/modeling_codegen.py @@ -331,7 +331,7 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -460,7 +460,7 @@ def forward( `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict transformer_outputs = self.transformer( input_ids, diff --git a/src/transformers/models/cohere/configuration_cohere.py 
b/src/transformers/models/cohere/configuration_cohere.py index 5bc877b467ff..adf4f7697dac 100644 --- a/src/transformers/models/cohere/configuration_cohere.py +++ b/src/transformers/models/cohere/configuration_cohere.py @@ -18,15 +18,15 @@ # limitations under the License. """Cohere model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters -from ...utils import auto_docstring, logging - - -logger = logging.get_logger(__name__) +from ...utils import auto_docstring @auto_docstring(checkpoint="CohereForAI/c4ai-command-r-v01") +@strict(accept_kwargs=True) class CohereConfig(PreTrainedConfig): r""" logit_scale (`float`, *optional*, defaults to 0.0625): @@ -62,58 +62,32 @@ class CohereConfig(PreTrainedConfig): "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), "norm": (["hidden_states"], ["hidden_states"]), } - - def __init__( - self, - vocab_size: int | None = 256000, - hidden_size: int | None = 8192, - intermediate_size: int | None = 22528, - logit_scale: float | None = 0.0625, - num_hidden_layers: int | None = 40, - num_attention_heads: int | None = 64, - num_key_value_heads: int | None = None, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 8192, - initializer_range: float | None = 0.02, - layer_norm_eps: int | None = 1e-5, - use_cache: bool | None = True, - pad_token_id: int | None = 0, - bos_token_id: int | None = 5, - eos_token_id: int | None = 255001, - tie_word_embeddings: bool | None = True, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - use_qk_norm: bool | None = False, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.logit_scale = logit_scale - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.use_qk_norm = use_qk_norm - self.rope_parameters = rope_parameters - - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - super().__init__(**kwargs) + vocab_size: int = 256000 + hidden_size: int = 8192 + intermediate_size: int = 22528 + logit_scale: float | None = 0.0625 + num_hidden_layers: int = 40 + num_attention_heads: int = 64 + num_key_value_heads: int | None = None + hidden_act: str = "silu" + max_position_embeddings: int = 8192 + initializer_range: float = 0.02 + layer_norm_eps: float | None = 1e-5 + use_cache: bool = True + pad_token_id: int | None = 0 + bos_token_id: int | None = 5 + eos_token_id: int | list[int] | None = 255001 + tie_word_embeddings: bool = True + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + attention_dropout: float | int | None = 0.0 + use_qk_norm: bool | None = False + + def __post_init__(self, **kwargs): + if self.num_key_value_heads is None: + self.num_key_value_heads = 
self.num_attention_heads + + super().__post_init__(**kwargs) __all__ = ["CohereConfig"] diff --git a/src/transformers/models/cohere2/configuration_cohere2.py b/src/transformers/models/cohere2/configuration_cohere2.py index 443bdb9005a1..6161e444259e 100644 --- a/src/transformers/models/cohere2/configuration_cohere2.py +++ b/src/transformers/models/cohere2/configuration_cohere2.py @@ -18,12 +18,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from ...configuration_utils import PreTrainedConfig, layer_type_validation +from huggingface_hub.dataclasses import strict + +from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring @auto_docstring(checkpoint="CohereForAI/c4ai-command-r-v01") +@strict(accept_kwargs=True) class Cohere2Config(PreTrainedConfig): r""" logit_scale (`float`, *optional*, defaults to 0.0625): @@ -60,75 +63,45 @@ class Cohere2Config(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size: int | None = 256000, - hidden_size: int | None = 8192, - intermediate_size: int | None = 22528, - logit_scale: float | None = 0.0625, - num_hidden_layers: int | None = 40, - num_attention_heads: int | None = 64, - num_key_value_heads: int | None = None, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 8192, - initializer_range: float | None = 0.02, - layer_norm_eps: int | None = 1e-5, - use_cache: int | None = True, - pad_token_id: int | None = 0, - bos_token_id: int | None = 5, - eos_token_id: int | None = 255001, - tie_word_embeddings: bool | None = True, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - sliding_window: int | None = 4096, - layer_types: list[str] | None = None, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.logit_scale = logit_scale - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.sliding_window = sliding_window - self.layer_types = layer_types + vocab_size: int = 256000 + hidden_size: int = 8192 + intermediate_size: int = 22528 + logit_scale: float = 0.0625 + num_hidden_layers: int = 40 + num_attention_heads: int = 64 + num_key_value_heads: int | None = None + hidden_act: str = "silu" + max_position_embeddings: int = 8192 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-5 + use_cache: int = True + pad_token_id: int | None = 0 + bos_token_id: int | None = 5 + eos_token_id: int | list[int] | None = 255001 + tie_word_embeddings: bool = True + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + attention_dropout: float | int = 0.0 + sliding_window: int | None = 4096 + layer_types: list[str] | None = None + + def __post_init__(self, **kwargs): + if 
self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads # Need to specify head_dim in the config so it can be used in the attention forward functions - self.head_dim = hidden_size // num_attention_heads - - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings + self.head_dim = self.hidden_size // self.num_attention_heads # BC -> the pattern used to be a simple int, and it's still present in configs on the Hub - self._sliding_window_pattern = kwargs.get("sliding_window_pattern", 4) - if self.layer_types is None: # BC -> the pattern used to be a simple int, and it's still present in configs on the Hub - self._sliding_window_pattern = getattr(self, "sliding_window_pattern", 4) + _sliding_window_pattern = kwargs.pop("sliding_window_pattern", 4) self.layer_types = [ - "sliding_attention" if bool((i + 1) % self._sliding_window_pattern) else "full_attention" + "sliding_attention" if bool((i + 1) % _sliding_window_pattern) else "full_attention" for i in range(self.num_hidden_layers) ] - layer_type_validation(self.layer_types, self.num_hidden_layers) - self.rope_parameters = rope_parameters - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["Cohere2Config"] diff --git a/src/transformers/models/cohere2/modular_cohere2.py b/src/transformers/models/cohere2/modular_cohere2.py index ceff0d1db06f..3dedea65db2c 100644 --- a/src/transformers/models/cohere2/modular_cohere2.py +++ b/src/transformers/models/cohere2/modular_cohere2.py @@ -16,9 +16,10 @@ import torch import torch.nn as nn +from huggingface_hub.dataclasses import strict from ...cache_utils import Cache, DynamicCache -from ...configuration_utils import PreTrainedConfig, layer_type_validation +from ...configuration_utils import PreTrainedConfig from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask from ...modeling_outputs import BaseModelOutputWithPast from ...modeling_rope_utils import ( @@ -46,6 +47,7 @@ @auto_docstring(checkpoint="CohereForAI/c4ai-command-r-v01") +@strict(accept_kwargs=True) class Cohere2Config(PreTrainedConfig): r""" logit_scale (`float`, *optional*, defaults to 0.0625): @@ -82,75 +84,45 @@ class Cohere2Config(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size: int | None = 256000, - hidden_size: int | None = 8192, - intermediate_size: int | None = 22528, - logit_scale: float | None = 0.0625, - num_hidden_layers: int | None = 40, - num_attention_heads: int | None = 64, - num_key_value_heads: int | None = None, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 8192, - initializer_range: float | None = 0.02, - layer_norm_eps: int | None = 1e-5, - use_cache: int | None = True, - pad_token_id: int | None = 0, - bos_token_id: int | None = 5, - eos_token_id: int | None = 255001, - tie_word_embeddings: bool | None = True, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - sliding_window: int | None = 4096, - layer_types: list[str] | None = None, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.logit_scale = logit_scale - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - # for backward 
compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.sliding_window = sliding_window - self.layer_types = layer_types + vocab_size: int = 256000 + hidden_size: int = 8192 + intermediate_size: int = 22528 + logit_scale: float = 0.0625 + num_hidden_layers: int = 40 + num_attention_heads: int = 64 + num_key_value_heads: int | None = None + hidden_act: str = "silu" + max_position_embeddings: int = 8192 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-5 + use_cache: int = True + pad_token_id: int | None = 0 + bos_token_id: int | None = 5 + eos_token_id: int | list[int] | None = 255001 + tie_word_embeddings: bool = True + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + attention_dropout: float | int = 0.0 + sliding_window: int | None = 4096 + layer_types: list[str] | None = None + + def __post_init__(self, **kwargs): + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads # Need to specify head_dim in the config so it can be used in the attention forward functions - self.head_dim = hidden_size // num_attention_heads - - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings + self.head_dim = self.hidden_size // self.num_attention_heads # BC -> the pattern used to be a simple int, and it's still present in configs on the Hub - self._sliding_window_pattern = kwargs.get("sliding_window_pattern", 4) - if self.layer_types is None: # BC -> the pattern used to be a simple int, and it's still present in configs on the Hub - self._sliding_window_pattern = getattr(self, "sliding_window_pattern", 4) + _sliding_window_pattern = kwargs.pop("sliding_window_pattern", 4) self.layer_types = [ - "sliding_attention" if bool((i + 1) % self._sliding_window_pattern) else "full_attention" + "sliding_attention" if bool((i + 1) % _sliding_window_pattern) else "full_attention" for i in range(self.num_hidden_layers) ] - layer_type_validation(self.layer_types, self.num_hidden_layers) - self.rope_parameters = rope_parameters - super().__init__(**kwargs) + super().__post_init__(**kwargs) class Cohere2RotaryEmbedding(CohereRotaryEmbedding): diff --git a/src/transformers/models/cohere2_vision/configuration_cohere2_vision.py b/src/transformers/models/cohere2_vision/configuration_cohere2_vision.py index 5d11c1b54bb7..65e6fdd03954 100644 --- a/src/transformers/models/cohere2_vision/configuration_cohere2_vision.py +++ b/src/transformers/models/cohere2_vision/configuration_cohere2_vision.py @@ -12,12 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
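# Illustrative sketch (editorial, not part of the patch): the backward-compatibility rule that
# Cohere2Config.__post_init__ applies above when `layer_types` is not provided -- the legacy
# integer `sliding_window_pattern` marks every N-th layer as full attention.
def derive_layer_types(num_hidden_layers: int, sliding_window_pattern: int = 4) -> list[str]:
    return [
        "sliding_attention" if bool((i + 1) % sliding_window_pattern) else "full_attention"
        for i in range(num_hidden_layers)
    ]


# With the default pattern of 4, layers 4 and 8 use full attention, the rest sliding attention.
assert derive_layer_types(8) == [
    "sliding_attention", "sliding_attention", "sliding_attention", "full_attention",
    "sliding_attention", "sliding_attention", "sliding_attention", "full_attention",
]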
+ +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring from ..auto import CONFIG_MAPPING, AutoConfig @auto_docstring(checkpoint="CohereLabs/command-a-vision-07-2025") +@strict(accept_kwargs=True) class Cohere2VisionConfig(PreTrainedConfig): r""" downsample_factor (`int`, *optional*, defaults to 2): @@ -29,25 +33,19 @@ class Cohere2VisionConfig(PreTrainedConfig): model_type = "cohere2_vision" sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} - def __init__( - self, - vision_config=None, - text_config=None, - downsample_factor=2, - image_token_id=255036, - alignment_intermediate_size=36864, - tie_word_embeddings=True, - **kwargs, - ): - self.downsample_factor = downsample_factor - self.image_token_id = image_token_id - self.alignment_intermediate_size = alignment_intermediate_size + vision_config: dict | PreTrainedConfig | None = None + text_config: dict | PreTrainedConfig | None = None + downsample_factor: int = 2 + image_token_id: int = 255036 + alignment_intermediate_size: int = 36864 + tie_word_embeddings: bool = True - if isinstance(vision_config, dict): - vision_config["model_type"] = vision_config.get("model_type", "siglip_vision_model") - vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) - elif vision_config is None: - vision_config = CONFIG_MAPPING["siglip_vision_model"]( + def __post_init__(self, **kwargs): + if isinstance(self.vision_config, dict): + self.vision_config["model_type"] = self.vision_config.get("model_type", "siglip_vision_model") + self.vision_config = CONFIG_MAPPING[self.vision_config["model_type"]](**self.vision_config) + elif self.vision_config is None: + self.vision_config = CONFIG_MAPPING["siglip_vision_model"]( hidden_size=1152, intermediate_size=3072, image_size=512, @@ -55,17 +53,13 @@ def __init__( num_attention_heads=12, ) - self.vision_config = vision_config - - if isinstance(text_config, dict): - text_config["model_type"] = text_config.get("model_type", "cohere2") - text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) - elif text_config is None: - text_config = CONFIG_MAPPING["cohere2"](tie_word_embeddings=tie_word_embeddings) + if isinstance(self.text_config, dict): + self.text_config["model_type"] = self.text_config.get("model_type", "cohere2") + self.text_config = CONFIG_MAPPING[self.text_config["model_type"]](**self.text_config) + elif self.text_config is None: + self.text_config = CONFIG_MAPPING["cohere2"](tie_word_embeddings=self.tie_word_embeddings) - self.text_config = text_config - self.tie_word_embeddings = tie_word_embeddings - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["Cohere2VisionConfig"] diff --git a/src/transformers/models/colmodernvbert/configuration_colmodernvbert.py b/src/transformers/models/colmodernvbert/configuration_colmodernvbert.py index a10304b4cece..bf53b6b53bb4 100755 --- a/src/transformers/models/colmodernvbert/configuration_colmodernvbert.py +++ b/src/transformers/models/colmodernvbert/configuration_colmodernvbert.py @@ -18,8 +18,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
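# Illustrative sketch (editorial, not part of the patch): how a dict sub-config is routed
# through a model-type registry in __post_init__, as Cohere2VisionConfig does above with
# CONFIG_MAPPING. `_ToyVisionConfig` and `_TOY_CONFIG_MAPPING` are stand-ins invented here.
from dataclasses import dataclass


@dataclass
class _ToyVisionConfig:
    hidden_size: int = 1152


_TOY_CONFIG_MAPPING = {"toy_vision_model": _ToyVisionConfig}


def resolve_vision_config(vision_config: dict | _ToyVisionConfig | None) -> _ToyVisionConfig:
    if vision_config is None:
        # Fall back to a default config, mirroring the hard-coded defaults used above.
        return _TOY_CONFIG_MAPPING["toy_vision_model"]()
    if isinstance(vision_config, dict):
        model_type = vision_config.get("model_type", "toy_vision_model")
        kwargs = {k: v for k, v in vision_config.items() if k != "model_type"}
        return _TOY_CONFIG_MAPPING[model_type](**kwargs)
    return vision_config


assert resolve_vision_config({"hidden_size": 768}).hidden_size == 768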
-from copy import deepcopy -from typing import Any + +from huggingface_hub.dataclasses import strict  from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring, logging @@ -30,6 +30,7 @@   @auto_docstring(checkpoint="ModernVBERT/colmodernvbert-merged") +@strict(accept_kwargs=True) class ColModernVBertConfig(PreTrainedConfig): r""" Example: @@ -43,39 +44,25 @@ class ColModernVBertConfig(PreTrainedConfig): """  model_type = "colmodernvbert" -    sub_configs: dict[str, Any] = {"vlm_config": PreTrainedConfig} - -    def __init__( -        self, -        vlm_config=None, -        embedding_dim: int = 128, -        initializer_range: float = 0.02, -        **kwargs, -    ): -        if vlm_config is None: -            vlm_config = CONFIG_MAPPING["modernvbert"]() +    sub_configs = {"vlm_config": PreTrainedConfig} + +    vlm_config: dict | PreTrainedConfig | None = None +    embedding_dim: int = 128 +    initializer_range: float = 0.02 + +    def __post_init__(self, **kwargs): +        if self.vlm_config is None: +            self.vlm_config = CONFIG_MAPPING["modernvbert"]()             logger.info(                 "`vlm_config` is `None`. Initializing `vlm_config` with the `ModernVBertConfig` with default values."             ) -        elif isinstance(vlm_config, dict): -            vlm_config = deepcopy(vlm_config) -            if "model_type" not in vlm_config: -                raise KeyError( -                    "The `model_type` key is missing in the `vlm_config` dictionary. Please provide the model type." -                ) -            vlm_config = CONFIG_MAPPING[vlm_config["model_type"]](**vlm_config) -        elif not isinstance(vlm_config, PreTrainedConfig): -            raise TypeError( -                f"Invalid type for `vlm_config`. Expected `PreTrainedConfig`, `dict`, or `None`, but got {type(vlm_config)}." -            ) +        elif isinstance(self.vlm_config, dict): +            self.vlm_config = CONFIG_MAPPING[self.vlm_config["model_type"]](**self.vlm_config)  -        if not hasattr(vlm_config, "vocab_size"): -            vlm_config.vocab_size = vlm_config.get_text_config().vocab_size +        if not hasattr(self.vlm_config, "vocab_size"): +            self.vlm_config.vocab_size = self.vlm_config.get_text_config().vocab_size  -        self.vlm_config = vlm_config -        self.embedding_dim = embedding_dim -        self.initializer_range = initializer_range -        super().__init__(**kwargs) +        super().__post_init__(**kwargs)  def get_text_config(self, *args, **kwargs) -> PreTrainedConfig: return self.vlm_config.get_text_config(*args, **kwargs) diff --git a/src/transformers/models/colmodernvbert/modular_colmodernvbert.py b/src/transformers/models/colmodernvbert/modular_colmodernvbert.py index d67e0c1552b8..4041d90b0570 100755 --- a/src/transformers/models/colmodernvbert/modular_colmodernvbert.py +++ b/src/transformers/models/colmodernvbert/modular_colmodernvbert.py @@ -12,11 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from copy import deepcopy from dataclasses import dataclass -from typing import Any, Optional, Union +from typing import Optional, Union import torch +from huggingface_hub.dataclasses import strict from ...configuration_utils import PreTrainedConfig from ...feature_extraction_utils import BatchFeature @@ -37,6 +37,7 @@ @auto_docstring(checkpoint="ModernVBERT/colmodernvbert-merged") +@strict(accept_kwargs=True) class ColModernVBertConfig(ColQwen2Config): r""" Example: @@ -50,39 +51,25 @@ class ColModernVBertConfig(ColQwen2Config): """ model_type = "colmodernvbert" - sub_configs: dict[str, Any] = {"vlm_config": PreTrainedConfig} + sub_configs = {"vlm_config": PreTrainedConfig} - def __init__( - self, - vlm_config=None, - embedding_dim: int = 128, - initializer_range: float = 0.02, - **kwargs, - ): - if vlm_config is None: - vlm_config = CONFIG_MAPPING["modernvbert"]() + vlm_config: dict | PreTrainedConfig | None = None + embedding_dim: int = 128 + initializer_range: float = 0.02 + + def __post_init__(self, **kwargs): + if self.vlm_config is None: + self.vlm_config = CONFIG_MAPPING["modernvbert"]() logger.info( "`vlm_config` is `None`. Initializing `vlm_config` with the `ModernVBertConfig` with default values." ) - elif isinstance(vlm_config, dict): - vlm_config = deepcopy(vlm_config) - if "model_type" not in vlm_config: - raise KeyError( - "The `model_type` key is missing in the `vlm_config` dictionary. Please provide the model type." - ) - vlm_config = CONFIG_MAPPING[vlm_config["model_type"]](**vlm_config) - elif not isinstance(vlm_config, PreTrainedConfig): - raise TypeError( - f"Invalid type for `vlm_config`. Expected `PreTrainedConfig`, `dict`, or `None`, but got {type(vlm_config)}." - ) + elif isinstance(self.vlm_config, dict): + self.vlm_config = CONFIG_MAPPING[self.vlm_config["model_type"]](**self.vlm_config) - if not hasattr(vlm_config, "vocab_size"): - vlm_config.vocab_size = vlm_config.get_text_config().vocab_size + if not hasattr(self.vlm_config, "vocab_size"): + self.vlm_config.vocab_size = self.vlm_config.get_text_config().vocab_size - self.vlm_config = vlm_config - self.embedding_dim = embedding_dim - self.initializer_range = initializer_range - PreTrainedConfig.__init__(**kwargs) + super().__post_init__(**kwargs) class ColModernVBertProcessorKwargs(Idefics3ProcessorKwargs, total=False): diff --git a/src/transformers/models/colpali/configuration_colpali.py b/src/transformers/models/colpali/configuration_colpali.py index e3d46fe3c406..229744a8c81b 100644 --- a/src/transformers/models/colpali/configuration_colpali.py +++ b/src/transformers/models/colpali/configuration_colpali.py @@ -13,18 +13,18 @@ # limitations under the License. 
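# Illustrative sketch (editorial, not part of the patch): the vocab_size propagation used in
# the ColModernVBert/ColQwen2 __post_init__ above -- if the wrapped VLM config exposes no
# top-level `vocab_size`, copy it from its nested text config. `_Toy*` classes are invented here.
class _ToyTextConfig:
    vocab_size = 151936


class _ToyVlmConfig:
    def get_text_config(self):
        return _ToyTextConfig()


vlm_config = _ToyVlmConfig()
if not hasattr(vlm_config, "vocab_size"):
    vlm_config.vocab_size = vlm_config.get_text_config().vocab_size
assert vlm_config.vocab_size == 151936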
"""ColPali model configuration""" -import logging -from copy import deepcopy +from huggingface_hub.dataclasses import strict from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring +from ...utils import auto_docstring, logging from ..auto import CONFIG_MAPPING, AutoConfig -logger = logging.getLogger(__name__) +logger = logging.get_logger(__name__) @auto_docstring(checkpoint="vidore/colpali-v1.2") +@strict(accept_kwargs=True) class ColPaliConfig(PreTrainedConfig): r""" Example: @@ -40,43 +40,25 @@ class ColPaliConfig(PreTrainedConfig): model_type = "colpali" sub_configs = {"vlm_config": PreTrainedConfig, "text_config": AutoConfig} - def __init__( - self, - vlm_config=None, - text_config=None, - embedding_dim: int = 128, - **kwargs, - ): - if vlm_config is None: - vlm_config = CONFIG_MAPPING["paligemma"]() + vlm_config: dict | PreTrainedConfig | None = None + text_config: dict | PreTrainedConfig | None = None + embedding_dim: int = 128 + + def __post_init__(self, **kwargs): + if self.vlm_config is None: + self.vlm_config = CONFIG_MAPPING["paligemma"]() logger.info( "`vlm_config` is `None`. Initializing `vlm_config` with the `PaliGemmaConfig` with default values." ) - elif isinstance(vlm_config, dict): - vlm_config = deepcopy(vlm_config) - if "model_type" not in vlm_config: - raise KeyError( - "The `model_type` key is missing in the `vlm_config` dictionary. Please provide the model type." - ) - elif vlm_config["model_type"] not in CONFIG_MAPPING: - raise ValueError( - f"The model type `{vlm_config['model_type']}` is not supported. Please provide a valid model type." - ) - vlm_config = CONFIG_MAPPING[vlm_config["model_type"]](**vlm_config) - elif not isinstance(vlm_config, PreTrainedConfig): - raise TypeError( - f"Invalid type for `vlm_config`. Expected `PreTrainedConfig`, `dict`, or `None`, but got {type(vlm_config)}." - ) + elif isinstance(self.vlm_config, dict): + self.vlm_config = CONFIG_MAPPING[self.vlm_config["model_type"]](**self.vlm_config) - self.vlm_config = vlm_config - self.text_config = text_config if text_config is not None else vlm_config.text_config + self.text_config = self.text_config if self.text_config is not None else self.vlm_config.text_config if isinstance(self.text_config, dict): - text_config["model_type"] = text_config.get("model_type", "gemma") - self.text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) - - self.embedding_dim = embedding_dim + self.text_config["model_type"] = self.text_config.get("model_type", "gemma") + self.text_config = CONFIG_MAPPING[self.text_config["model_type"]](**self.text_config) - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["ColPaliConfig"] diff --git a/src/transformers/models/colqwen2/configuration_colqwen2.py b/src/transformers/models/colqwen2/configuration_colqwen2.py index b83cca80c5f1..0f9db21ddd2e 100644 --- a/src/transformers/models/colqwen2/configuration_colqwen2.py +++ b/src/transformers/models/colqwen2/configuration_colqwen2.py @@ -13,8 +13,7 @@ # limitations under the License. 
-from copy import deepcopy -from typing import Any +from huggingface_hub.dataclasses import strict from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring, logging @@ -25,6 +24,7 @@ @auto_docstring(checkpoint="vidore/colqwen2-v1.0-hf") +@strict(accept_kwargs=True) class ColQwen2Config(PreTrainedConfig): r""" Example: @@ -38,39 +38,25 @@ class ColQwen2Config(PreTrainedConfig): """ model_type = "colqwen2" - sub_configs: dict[str, Any] = {"vlm_config": PreTrainedConfig} - - def __init__( - self, - vlm_config=None, - embedding_dim: int = 128, - initializer_range: float = 0.02, - **kwargs, - ): - if vlm_config is None: - vlm_config = CONFIG_MAPPING["qwen2_vl"]() + sub_configs = {"vlm_config": PreTrainedConfig} + + vlm_config: dict | PreTrainedConfig | None = None + embedding_dim: int = 128 + initializer_range: float = 0.02 + + def __post_init__(self, **kwargs): + if self.vlm_config is None: + self.vlm_config = CONFIG_MAPPING["qwen2_vl"]() logger.info( "`vlm_config` is `None`. Initializing `vlm_config` with the `Qwen2VLConfig` with default values." ) - elif isinstance(vlm_config, dict): - vlm_config = deepcopy(vlm_config) - if "model_type" not in vlm_config: - raise KeyError( - "The `model_type` key is missing in the `vlm_config` dictionary. Please provide the model type." - ) - vlm_config = CONFIG_MAPPING[vlm_config["model_type"]](**vlm_config) - elif not isinstance(vlm_config, PreTrainedConfig): - raise TypeError( - f"Invalid type for `vlm_config`. Expected `PreTrainedConfig`, `dict`, or `None`, but got {type(vlm_config)}." - ) + elif isinstance(self.vlm_config, dict): + self.vlm_config = CONFIG_MAPPING[self.vlm_config["model_type"]](**self.vlm_config) - if not hasattr(vlm_config, "vocab_size"): - vlm_config.vocab_size = vlm_config.get_text_config().vocab_size + if not hasattr(self.vlm_config, "vocab_size"): + self.vlm_config.vocab_size = self.vlm_config.get_text_config().vocab_size - self.vlm_config = vlm_config - self.embedding_dim = embedding_dim - self.initializer_range = initializer_range - super().__init__(**kwargs) + super().__post_init__(**kwargs) def get_text_config(self, *args, **kwargs) -> PreTrainedConfig: return self.vlm_config.get_text_config(*args, **kwargs) diff --git a/src/transformers/models/colqwen2/modeling_colqwen2.py b/src/transformers/models/colqwen2/modeling_colqwen2.py index df95aa5fbe53..593a54cc9386 100644 --- a/src/transformers/models/colqwen2/modeling_colqwen2.py +++ b/src/transformers/models/colqwen2/modeling_colqwen2.py @@ -157,7 +157,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict # Custom data preparation to fix an issue with the gradient flow when training with multiple GPUs. 
if inputs_embeds is None: diff --git a/src/transformers/models/colqwen2/modular_colqwen2.py b/src/transformers/models/colqwen2/modular_colqwen2.py index d28367a45857..24df8b0f184b 100644 --- a/src/transformers/models/colqwen2/modular_colqwen2.py +++ b/src/transformers/models/colqwen2/modular_colqwen2.py @@ -298,7 +298,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict # Custom data preparation to fix an issue with the gradient flow when training with multiple GPUs. if inputs_embeds is None: diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index f46e97bafd9c..86013e06c848 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -13,16 +13,16 @@ # limitations under the License. """Conditional DETR model configuration""" +from huggingface_hub.dataclasses import strict + from ...backbone_utils import consolidate_backbone_kwargs_to_config from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging +from ...utils import auto_docstring from ..auto import AutoConfig -logger = logging.get_logger(__name__) - - @auto_docstring(checkpoint="microsoft/conditional-detr-resnet-50") +@strict(accept_kwargs=True) class ConditionalDetrConfig(PreTrainedConfig): r""" auxiliary_loss (`bool`, *optional*, defaults to `False`): @@ -59,56 +59,55 @@ class ConditionalDetrConfig(PreTrainedConfig): attribute_map = { "hidden_size": "d_model", "num_attention_heads": "encoder_attention_heads", + "num_hidden_layers": "encoder_layers", } - def __init__( - self, - backbone_config=None, - num_channels=3, - num_queries=300, - encoder_layers=6, - encoder_ffn_dim=2048, - encoder_attention_heads=8, - decoder_layers=6, - decoder_ffn_dim=2048, - decoder_attention_heads=8, - encoder_layerdrop=0.0, - decoder_layerdrop=0.0, - is_encoder_decoder=True, - activation_function="relu", - d_model=256, - dropout=0.1, - attention_dropout=0.0, - activation_dropout=0.0, - init_std=0.02, - init_xavier_std=1.0, - auxiliary_loss=False, - position_embedding_type="sine", - dilation=False, - class_cost=2, - bbox_cost=5, - giou_cost=2, - mask_loss_coefficient=1, - dice_loss_coefficient=1, - cls_loss_coefficient=2, - bbox_loss_coefficient=5, - giou_loss_coefficient=2, - focal_alpha=0.25, - **kwargs, - ): + backbone_config: dict | PreTrainedConfig | None = None + num_channels: int = 3 + num_queries: int = 300 + encoder_layers: int = 6 + encoder_ffn_dim: int = 2048 + encoder_attention_heads: int = 8 + decoder_layers: int = 6 + decoder_ffn_dim: int = 2048 + decoder_attention_heads: int = 8 + encoder_layerdrop: float | int = 0.0 + decoder_layerdrop: float | int = 0.0 + is_encoder_decoder: bool = True + activation_function: str = "relu" + d_model: int = 256 + dropout: float | int = 0.1 + attention_dropout: float | int = 0.0 + activation_dropout: float | int = 0.0 + init_std: float = 0.02 + init_xavier_std: float = 1.0 + auxiliary_loss: bool = False + position_embedding_type: str = "sine" + dilation: bool = False + class_cost: int = 2 + bbox_cost: int = 5 + giou_cost: int = 2 + mask_loss_coefficient: int = 1 + dice_loss_coefficient: int = 1 
+ cls_loss_coefficient: int = 2 + bbox_loss_coefficient: int = 5 + giou_loss_coefficient: int = 2 + focal_alpha: float = 0.25 + + def __post_init__(self, **kwargs): # Init timm backbone with hardcoded values for BC backbone_kwargs = kwargs.get("backbone_kwargs", {}) timm_default_kwargs = { - "num_channels": backbone_kwargs.get("num_channels", num_channels), + "num_channels": backbone_kwargs.get("num_channels", self.num_channels), "features_only": True, "use_pretrained_backbone": False, "out_indices": backbone_kwargs.get("out_indices", [1, 2, 3, 4]), } - if dilation: + if self.dilation: timm_default_kwargs["output_stride"] = backbone_kwargs.get("output_stride", 16) - backbone_config, kwargs = consolidate_backbone_kwargs_to_config( - backbone_config=backbone_config, + self.backbone_config, kwargs = consolidate_backbone_kwargs_to_config( + backbone_config=self.backbone_config, default_backbone="resnet50", default_config_type="resnet", default_config_kwargs={"out_features": ["stage4"]}, @@ -116,39 +115,7 @@ def __init__( **kwargs, ) - self.backbone_config = backbone_config - self.num_channels = num_channels - self.num_queries = num_queries - self.d_model = d_model - self.encoder_ffn_dim = encoder_ffn_dim - self.encoder_layers = encoder_layers - self.encoder_attention_heads = encoder_attention_heads - self.decoder_ffn_dim = decoder_ffn_dim - self.decoder_layers = decoder_layers - self.decoder_attention_heads = decoder_attention_heads - self.dropout = dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.activation_function = activation_function - self.init_std = init_std - self.init_xavier_std = init_xavier_std - self.encoder_layerdrop = encoder_layerdrop - self.decoder_layerdrop = decoder_layerdrop - self.num_hidden_layers = encoder_layers - self.auxiliary_loss = auxiliary_loss - self.position_embedding_type = position_embedding_type - # Hungarian matcher - self.class_cost = class_cost - self.bbox_cost = bbox_cost - self.giou_cost = giou_cost - # Loss coefficients - self.mask_loss_coefficient = mask_loss_coefficient - self.dice_loss_coefficient = dice_loss_coefficient - self.cls_loss_coefficient = cls_loss_coefficient - self.bbox_loss_coefficient = bbox_loss_coefficient - self.giou_loss_coefficient = giou_loss_coefficient - self.focal_alpha = focal_alpha - super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + super().__post_init__(**kwargs) __all__ = ["ConditionalDetrConfig"] diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 2de83de19c12..44e1893c7b75 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -489,7 +489,7 @@ def __init__( config: ConditionalDetrConfig, hidden_size: int, num_attention_heads: int, - dropout: float = 0.0, + dropout: float | int = 0.0, ): super().__init__() self.config = config @@ -575,7 +575,7 @@ def __init__( config: ConditionalDetrConfig, hidden_size: int, num_attention_heads: int, - dropout: float = 0.0, + dropout: float | int = 0.0, ): super().__init__() self.config = config diff --git a/src/transformers/models/conditional_detr/modular_conditional_detr.py b/src/transformers/models/conditional_detr/modular_conditional_detr.py index c7d2c9f92f1b..1604182aa11d 100644 --- a/src/transformers/models/conditional_detr/modular_conditional_detr.py +++ 
b/src/transformers/models/conditional_detr/modular_conditional_detr.py @@ -253,7 +253,7 @@ def __init__( config: ConditionalDetrConfig, hidden_size: int, num_attention_heads: int, - dropout: float = 0.0, + dropout: float | int = 0.0, ): super().__init__() self.config = config @@ -339,7 +339,7 @@ def __init__( config: ConditionalDetrConfig, hidden_size: int, num_attention_heads: int, - dropout: float = 0.0, + dropout: float | int = 0.0, ): super().__init__() self.config = config diff --git a/src/transformers/models/convbert/configuration_convbert.py b/src/transformers/models/convbert/configuration_convbert.py index be9298ff5c96..d07fc4cc93b0 100644 --- a/src/transformers/models/convbert/configuration_convbert.py +++ b/src/transformers/models/convbert/configuration_convbert.py @@ -13,14 +13,14 @@ # limitations under the License. """ConvBERT model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="YituTech/conv-bert-base") +@strict(accept_kwargs=True) class ConvBertConfig(PreTrainedConfig): r""" head_ratio (`int`, *optional*, defaults to 2): @@ -45,58 +45,29 @@ class ConvBertConfig(PreTrainedConfig): model_type = "convbert" - def __init__( - self, - vocab_size=30522, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=2, - initializer_range=0.02, - layer_norm_eps=1e-12, - pad_token_id=1, - bos_token_id=0, - eos_token_id=2, - embedding_size=768, - head_ratio=2, - conv_kernel_size=9, - num_groups=1, - classifier_dropout=None, - is_decoder=False, - add_cross_attention=False, - tie_word_embeddings=True, - **kwargs, - ): - super().__init__(**kwargs) - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - - self.is_decoder = is_decoder - self.add_cross_attention = add_cross_attention - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.embedding_size = embedding_size - self.head_ratio = head_ratio - self.conv_kernel_size = conv_kernel_size - self.num_groups = num_groups - self.classifier_dropout = classifier_dropout + vocab_size: int = 30522 + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.1 + attention_probs_dropout_prob: float = 0.1 + max_position_embeddings: int = 512 + type_vocab_size: int = 2 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + pad_token_id: int | None = 1 + bos_token_id: int | None = 0 + eos_token_id: int | list[int] | None = 2 + embedding_size: int = 768 + head_ratio: int = 2 + conv_kernel_size: int = 9 + num_groups: 
int = 1 + classifier_dropout: float | int | None = None + is_decoder: bool = False + add_cross_attention: bool = False + tie_word_embeddings: bool = True __all__ = ["ConvBertConfig"] diff --git a/src/transformers/models/convnext/configuration_convnext.py b/src/transformers/models/convnext/configuration_convnext.py index 55e1b158a08d..f343f340efcc 100644 --- a/src/transformers/models/convnext/configuration_convnext.py +++ b/src/transformers/models/convnext/configuration_convnext.py @@ -13,15 +13,15 @@ # limitations under the License. """ConvNeXT model configuration""" +from huggingface_hub.dataclasses import strict + from ...backbone_utils import BackboneConfigMixin from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - - -logger = logging.get_logger(__name__) +from ...utils import auto_docstring @auto_docstring(checkpoint="facebook/convnext-tiny-224") +@strict(accept_kwargs=True) class ConvNextConfig(BackboneConfigMixin, PreTrainedConfig): r""" num_stages (`int`, *optional*, defaults to 4): @@ -43,38 +43,26 @@ class ConvNextConfig(BackboneConfigMixin, PreTrainedConfig): model_type = "convnext" - def __init__( - self, - num_channels=3, - patch_size=4, - num_stages=4, - hidden_sizes=None, - depths=None, - hidden_act="gelu", - initializer_range=0.02, - layer_norm_eps=1e-12, - layer_scale_init_value=1e-6, - drop_path_rate=0.0, - image_size=224, - out_features=None, - out_indices=None, - **kwargs, - ): - super().__init__(**kwargs) + num_channels: int = 3 + patch_size: int | list[int] | tuple[int, int] = 4 + num_stages: int = 4 + hidden_sizes: list[int] | tuple[int, ...] | None = (96, 192, 384, 768) + depths: list[int] | tuple[int, ...] | None = (3, 3, 9, 3) + hidden_act: str = "gelu" + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + layer_scale_init_value: float = 1e-6 + drop_path_rate: float = 0.0 + image_size: int | list[int] | tuple[int, int] = 224 + _out_features: list[str] | None = None + _out_indices: list[int] | None = None - self.num_channels = num_channels - self.patch_size = patch_size - self.num_stages = num_stages - self.hidden_sizes = [96, 192, 384, 768] if hidden_sizes is None else hidden_sizes - self.depths = [3, 3, 9, 3] if depths is None else depths - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.layer_scale_init_value = layer_scale_init_value - self.drop_path_rate = drop_path_rate - self.image_size = image_size + def __post_init__(self, **kwargs): self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(self.depths) + 1)] - self.set_output_features_output_indices(out_indices=out_indices, out_features=out_features) + self.set_output_features_output_indices( + out_indices=kwargs.pop("out_indices", None), out_features=kwargs.pop("out_features", None) + ) + super().__post_init__(**kwargs) __all__ = ["ConvNextConfig"] diff --git a/src/transformers/models/convnextv2/configuration_convnextv2.py b/src/transformers/models/convnextv2/configuration_convnextv2.py index 33a854abad43..6d2c5d9aa1cc 100644 --- a/src/transformers/models/convnextv2/configuration_convnextv2.py +++ b/src/transformers/models/convnextv2/configuration_convnextv2.py @@ -13,15 +13,15 @@ # limitations under the License. 
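# Illustrative sketch (editorial, not part of the patch): the stage-name derivation that
# ConvNextConfig.__post_init__ performs above -- a "stem" entry followed by one stage per
# entry in `depths`.
def derive_stage_names(depths: tuple[int, ...]) -> list[str]:
    return ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)]


assert derive_stage_names((3, 3, 9, 3)) == ["stem", "stage1", "stage2", "stage3", "stage4"]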
"""ConvNeXTV2 model configuration""" +from huggingface_hub.dataclasses import strict + from ...backbone_utils import BackboneConfigMixin from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - - -logger = logging.get_logger(__name__) +from ...utils import auto_docstring @auto_docstring(checkpoint="facebook/convnextv2-tiny-1k-224") +@strict(accept_kwargs=True) class ConvNextV2Config(BackboneConfigMixin, PreTrainedConfig): r""" num_stages (`int`, *optional*, defaults to 4): @@ -43,36 +43,25 @@ class ConvNextV2Config(BackboneConfigMixin, PreTrainedConfig): model_type = "convnextv2" - def __init__( - self, - num_channels=3, - patch_size=4, - num_stages=4, - hidden_sizes=None, - depths=None, - hidden_act="gelu", - initializer_range=0.02, - layer_norm_eps=1e-12, - drop_path_rate=0.0, - image_size=224, - out_features=None, - out_indices=None, - **kwargs, - ): - super().__init__(**kwargs) + num_channels: int = 3 + patch_size: int | list[int] | tuple[int, int] = 4 + num_stages: int = 4 + hidden_sizes: list[int] | tuple[int, ...] | None = (96, 192, 384, 768) + depths: list[int] | tuple[int, ...] | None = (3, 3, 9, 3) + hidden_act: str = "gelu" + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + drop_path_rate: float = 0.0 + image_size: int | list[int] | tuple[int, int] = 224 + _out_features: list[str] | None = None + _out_indices: list[int] | None = None - self.num_channels = num_channels - self.patch_size = patch_size - self.num_stages = num_stages - self.hidden_sizes = [96, 192, 384, 768] if hidden_sizes is None else hidden_sizes - self.depths = [3, 3, 9, 3] if depths is None else depths - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.drop_path_rate = drop_path_rate - self.image_size = image_size + def __post_init__(self, **kwargs): self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(self.depths) + 1)] - self.set_output_features_output_indices(out_indices=out_indices, out_features=out_features) + self.set_output_features_output_indices( + out_indices=kwargs.pop("out_indices", None), out_features=kwargs.pop("out_features", None) + ) + super().__post_init__(**kwargs) __all__ = ["ConvNextV2Config"] diff --git a/src/transformers/models/cpmant/configuration_cpmant.py b/src/transformers/models/cpmant/configuration_cpmant.py index ff5833c7ae71..323890ba5d5c 100644 --- a/src/transformers/models/cpmant/configuration_cpmant.py +++ b/src/transformers/models/cpmant/configuration_cpmant.py @@ -13,14 +13,14 @@ # limitations under the License. 
"""CPMAnt model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="openbmb/cpm-ant-10b") +@strict(accept_kwargs=True) class CpmAntConfig(PreTrainedConfig): r""" position_bias_num_buckets (`int`, *optional*, defaults to 512): @@ -52,43 +52,22 @@ class CpmAntConfig(PreTrainedConfig): model_type = "cpmant" - def __init__( - self, - vocab_size: int = 30720, - hidden_size: int = 4096, - num_attention_heads: int = 32, - dim_head: int = 128, - dim_ff: int = 10240, - num_hidden_layers: int = 48, - dropout_p: int = 0.0, - position_bias_num_buckets: int = 512, - position_bias_max_distance: int = 2048, - eps: int = 1e-6, - init_std: float = 1.0, - prompt_types: int = 32, - prompt_length: int = 32, - segment_types: int = 32, - use_cache: bool = True, - tie_word_embeddings=True, - **kwargs, - ): - super().__init__(**kwargs) - self.tie_word_embeddings = tie_word_embeddings - self.prompt_types = prompt_types - self.prompt_length = prompt_length - self.segment_types = segment_types - self.hidden_size = hidden_size - self.num_attention_heads = num_attention_heads - self.dim_head = dim_head - self.dim_ff = dim_ff - self.num_hidden_layers = num_hidden_layers - self.position_bias_num_buckets = position_bias_num_buckets - self.position_bias_max_distance = position_bias_max_distance - self.dropout_p = dropout_p - self.eps = eps - self.use_cache = use_cache - self.vocab_size = vocab_size - self.init_std = init_std + vocab_size: int = 30720 + hidden_size: int = 4096 + num_attention_heads: int = 32 + dim_head: int = 128 + dim_ff: int = 10240 + num_hidden_layers: int = 48 + dropout_p: float = 0.0 + position_bias_num_buckets: int = 512 + position_bias_max_distance: int = 2048 + eps: float = 1e-6 + init_std: float = 1.0 + prompt_types: int = 32 + prompt_length: int = 32 + segment_types: int = 32 + use_cache: bool = True + tie_word_embeddings: bool = True __all__ = ["CpmAntConfig"] diff --git a/src/transformers/models/cpmant/modeling_cpmant.py b/src/transformers/models/cpmant/modeling_cpmant.py index 17b50785d18d..7bc42f084315 100755 --- a/src/transformers/models/cpmant/modeling_cpmant.py +++ b/src/transformers/models/cpmant/modeling_cpmant.py @@ -598,7 +598,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict use_cache = use_cache if use_cache is not None else self.config.use_cache # add prompts ahead @@ -742,7 +742,7 @@ def forward( ['今天天气不错,阳光明媚,我和妈妈一起去超市买东西。\n在超市里,我看到了一个很好玩的玩具,它的名字叫“机器人”。它有一个圆圆的脑袋,两只圆圆的眼睛,还有一个圆圆的'] ``` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict model_output = self.cpmant( input_ids, diff --git a/src/transformers/models/csm/configuration_csm.py b/src/transformers/models/csm/configuration_csm.py index f68e40d06841..1baf07c7c7cf 100644 --- a/src/transformers/models/csm/configuration_csm.py +++ b/src/transformers/models/csm/configuration_csm.py @@ -13,6 +13,8 @@ # limitations under the License. 
+from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring, logging @@ -23,6 +25,7 @@ @auto_docstring(checkpoint="sesame/csm-1b") +@strict(accept_kwargs=True) class CsmDepthDecoderConfig(PreTrainedConfig): r""" backbone_hidden_size (`int`, *optional*, defaults to 2048): @@ -49,64 +52,41 @@ class CsmDepthDecoderConfig(PreTrainedConfig): } default_theta = 500000.0 - def __init__( - self, - num_codebooks: int | None = 32, - backbone_hidden_size: int | None = 2048, - vocab_size: int | None = 2051, - hidden_size: int | None = 1024, - intermediate_size: int | None = 8192, - num_hidden_layers: int | None = 4, - num_attention_heads: int | None = 8, - num_key_value_heads: int | None = 2, - hidden_act: int | None = "silu", - max_position_embeddings: int | None = 33, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-5, - use_cache: bool | None = True, - pad_token_id: int | None = None, - bos_token_id: int | None = None, - eos_token_id: int | None = None, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - mlp_bias: bool | None = False, - head_dim: int | None = None, - **kwargs, - ): + num_codebooks: int | None = 32 + backbone_hidden_size: int = 2048 + vocab_size: int = 2051 + hidden_size: int = 1024 + intermediate_size: int = 8192 + num_hidden_layers: int = 4 + num_attention_heads: int = 8 + num_key_value_heads: int | None = 2 + hidden_act: str = "silu" + max_position_embeddings: int = 33 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-5 + use_cache: bool = True + pad_token_id: int | None = None + bos_token_id: int | None = None + eos_token_id: int | list[int] | None = None + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + attention_dropout: float | int | None = 0.0 + mlp_bias: bool = False + head_dim: int | None = None + + def __post_init__(self, **kwargs): if kwargs.pop("tie_word_embeddings", False): raise ValueError("`tie_word_embeddings=True` is not supported for CsmDepthDecoderConfig") - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.num_codebooks = num_codebooks - self.vocab_size = vocab_size - self.backbone_hidden_size = backbone_hidden_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.mlp_bias = mlp_bias - self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads - self.rope_parameters = rope_parameters - super().__init__(**kwargs) + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads + self.head_dim = self.head_dim if self.head_dim is not None else self.hidden_size // self.num_attention_heads + super().__post_init__(**kwargs) @auto_docstring(checkpoint="sesame/csm-1b") 
+@strict(accept_kwargs=True) class CsmConfig(PreTrainedConfig): r""" codebook_pad_token_id (`int`, *optional*, defaults to 2050): @@ -149,92 +129,57 @@ class CsmConfig(PreTrainedConfig): "codebook_size": "vocab_size", } - def __init__( - self, - num_codebooks: int | None = 32, - vocab_size: int | None = 2051, - text_vocab_size: int | None = 128256, - hidden_size: int | None = 2048, - intermediate_size: int | None = 8192, - num_hidden_layers: int | None = 16, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = 8, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 2048, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-5, - use_cache: bool | None = True, - pad_token_id: int | None = 128002, - codebook_pad_token_id: int | None = 2050, - codebook_eos_token_id: int | None = 0, - bos_token_id: int | None = 128000, - eos_token_id: int | None = None, - audio_token_id: int | None = 128002, - audio_eos_token_id: int | None = 128003, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - mlp_bias: bool | None = False, - head_dim: int | None = None, - tie_codebooks_embeddings: bool | None = True, - depth_decoder_config: dict | None = None, - codec_config: dict | None = None, - **kwargs, - ): + num_codebooks: int | None = 32 + vocab_size: int = 2051 + text_vocab_size: int = 128256 + hidden_size: int = 2048 + intermediate_size: int = 8192 + num_hidden_layers: int = 16 + num_attention_heads: int = 32 + num_key_value_heads: int | None = 8 + hidden_act: str = "silu" + max_position_embeddings: int = 2048 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-5 + use_cache: bool = True + pad_token_id: int | None = 128002 + codebook_pad_token_id: int | None = 2050 + codebook_eos_token_id: int | list[int] | None = 0 + bos_token_id: int | None = 128000 + eos_token_id: int | list[int] | None = None + audio_token_id: int | None = 128002 + audio_eos_token_id: int | list[int] | None = 128003 + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + attention_dropout: float | int | None = 0.0 + mlp_bias: bool = False + head_dim: int | None = None + tie_codebooks_embeddings: bool | None = True + depth_decoder_config: dict | PreTrainedConfig | None = None + codec_config: dict | PreTrainedConfig | None = None + + def __post_init__(self, **kwargs): if kwargs.pop("tie_word_embeddings", False): raise ValueError("`tie_word_embeddings=True` is not supported for CsmConfig") - if depth_decoder_config is None: + if self.depth_decoder_config is None: self.depth_decoder_config = CsmDepthDecoderConfig() logger.info("depth_decoder_config is None, using default depth decoder config.") - elif isinstance(depth_decoder_config, dict): - self.depth_decoder_config = CsmDepthDecoderConfig(**depth_decoder_config) - elif isinstance(depth_decoder_config, CsmDepthDecoderConfig): - self.depth_decoder_config = depth_decoder_config + elif isinstance(self.depth_decoder_config, dict): + self.depth_decoder_config = CsmDepthDecoderConfig(**self.depth_decoder_config) - if codec_config is None: + if self.codec_config is None: self.codec_config = AutoConfig.for_model("mimi") logger.info("codec_config is None, using default audio encoder config.") - elif isinstance(codec_config, dict): - self.codec_config = AutoConfig.for_model(**codec_config) - elif isinstance(codec_config, PreTrainedConfig): - self.codec_config = codec_config - - 
self.text_vocab_size = text_vocab_size - self.num_codebooks = num_codebooks - self.audio_token_id = audio_token_id - self.audio_eos_token_id = audio_eos_token_id - self.codebook_pad_token_id = codebook_pad_token_id - self.codebook_eos_token_id = codebook_eos_token_id - self.tie_codebooks_embeddings = tie_codebooks_embeddings - - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads + elif isinstance(self.codec_config, dict): + self.codec_config = AutoConfig.for_model(**self.codec_config) - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.mlp_bias = mlp_bias - self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads - self.rope_parameters = rope_parameters - - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads + + self.head_dim = self.head_dim if self.head_dim is not None else self.hidden_size // self.num_attention_heads self.tie_word_embeddings = False - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = [ diff --git a/src/transformers/models/ctrl/configuration_ctrl.py b/src/transformers/models/ctrl/configuration_ctrl.py index f6d4230457bb..3b6c885810d6 100644 --- a/src/transformers/models/ctrl/configuration_ctrl.py +++ b/src/transformers/models/ctrl/configuration_ctrl.py @@ -13,14 +13,14 @@ # limitations under the License. 
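# Illustrative sketch (editorial, not part of the patch): the kwargs validation CsmConfig and
# CsmDepthDecoderConfig run above in __post_init__ -- an unsupported `tie_word_embeddings=True`
# is popped from the leftover kwargs and rejected.
def validate_extra_kwargs(kwargs: dict) -> dict:
    if kwargs.pop("tie_word_embeddings", False):
        raise ValueError("`tie_word_embeddings=True` is not supported for this config")
    return kwargs


try:
    validate_extra_kwargs({"tie_word_embeddings": True})
except ValueError as err:
    print(err)  # -> `tie_word_embeddings=True` is not supported for this config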
"""Salesforce CTRL configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="Salesforce/ctrl") +@strict(accept_kwargs=True) class CTRLConfig(PreTrainedConfig): r""" dff (`int`, *optional*, defaults to 8192): @@ -54,43 +54,21 @@ class CTRLConfig(PreTrainedConfig): "num_hidden_layers": "n_layer", } - def __init__( - self, - vocab_size=246534, - n_positions=256, - n_embd=1280, - dff=8192, - n_layer=48, - n_head=16, - resid_pdrop=0.1, - embd_pdrop=0.1, - layer_norm_epsilon=1e-6, - initializer_range=0.02, - use_cache=True, - pad_token_id=None, - bos_token_id=None, - eos_token_id=None, - tie_word_embeddings=True, - **kwargs, - ): - self.vocab_size = vocab_size - self.n_positions = n_positions - self.n_embd = n_embd - self.n_layer = n_layer - self.n_head = n_head - self.dff = dff - self.resid_pdrop = resid_pdrop - self.embd_pdrop = embd_pdrop - self.layer_norm_epsilon = layer_norm_epsilon - self.initializer_range = initializer_range - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - - self.use_cache = use_cache - - super().__init__(**kwargs) + vocab_size: int = 246534 + n_positions: int = 256 + n_embd: int = 1280 + dff: int = 8192 + n_layer: int = 48 + n_head: int = 16 + resid_pdrop: float = 0.1 + embd_pdrop: float = 0.1 + layer_norm_epsilon: float = 1e-6 + initializer_range: float = 0.02 + use_cache: bool = True + pad_token_id: int | None = None + bos_token_id: int | None = None + eos_token_id: int | list[int] | None = None + tie_word_embeddings: bool = True __all__ = ["CTRLConfig"] diff --git a/src/transformers/models/ctrl/modeling_ctrl.py b/src/transformers/models/ctrl/modeling_ctrl.py index 5e0b819d6fa6..fd677ad37657 100644 --- a/src/transformers/models/ctrl/modeling_ctrl.py +++ b/src/transformers/models/ctrl/modeling_ctrl.py @@ -266,7 +266,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -435,7 +435,7 @@ def forward( >>> list(outputs.logits.shape) [1, 5, 246534] ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict transformer_outputs = self.transformer( input_ids, @@ -610,7 +610,7 @@ def forward( >>> loss.backward() # doctest: +IGNORE_RESULT ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict transformer_outputs = self.transformer( input_ids, diff --git a/src/transformers/models/cvt/configuration_cvt.py b/src/transformers/models/cvt/configuration_cvt.py index 62e936eb101f..9cda2278064c 100644 --- a/src/transformers/models/cvt/configuration_cvt.py +++ b/src/transformers/models/cvt/configuration_cvt.py @@ -13,14 +13,14 @@ # limitations under the License. 
"""CvT model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="microsoft/cvt-13") +@strict(accept_kwargs=True) class CvtConfig(PreTrainedConfig): r""" patch_padding (`list[int]`, *optional*, defaults to `[2, 1, 1]`): @@ -66,53 +66,27 @@ class CvtConfig(PreTrainedConfig): model_type = "cvt" - def __init__( - self, - num_channels=3, - patch_sizes=[7, 3, 3], - patch_stride=[4, 2, 2], - patch_padding=[2, 1, 1], - embed_dim=[64, 192, 384], - num_heads=[1, 3, 6], - depth=[1, 2, 10], - mlp_ratio=[4.0, 4.0, 4.0], - attention_drop_rate=[0.0, 0.0, 0.0], - drop_rate=[0.0, 0.0, 0.0], - drop_path_rate=[0.0, 0.0, 0.1], - qkv_bias=[True, True, True], - cls_token=[False, False, True], - qkv_projection_method=["dw_bn", "dw_bn", "dw_bn"], - kernel_qkv=[3, 3, 3], - padding_kv=[1, 1, 1], - stride_kv=[2, 2, 2], - padding_q=[1, 1, 1], - stride_q=[1, 1, 1], - initializer_range=0.02, - layer_norm_eps=1e-12, - **kwargs, - ): - super().__init__(**kwargs) - self.num_channels = num_channels - self.patch_sizes = patch_sizes - self.patch_stride = patch_stride - self.patch_padding = patch_padding - self.embed_dim = embed_dim - self.num_heads = num_heads - self.depth = depth - self.mlp_ratio = mlp_ratio - self.attention_drop_rate = attention_drop_rate - self.drop_rate = drop_rate - self.drop_path_rate = drop_path_rate - self.qkv_bias = qkv_bias - self.cls_token = cls_token - self.qkv_projection_method = qkv_projection_method - self.kernel_qkv = kernel_qkv - self.padding_kv = padding_kv - self.stride_kv = stride_kv - self.padding_q = padding_q - self.stride_q = stride_q - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps + num_channels: int = 3 + patch_sizes: list[int] | tuple[int, ...] = (7, 3, 3) + patch_stride: list[int] | tuple[int, ...] = (4, 2, 2) + patch_padding: list[int] | tuple[int, ...] = (2, 1, 1) + embed_dim: list[int] | tuple[int, ...] = (64, 192, 384) + num_heads: list[int] | tuple[int, ...] = (1, 3, 6) + depth: list[int] | tuple[int, ...] = (1, 2, 10) + mlp_ratio: list[float] | tuple[float, ...] = (4.0, 4.0, 4.0) + attention_drop_rate: list[float] | tuple[float, ...] = (0.0, 0.0, 0.0) + drop_rate: list[float] | tuple[float, ...] = (0.0, 0.0, 0.0) + drop_path_rate: list[float] | tuple[float, ...] = (0.0, 0.0, 0.1) + qkv_bias: list[bool] | tuple[bool, ...] = (True, True, True) + cls_token: list[bool] | tuple[bool, ...] = (False, False, True) + qkv_projection_method: list[str] | tuple[str, ...] = ("dw_bn", "dw_bn", "dw_bn") + kernel_qkv: list[int] | tuple[int, ...] = (3, 3, 3) + padding_kv: list[int] | tuple[int, ...] = (1, 1, 1) + stride_kv: list[int] | tuple[int, ...] = (2, 2, 2) + padding_q: list[int] | tuple[int, ...] = (1, 1, 1) + stride_q: list[int] | tuple[int, ...] 
= (1, 1, 1) + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 __all__ = ["CvtConfig"] diff --git a/src/transformers/models/cvt/modeling_cvt.py b/src/transformers/models/cvt/modeling_cvt.py index 88de46f81591..3ceccb7d1dab 100644 --- a/src/transformers/models/cvt/modeling_cvt.py +++ b/src/transformers/models/cvt/modeling_cvt.py @@ -530,7 +530,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -588,7 +588,7 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.cvt( pixel_values, output_hidden_states=output_hidden_states, diff --git a/src/transformers/models/cwm/configuration_cwm.py b/src/transformers/models/cwm/configuration_cwm.py index 07e60f1d430f..83c0befde22a 100644 --- a/src/transformers/models/cwm/configuration_cwm.py +++ b/src/transformers/models/cwm/configuration_cwm.py @@ -19,11 +19,14 @@ # limitations under the License. -from ...configuration_utils import PreTrainedConfig, layer_type_validation +from huggingface_hub.dataclasses import strict + +from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring @auto_docstring(checkpoint="facebook/cwm") +@strict(accept_kwargs=True) class CwmConfig(PreTrainedConfig): r""" ```python @@ -56,37 +59,34 @@ class CwmConfig(PreTrainedConfig): "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), "norm": (["hidden_states"], ["hidden_states"]), } + + vocab_size: int = 128256 + hidden_size: int = 6144 + intermediate_size: int = 21504 + num_hidden_layers: int = 64 + num_attention_heads: int = 48 + num_key_value_heads: int = 8 + hidden_act: str = "silu" + max_position_embeddings: int = 131072 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-5 + use_cache: bool = True + pad_token_id: int | None = None + bos_token_id: int = 128000 + eos_token_id: int | list[int] | None = None + pretraining_tp: int = 1 + tie_word_embeddings: bool = False + rope_parameters: dict | None = None + attention_dropout: float | int = 0.0 + mlp_bias: bool = False + head_dim: int = 128 default_theta = 1_000_000.0 + sliding_window: int = 8192 + layer_types: list[str] | None = None # ["full_attention"|"sliding_attention"] per layer - def __init__( - self, - vocab_size: int = 128256, - hidden_size: int = 6144, - intermediate_size: int = 21504, - num_hidden_layers: int = 64, - num_attention_heads: int = 48, - num_key_value_heads: int = 8, - head_dim: int = 128, - hidden_act: str = "silu", - max_position_embeddings: int = 131072, - initializer_range: float = 0.02, - rms_norm_eps: float = 1e-5, - use_cache: bool = True, - pad_token_id: int | None = None, - eos_token_id=[128001, 128008, 128009], - bos_token_id: int = 128000, - tie_word_embeddings: bool = False, - attention_dropout: float = 0.0, - pretraining_tp: int = 1, - mlp_bias: bool = False, - rope_parameters: dict | None = None, - # CWM interleaved sliding window fields - sliding_window: int = 8192, 
- layer_types: list[str] | None = None, # ["full_attention"|"sliding_attention"] per layer - **kwargs, - ): - if rope_parameters is None: - rope_parameters = { + def __post_init__(self, **kwargs): + if self.rope_parameters is None: + self.rope_parameters = { "rope_theta": 1_000_000.0, "factor": 16.0, "high_freq_factor": 4.0, @@ -95,45 +95,31 @@ def __init__( "rope_type": "llama3", } - if layer_types is None: + if self.layer_types is None: # Default pattern: every 4th layer uses full attention, others use sliding attention window_pattern = 4 - layer_types = [ + self.layer_types = [ ("full_attention" if (i % window_pattern == 0) else "sliding_attention") - for i in range(num_hidden_layers) + for i in range(self.num_hidden_layers) ] - else: - layer_type_validation(layer_types, num_hidden_layers) - - self.sliding_window = int(sliding_window) if sliding_window else None - self.layer_types = list(layer_types) - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.pretraining_tp = pretraining_tp - self.use_cache = use_cache - self.attention_dropout = attention_dropout - self.mlp_bias = mlp_bias - self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads - self.rope_parameters = rope_parameters - - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - super().__init__(**kwargs) + + self.sliding_window = int(self.sliding_window) if self.sliding_window else None + self.layer_types = list(self.layer_types) + self.eos_token_id = self.eos_token_id if self.eos_token_id is not None else [128001, 128008, 128009] + if self.head_dim is None: + self.head_dim = self.hidden_size // self.num_attention_heads + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads + + super().__post_init__(**kwargs) + + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + if self.hidden_size % self.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({self.hidden_size}) is not a multiple of the number of attention " + f"heads ({self.num_attention_heads})." 
+ ) __all__ = ["CwmConfig"] diff --git a/src/transformers/models/cwm/modular_cwm.py b/src/transformers/models/cwm/modular_cwm.py index 0f002052f002..a7d872fbfba4 100644 --- a/src/transformers/models/cwm/modular_cwm.py +++ b/src/transformers/models/cwm/modular_cwm.py @@ -14,9 +14,9 @@ import torch +from huggingface_hub.dataclasses import strict from ...cache_utils import Cache, DynamicCache -from ...configuration_utils import layer_type_validation from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask from ...modeling_outputs import BaseModelOutputWithPast from ...processing_utils import Unpack @@ -35,39 +35,39 @@ @auto_docstring(checkpoint="facebook/cwm") +@strict(accept_kwargs=True) class CwmConfig(LlamaConfig): model_type = "cwm" default_theta = 1_000_000.0 - def __init__( - self, - vocab_size: int = 128256, - hidden_size: int = 6144, - intermediate_size: int = 21504, - num_hidden_layers: int = 64, - num_attention_heads: int = 48, - num_key_value_heads: int = 8, - head_dim: int = 128, - hidden_act: str = "silu", - max_position_embeddings: int = 131072, - initializer_range: float = 0.02, - rms_norm_eps: float = 1e-5, - use_cache: bool = True, - pad_token_id: int | None = None, - eos_token_id=[128001, 128008, 128009], - bos_token_id: int = 128000, - tie_word_embeddings: bool = False, - attention_dropout: float = 0.0, - pretraining_tp: int = 1, - mlp_bias: bool = False, - rope_parameters: dict | None = None, - # CWM interleaved sliding window fields - sliding_window: int = 8192, - layer_types: list[str] | None = None, # ["full_attention"|"sliding_attention"] per layer - **kwargs, - ): - if rope_parameters is None: - rope_parameters = { + vocab_size: int = 128256 + hidden_size: int = 6144 + intermediate_size: int = 21504 + num_hidden_layers: int = 64 + num_attention_heads: int = 48 + num_key_value_heads: int = 8 + head_dim: int = 128 + hidden_act: str = "silu" + max_position_embeddings: int = 131072 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-5 + use_cache: bool = True + pad_token_id: int | None = None + eos_token_id: int | list[int] | None = None + bos_token_id: int = 128000 + tie_word_embeddings: bool = False + attention_dropout: float | int = 0.0 + pretraining_tp: int = 1 + mlp_bias: bool = False + rope_parameters: dict | None = None + sliding_window: int = 8192 + layer_types: list[str] | None = None # ["full_attention"|"sliding_attention"] per layer + + attention_bias = AttributeError() + + def __post_init__(self, **kwargs): + if self.rope_parameters is None: + self.rope_parameters = { "rope_theta": 1_000_000.0, "factor": 16.0, "high_freq_factor": 4.0, @@ -76,46 +76,18 @@ def __init__( "rope_type": "llama3", } - if layer_types is None: + if self.layer_types is None: # Default pattern: every 4th layer uses full attention, others use sliding attention window_pattern = 4 - layer_types = [ + self.layer_types = [ ("full_attention" if (i % window_pattern == 0) else "sliding_attention") - for i in range(num_hidden_layers) + for i in range(self.num_hidden_layers) ] - else: - layer_type_validation(layer_types, num_hidden_layers) - - self.sliding_window = int(sliding_window) if sliding_window else None - self.layer_types = list(layer_types) - - super().__init__( - vocab_size=vocab_size, - hidden_size=hidden_size, - intermediate_size=intermediate_size, - num_hidden_layers=num_hidden_layers, - num_attention_heads=num_attention_heads, - num_key_value_heads=num_key_value_heads, - head_dim=head_dim, - hidden_act=hidden_act, - 
max_position_embeddings=max_position_embeddings, - initializer_range=initializer_range, - rms_norm_eps=rms_norm_eps, - use_cache=use_cache, - pad_token_id=pad_token_id, - eos_token_id=list(eos_token_id), - bos_token_id=bos_token_id, - tie_word_embeddings=tie_word_embeddings, - attention_bias=False, - attention_dropout=attention_dropout, - rope_parameters=rope_parameters, - pretraining_tp=pretraining_tp, - mlp_bias=mlp_bias, - **kwargs, - ) - # CWM models don't use attention bias, remove it from config - del self.attention_bias + self.sliding_window = int(self.sliding_window) if self.sliding_window else None + self.layer_types = list(self.layer_types) + self.eos_token_id = self.eos_token_id if self.eos_token_id is not None else [128001, 128008, 128009] + super().__post_init__(**kwargs) class CwmRotaryEmbedding(Qwen2RotaryEmbedding): diff --git a/src/transformers/models/d_fine/configuration_d_fine.py b/src/transformers/models/d_fine/configuration_d_fine.py index 64c2dae9d5f5..e531093d9eb3 100644 --- a/src/transformers/models/d_fine/configuration_d_fine.py +++ b/src/transformers/models/d_fine/configuration_d_fine.py @@ -17,6 +17,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from huggingface_hub.dataclasses import strict + from ...backbone_utils import consolidate_backbone_kwargs_to_config from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring @@ -26,6 +28,7 @@ # TODO: Attribute map assignment logic should be fixed in modular # as well as super() call parsing because otherwise we cannot re-write args after initialization @auto_docstring(checkpoint="ustc-community/dfine-xlarge-coco") +@strict(accept_kwargs=True) class DFineConfig(PreTrainedConfig): """ initializer_bias_prior_prob (`float`, *optional*): @@ -144,169 +147,100 @@ class DFineConfig(PreTrainedConfig): "num_attention_heads": "encoder_attention_heads", } - def __init__( - self, - initializer_range=0.01, - initializer_bias_prior_prob=None, - layer_norm_eps=1e-5, - batch_norm_eps=1e-5, - # backbone - backbone_config=None, - freeze_backbone_batch_norms=True, - # encoder HybridEncoder - encoder_hidden_dim=256, - encoder_in_channels=[512, 1024, 2048], - feat_strides=[8, 16, 32], - encoder_layers=1, - encoder_ffn_dim=1024, - encoder_attention_heads=8, - dropout=0.0, - activation_dropout=0.0, - encode_proj_layers=[2], - positional_encoding_temperature=10000, - encoder_activation_function="gelu", - activation_function="silu", - eval_size=None, - normalize_before=False, - hidden_expansion=1.0, - # decoder DFineTransformer - d_model=256, - num_queries=300, - decoder_in_channels=[256, 256, 256], - decoder_ffn_dim=1024, - num_feature_levels=3, - decoder_n_points=4, - decoder_layers=6, - decoder_attention_heads=8, - decoder_activation_function="relu", - attention_dropout=0.0, - num_denoising=100, - label_noise_ratio=0.5, - box_noise_scale=1.0, - learn_initial_query=False, - anchor_image_size=None, - with_box_refine=True, - is_encoder_decoder=True, - # Loss - matcher_alpha=0.25, - matcher_gamma=2.0, - matcher_class_cost=2.0, - matcher_bbox_cost=5.0, - matcher_giou_cost=2.0, - use_focal_loss=True, - auxiliary_loss=True, - focal_loss_alpha=0.75, - focal_loss_gamma=2.0, - weight_loss_vfl=1.0, - weight_loss_bbox=5.0, - weight_loss_giou=2.0, - weight_loss_fgl=0.15, - weight_loss_ddf=1.5, - eos_coefficient=1e-4, - eval_idx=-1, - layer_scale=1, - max_num_bins=32, - reg_scale=4.0, - 
depth_mult=1.0, - top_prob_values=4, - lqe_hidden_dim=64, - lqe_layers=2, - decoder_offset_scale=0.5, - decoder_method="default", - up=0.5, - tie_word_embeddings=True, - **kwargs, - ): - self.initializer_range = initializer_range - self.initializer_bias_prior_prob = initializer_bias_prior_prob - self.layer_norm_eps = layer_norm_eps - self.batch_norm_eps = batch_norm_eps + initializer_range: float = 0.01 + initializer_bias_prior_prob: float | None = None + layer_norm_eps: float = 1e-5 + batch_norm_eps: float = 1e-5 + backbone_config: dict | PreTrainedConfig | None = None + freeze_backbone_batch_norms: bool = True + + # encoder HybridEncoder + encoder_hidden_dim: int = 256 + encoder_in_channels: list[int] | tuple[int, ...] = (512, 1024, 2048) + feat_strides: list[int] | tuple[int, ...] = (8, 16, 32) + encoder_layers: int = 1 + encoder_ffn_dim: int = 1024 + encoder_attention_heads: int = 8 + dropout: float | int = 0.0 + activation_dropout: float | int = 0.0 + encode_proj_layers: list[int] | tuple[int, ...] = (2,) + positional_encoding_temperature: int = 10000 + encoder_activation_function: str = "gelu" + activation_function: str = "silu" + eval_size: int | None = None + normalize_before: bool = False + hidden_expansion: float = 1.0 + + # decoder DFineTransformer + d_model: int = 256 + num_queries: int = 300 + decoder_in_channels: list[int] | tuple[int, ...] = (256, 256, 256) + decoder_ffn_dim: int = 1024 + num_feature_levels: int = 3 + decoder_n_points: int | list[int] = 4 + decoder_layers: int = 6 + decoder_attention_heads: int = 8 + decoder_activation_function: str = "relu" + attention_dropout: float | int = 0.0 + num_denoising: int = 100 + label_noise_ratio: float = 0.5 + box_noise_scale: float = 1.0 + learn_initial_query: bool = False + anchor_image_size: int | list[int] | None = None + with_box_refine: bool = True - backbone_config, kwargs = consolidate_backbone_kwargs_to_config( - backbone_config=backbone_config, + # Loss + matcher_alpha: float = 0.25 + matcher_gamma: float = 2.0 + matcher_class_cost: float = 2.0 + matcher_bbox_cost: float = 5.0 + matcher_giou_cost: float = 2.0 + use_focal_loss: bool = True + auxiliary_loss: bool = True + focal_loss_alpha: float = 0.75 + focal_loss_gamma: float = 2.0 + weight_loss_vfl: float = 1.0 + weight_loss_bbox: float = 5.0 + weight_loss_giou: float = 2.0 + weight_loss_fgl: float = 0.15 + weight_loss_ddf: float = 1.5 + eos_coefficient: float = 1e-4 + eval_idx: int = -1 + layer_scale: int | float = 1.0 + max_num_bins: int = 32 + reg_scale: float = 4.0 + depth_mult: float = 1.0 + top_prob_values: int = 4 + lqe_hidden_dim: int = 64 + lqe_layers: int = 2 + decoder_offset_scale: float = 0.5 + decoder_method: str = "default" + up: float = 0.5 + tie_word_embeddings: bool = True + is_encoder_decoder: bool = True + + def __post_init__(self, **kwargs): + self.backbone_config, kwargs = consolidate_backbone_kwargs_to_config( + backbone_config=self.backbone_config, default_config_type="hgnet_v2", default_config_kwargs={"out_indices": [2, 3, 4]}, **kwargs, ) + self.head_dim = self.d_model // self.decoder_attention_heads + super().__post_init__(**kwargs) - self.backbone_config = backbone_config - self.freeze_backbone_batch_norms = freeze_backbone_batch_norms - # encoder - self.encoder_hidden_dim = encoder_hidden_dim - self.encoder_in_channels = encoder_in_channels - self.feat_strides = feat_strides - self.encoder_attention_heads = encoder_attention_heads - self.encoder_ffn_dim = encoder_ffn_dim - self.dropout = dropout - self.activation_dropout = 
activation_dropout - self.encode_proj_layers = encode_proj_layers - self.encoder_layers = encoder_layers - self.positional_encoding_temperature = positional_encoding_temperature - self.eval_size = eval_size - self.normalize_before = normalize_before - self.encoder_activation_function = encoder_activation_function - self.activation_function = activation_function - self.hidden_expansion = hidden_expansion - # decoder - self.d_model = d_model - self.num_queries = num_queries - self.decoder_ffn_dim = decoder_ffn_dim - self.decoder_in_channels = decoder_in_channels - self.num_feature_levels = num_feature_levels - self.decoder_n_points = decoder_n_points - self.decoder_layers = decoder_layers - self.decoder_attention_heads = decoder_attention_heads - self.decoder_activation_function = decoder_activation_function - self.attention_dropout = attention_dropout - self.num_denoising = num_denoising - self.label_noise_ratio = label_noise_ratio - self.box_noise_scale = box_noise_scale - self.learn_initial_query = learn_initial_query - self.anchor_image_size = anchor_image_size - self.auxiliary_loss = auxiliary_loss - self.with_box_refine = with_box_refine - # Loss - self.matcher_alpha = matcher_alpha - self.matcher_gamma = matcher_gamma - self.matcher_class_cost = matcher_class_cost - self.matcher_bbox_cost = matcher_bbox_cost - self.matcher_giou_cost = matcher_giou_cost - self.use_focal_loss = use_focal_loss - self.focal_loss_alpha = focal_loss_alpha - self.focal_loss_gamma = focal_loss_gamma - self.weight_loss_vfl = weight_loss_vfl - self.weight_loss_bbox = weight_loss_bbox - self.weight_loss_giou = weight_loss_giou - self.weight_loss_fgl = weight_loss_fgl - self.weight_loss_ddf = weight_loss_ddf - self.eos_coefficient = eos_coefficient - # add the new attributes with the given values or defaults - self.eval_idx = eval_idx - self.layer_scale = layer_scale - self.max_num_bins = max_num_bins - self.reg_scale = reg_scale - self.depth_mult = depth_mult - self.decoder_offset_scale = decoder_offset_scale - self.decoder_method = decoder_method - self.top_prob_values = top_prob_values - self.lqe_hidden_dim = lqe_hidden_dim - self.lqe_layers = lqe_layers - self.up = up - self.tie_word_embeddings = tie_word_embeddings - + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" if isinstance(self.decoder_n_points, list): if len(self.decoder_n_points) != self.num_feature_levels: raise ValueError( f"Length of decoder_n_points list ({len(self.decoder_n_points)}) must match num_feature_levels ({self.num_feature_levels})." ) - head_dim = self.d_model // self.decoder_attention_heads - if head_dim * self.decoder_attention_heads != self.d_model: + if self.head_dim * self.decoder_attention_heads != self.d_model: raise ValueError( f"Embedded dimension {self.d_model} must be divisible by decoder_attention_heads {self.decoder_attention_heads}" ) - super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) - __all__ = ["DFineConfig"] diff --git a/src/transformers/models/d_fine/modular_d_fine.py b/src/transformers/models/d_fine/modular_d_fine.py index b10468062529..1752d821828e 100644 --- a/src/transformers/models/d_fine/modular_d_fine.py +++ b/src/transformers/models/d_fine/modular_d_fine.py @@ -16,6 +16,7 @@ import torch import torch.nn as nn import torch.nn.functional as F +from huggingface_hub.dataclasses import strict from ... 
import initialization as init from ...activations import ACT2CLS @@ -50,6 +51,7 @@ # TODO: Attribute map assignment logic should be fixed in modular # as well as super() call parsing because otherwise we cannot re-write args after initialization @auto_docstring(checkpoint="ustc-community/dfine-xlarge-coco") +@strict(accept_kwargs=True) class DFineConfig(PreTrainedConfig): """ initializer_bias_prior_prob (`float`, *optional*): @@ -168,170 +170,101 @@ class DFineConfig(PreTrainedConfig): "num_attention_heads": "encoder_attention_heads", } - def __init__( - self, - initializer_range=0.01, - initializer_bias_prior_prob=None, - layer_norm_eps=1e-5, - batch_norm_eps=1e-5, - # backbone - backbone_config=None, - freeze_backbone_batch_norms=True, - # encoder HybridEncoder - encoder_hidden_dim=256, - encoder_in_channels=[512, 1024, 2048], - feat_strides=[8, 16, 32], - encoder_layers=1, - encoder_ffn_dim=1024, - encoder_attention_heads=8, - dropout=0.0, - activation_dropout=0.0, - encode_proj_layers=[2], - positional_encoding_temperature=10000, - encoder_activation_function="gelu", - activation_function="silu", - eval_size=None, - normalize_before=False, - hidden_expansion=1.0, - # decoder DFineTransformer - d_model=256, - num_queries=300, - decoder_in_channels=[256, 256, 256], - decoder_ffn_dim=1024, - num_feature_levels=3, - decoder_n_points=4, - decoder_layers=6, - decoder_attention_heads=8, - decoder_activation_function="relu", - attention_dropout=0.0, - num_denoising=100, - label_noise_ratio=0.5, - box_noise_scale=1.0, - learn_initial_query=False, - anchor_image_size=None, - with_box_refine=True, - is_encoder_decoder=True, - # Loss - matcher_alpha=0.25, - matcher_gamma=2.0, - matcher_class_cost=2.0, - matcher_bbox_cost=5.0, - matcher_giou_cost=2.0, - use_focal_loss=True, - auxiliary_loss=True, - focal_loss_alpha=0.75, - focal_loss_gamma=2.0, - weight_loss_vfl=1.0, - weight_loss_bbox=5.0, - weight_loss_giou=2.0, - weight_loss_fgl=0.15, - weight_loss_ddf=1.5, - eos_coefficient=1e-4, - eval_idx=-1, - layer_scale=1, - max_num_bins=32, - reg_scale=4.0, - depth_mult=1.0, - top_prob_values=4, - lqe_hidden_dim=64, - lqe_layers=2, - decoder_offset_scale=0.5, - decoder_method="default", - up=0.5, - tie_word_embeddings=True, - **kwargs, - ): - self.initializer_range = initializer_range - self.initializer_bias_prior_prob = initializer_bias_prior_prob - self.layer_norm_eps = layer_norm_eps - self.batch_norm_eps = batch_norm_eps - - backbone_config, kwargs = consolidate_backbone_kwargs_to_config( - backbone_config=backbone_config, + initializer_range: float = 0.01 + initializer_bias_prior_prob: float | None = None + layer_norm_eps: float = 1e-5 + batch_norm_eps: float = 1e-5 + backbone_config: dict | PreTrainedConfig | None = None + freeze_backbone_batch_norms: bool = True + + # encoder HybridEncoder + encoder_hidden_dim: int = 256 + encoder_in_channels: list[int] | tuple[int, ...] = (512, 1024, 2048) + feat_strides: list[int] | tuple[int, ...] = (8, 16, 32) + encoder_layers: int = 1 + encoder_ffn_dim: int = 1024 + encoder_attention_heads: int = 8 + dropout: float | int = 0.0 + activation_dropout: float | int = 0.0 + encode_proj_layers: list[int] | tuple[int, ...] 
= (2,) + positional_encoding_temperature: int = 10000 + encoder_activation_function: str = "gelu" + activation_function: str = "silu" + eval_size: int | None = None + normalize_before: bool = False + hidden_expansion: float = 1.0 + + # decoder DFineTransformer + d_model: int = 256 + num_queries: int = 300 + decoder_in_channels: list[int] | tuple[int, ...] = (256, 256, 256) + decoder_ffn_dim: int = 1024 + num_feature_levels: int = 3 + decoder_n_points: int | list[int] = 4 + decoder_layers: int = 6 + decoder_attention_heads: int = 8 + decoder_activation_function: str = "relu" + attention_dropout: float | int = 0.0 + num_denoising: int = 100 + label_noise_ratio: float = 0.5 + box_noise_scale: float = 1.0 + learn_initial_query: bool = False + anchor_image_size: int | list[int] | None = None + with_box_refine: bool = True + + # Loss + matcher_alpha: float = 0.25 + matcher_gamma: float = 2.0 + matcher_class_cost: float = 2.0 + matcher_bbox_cost: float = 5.0 + matcher_giou_cost: float = 2.0 + use_focal_loss: bool = True + auxiliary_loss: bool = True + focal_loss_alpha: float = 0.75 + focal_loss_gamma: float = 2.0 + weight_loss_vfl: float = 1.0 + weight_loss_bbox: float = 5.0 + weight_loss_giou: float = 2.0 + weight_loss_fgl: float = 0.15 + weight_loss_ddf: float = 1.5 + eos_coefficient: float = 1e-4 + eval_idx: int = -1 + layer_scale: int | float = 1.0 + max_num_bins: int = 32 + reg_scale: float = 4.0 + depth_mult: float = 1.0 + top_prob_values: int = 4 + lqe_hidden_dim: int = 64 + lqe_layers: int = 2 + decoder_offset_scale: float = 0.5 + decoder_method: str = "default" + up: float = 0.5 + tie_word_embeddings: bool = True + is_encoder_decoder: bool = True + + def __post_init__(self, **kwargs): + self.backbone_config, kwargs = consolidate_backbone_kwargs_to_config( + backbone_config=self.backbone_config, default_config_type="hgnet_v2", default_config_kwargs={"out_indices": [2, 3, 4]}, **kwargs, ) + self.head_dim = self.d_model // self.decoder_attention_heads + super().__post_init__(**kwargs) - self.backbone_config = backbone_config - self.freeze_backbone_batch_norms = freeze_backbone_batch_norms - # encoder - self.encoder_hidden_dim = encoder_hidden_dim - self.encoder_in_channels = encoder_in_channels - self.feat_strides = feat_strides - self.encoder_attention_heads = encoder_attention_heads - self.encoder_ffn_dim = encoder_ffn_dim - self.dropout = dropout - self.activation_dropout = activation_dropout - self.encode_proj_layers = encode_proj_layers - self.encoder_layers = encoder_layers - self.positional_encoding_temperature = positional_encoding_temperature - self.eval_size = eval_size - self.normalize_before = normalize_before - self.encoder_activation_function = encoder_activation_function - self.activation_function = activation_function - self.hidden_expansion = hidden_expansion - # decoder - self.d_model = d_model - self.num_queries = num_queries - self.decoder_ffn_dim = decoder_ffn_dim - self.decoder_in_channels = decoder_in_channels - self.num_feature_levels = num_feature_levels - self.decoder_n_points = decoder_n_points - self.decoder_layers = decoder_layers - self.decoder_attention_heads = decoder_attention_heads - self.decoder_activation_function = decoder_activation_function - self.attention_dropout = attention_dropout - self.num_denoising = num_denoising - self.label_noise_ratio = label_noise_ratio - self.box_noise_scale = box_noise_scale - self.learn_initial_query = learn_initial_query - self.anchor_image_size = anchor_image_size - self.auxiliary_loss = auxiliary_loss - 
self.with_box_refine = with_box_refine - # Loss - self.matcher_alpha = matcher_alpha - self.matcher_gamma = matcher_gamma - self.matcher_class_cost = matcher_class_cost - self.matcher_bbox_cost = matcher_bbox_cost - self.matcher_giou_cost = matcher_giou_cost - self.use_focal_loss = use_focal_loss - self.focal_loss_alpha = focal_loss_alpha - self.focal_loss_gamma = focal_loss_gamma - self.weight_loss_vfl = weight_loss_vfl - self.weight_loss_bbox = weight_loss_bbox - self.weight_loss_giou = weight_loss_giou - self.weight_loss_fgl = weight_loss_fgl - self.weight_loss_ddf = weight_loss_ddf - self.eos_coefficient = eos_coefficient - # add the new attributes with the given values or defaults - self.eval_idx = eval_idx - self.layer_scale = layer_scale - self.max_num_bins = max_num_bins - self.reg_scale = reg_scale - self.depth_mult = depth_mult - self.decoder_offset_scale = decoder_offset_scale - self.decoder_method = decoder_method - self.top_prob_values = top_prob_values - self.lqe_hidden_dim = lqe_hidden_dim - self.lqe_layers = lqe_layers - self.up = up - self.tie_word_embeddings = tie_word_embeddings - + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" if isinstance(self.decoder_n_points, list): if len(self.decoder_n_points) != self.num_feature_levels: raise ValueError( f"Length of decoder_n_points list ({len(self.decoder_n_points)}) must match num_feature_levels ({self.num_feature_levels})." ) - head_dim = self.d_model // self.decoder_attention_heads - if head_dim * self.decoder_attention_heads != self.d_model: + if self.head_dim * self.decoder_attention_heads != self.d_model: raise ValueError( f"Embedded dimension {self.d_model} must be divisible by decoder_attention_heads {self.decoder_attention_heads}" ) - super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) - class DFineDecoderOutput(RTDetrDecoderOutput): pass diff --git a/src/transformers/models/dab_detr/configuration_dab_detr.py b/src/transformers/models/dab_detr/configuration_dab_detr.py index 3ad7fee61efa..7cd1c1a7e7e3 100644 --- a/src/transformers/models/dab_detr/configuration_dab_detr.py +++ b/src/transformers/models/dab_detr/configuration_dab_detr.py @@ -13,16 +13,16 @@ # limitations under the License. """DAB-DETR model configuration""" +from huggingface_hub.dataclasses import strict + from ...backbone_utils import consolidate_backbone_kwargs_to_config from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging +from ...utils import auto_docstring from ..auto import AutoConfig -logger = logging.get_logger(__name__) - - @auto_docstring(checkpoint="IDEA-Research/dab-detr-resnet-50") +@strict(accept_kwargs=True) class DabDetrConfig(PreTrainedConfig): r""" num_queries (`int`, *optional*, defaults to 300): @@ -50,7 +50,6 @@ class DabDetrConfig(PreTrainedConfig): The prior probability used by the bias initializer to initialize biases for `enc_score_head` and `class_embed`. If `None`, `prior_prob` computed as `prior_prob = 1 / (num_labels + 1)` while initializing model weights. 
- Examples: ```python @@ -71,50 +70,46 @@ class DabDetrConfig(PreTrainedConfig): keys_to_ignore_at_inference = ["past_key_values"] attribute_map = { "num_attention_heads": "encoder_attention_heads", + "num_hidden_layers": "encoder_layers", } - def __init__( - self, - backbone_config=None, - num_queries=300, - encoder_layers=6, - encoder_ffn_dim=2048, - encoder_attention_heads=8, - decoder_layers=6, - decoder_ffn_dim=2048, - decoder_attention_heads=8, - is_encoder_decoder=True, - activation_function="prelu", - hidden_size=256, - dropout=0.1, - attention_dropout=0.0, - activation_dropout=0.0, - init_std=0.02, - init_xavier_std=1.0, - auxiliary_loss=False, - dilation=False, - class_cost=2, - bbox_cost=5, - giou_cost=2, - cls_loss_coefficient=2, - bbox_loss_coefficient=5, - giou_loss_coefficient=2, - focal_alpha=0.25, - temperature_height=20, - temperature_width=20, - query_dim=4, - random_refpoints_xy=False, - keep_query_pos=False, - num_patterns=0, - normalize_before=False, - sine_position_embedding_scale=None, - initializer_bias_prior_prob=None, - tie_word_embeddings=True, - **kwargs, - ): - if query_dim != 4: - raise ValueError("The query dimensions has to be 4.") - + backbone_config: dict | PreTrainedConfig | None = None + num_queries: int = 300 + encoder_layers: int = 6 + encoder_ffn_dim: int = 2048 + encoder_attention_heads: int = 8 + decoder_layers: int = 6 + decoder_ffn_dim: int = 2048 + decoder_attention_heads: int = 8 + is_encoder_decoder: int = True + activation_function: str = "prelu" + hidden_size: int = 256 + dropout: float | int = 0.1 + attention_dropout: float | int = 0.0 + activation_dropout: float | int = 0.0 + init_std: float = 0.02 + init_xavier_std: float = 1.0 + auxiliary_loss: bool = False + dilation: bool = False + class_cost: int = 2 + bbox_cost: int = 5 + giou_cost: int = 2 + cls_loss_coefficient: int = 2 + bbox_loss_coefficient: int = 5 + giou_loss_coefficient: int = 2 + focal_alpha: float = 0.25 + temperature_height: int = 20 + temperature_width: int = 20 + query_dim: int = 4 + random_refpoints_xy: bool = False + keep_query_pos: bool = False + num_patterns: int = 0 + normalize_before: bool = False + sine_position_embedding_scale: float | None = None + initializer_bias_prior_prob: float | None = None + tie_word_embeddings: bool = True + + def __post_init__(self, **kwargs): # Init timm backbone with hardcoded values for BC timm_default_kwargs = { "num_channels": 3, @@ -122,11 +117,11 @@ def __init__( "use_pretrained_backbone": False, "out_indices": [1, 2, 3, 4], } - if dilation: + if self.dilation: timm_default_kwargs["output_stride"] = 16 - backbone_config, kwargs = consolidate_backbone_kwargs_to_config( - backbone_config=backbone_config, + self.backbone_config, kwargs = consolidate_backbone_kwargs_to_config( + backbone_config=self.backbone_config, default_backbone="resnet50", default_config_type="resnet50", default_config_kwargs={"out_features": ["stage4"]}, @@ -134,44 +129,12 @@ def __init__( **kwargs, ) - self.backbone_config = backbone_config - self.num_queries = num_queries - self.hidden_size = hidden_size - self.encoder_ffn_dim = encoder_ffn_dim - self.encoder_layers = encoder_layers - self.encoder_attention_heads = encoder_attention_heads - self.decoder_ffn_dim = decoder_ffn_dim - self.decoder_layers = decoder_layers - self.decoder_attention_heads = decoder_attention_heads - self.dropout = dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.activation_function = activation_function - self.init_std = 
init_std - self.init_xavier_std = init_xavier_std - self.num_hidden_layers = encoder_layers - self.auxiliary_loss = auxiliary_loss - # Hungarian matcher - self.class_cost = class_cost - self.bbox_cost = bbox_cost - self.giou_cost = giou_cost - # Loss coefficients - self.cls_loss_coefficient = cls_loss_coefficient - self.bbox_loss_coefficient = bbox_loss_coefficient - self.giou_loss_coefficient = giou_loss_coefficient - self.focal_alpha = focal_alpha - self.query_dim = query_dim - self.random_refpoints_xy = random_refpoints_xy - self.keep_query_pos = keep_query_pos - self.num_patterns = num_patterns - self.normalize_before = normalize_before - self.temperature_width = temperature_width - self.temperature_height = temperature_height - self.sine_position_embedding_scale = sine_position_embedding_scale - self.initializer_bias_prior_prob = initializer_bias_prior_prob - self.tie_word_embeddings = tie_word_embeddings - - super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + super().__post_init__(**kwargs) + + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + if self.query_dim != 4: + raise ValueError("The query dimensions has to be 4.") __all__ = ["DabDetrConfig"] diff --git a/src/transformers/models/dab_detr/modeling_dab_detr.py b/src/transformers/models/dab_detr/modeling_dab_detr.py index 933b2175d2cf..d8fdae89fdbc 100644 --- a/src/transformers/models/dab_detr/modeling_dab_detr.py +++ b/src/transformers/models/dab_detr/modeling_dab_detr.py @@ -915,7 +915,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict hidden_states = inputs_embeds @@ -1045,7 +1045,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if inputs_embeds is not None: hidden_states = inputs_embeds @@ -1267,7 +1267,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict batch_size, _, height, width = pixel_values.shape device = pixel_values.device @@ -1530,7 +1530,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict # First, sent images through DAB_DETR base model to obtain encoder + decoder outputs model_outputs = self.model( diff --git a/src/transformers/models/dac/configuration_dac.py b/src/transformers/models/dac/configuration_dac.py index 551c1964bebc..51d3d9e1a979 100644 --- a/src/transformers/models/dac/configuration_dac.py +++ b/src/transformers/models/dac/configuration_dac.py @@ -16,15 +16,14 @@ import math import numpy as np +from huggingface_hub.dataclasses import strict from ...configuration_utils import 
PreTrainedConfig -from ...utils import auto_docstring, logging - - -logger = logging.get_logger(__name__) +from ...utils import auto_docstring @auto_docstring(checkpoint="descript/dac_16khz") +@strict(accept_kwargs=True) class DacConfig(PreTrainedConfig): r""" downsampling_ratios (`list[int]`, *optional*, defaults to `[2, 4, 8, 8]`): @@ -53,37 +52,22 @@ class DacConfig(PreTrainedConfig): model_type = "dac" - def __init__( - self, - encoder_hidden_size=64, - downsampling_ratios=[2, 4, 8, 8], - decoder_hidden_size=1536, - n_codebooks=9, - codebook_size=1024, - codebook_dim=8, - quantizer_dropout=0, - commitment_loss_weight=0.25, - codebook_loss_weight=1.0, - sampling_rate=16000, - **kwargs, - ): - self.encoder_hidden_size = encoder_hidden_size - self.downsampling_ratios = downsampling_ratios - self.decoder_hidden_size = decoder_hidden_size - self.upsampling_ratios = downsampling_ratios[::-1] - self.n_codebooks = n_codebooks - self.codebook_size = codebook_size - self.codebook_dim = codebook_dim - self.quantizer_dropout = quantizer_dropout - self.sampling_rate = sampling_rate - - self.hidden_size = encoder_hidden_size * (2 ** len(downsampling_ratios)) - - self.hop_length = int(np.prod(downsampling_ratios)) - self.commitment_loss_weight = commitment_loss_weight - self.codebook_loss_weight = codebook_loss_weight - - super().__init__(**kwargs) + encoder_hidden_size: int = 64 + downsampling_ratios: list[int] | tuple[int, ...] = (2, 4, 8, 8) + decoder_hidden_size: int = 1536 + n_codebooks: int = 9 + codebook_size: int = 1024 + codebook_dim: int = 8 + quantizer_dropout: float | int = 0.0 + commitment_loss_weight: float = 0.25 + codebook_loss_weight: float = 1.0 + sampling_rate: int = 16000 + + def __post_init__(self, **kwargs): + self.upsampling_ratios = self.downsampling_ratios[::-1] + self.hidden_size = self.encoder_hidden_size * (2 ** len(self.downsampling_ratios)) + self.hop_length = int(np.prod(self.downsampling_ratios)) + super().__post_init__(**kwargs) @property def frame_rate(self) -> int: diff --git a/src/transformers/models/data2vec/configuration_data2vec_audio.py b/src/transformers/models/data2vec/configuration_data2vec_audio.py index db017247453f..048c5348e189 100644 --- a/src/transformers/models/data2vec/configuration_data2vec_audio.py +++ b/src/transformers/models/data2vec/configuration_data2vec_audio.py @@ -15,14 +15,14 @@ import math -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="facebook/data2vec-audio-base-960h") +@strict(accept_kwargs=True) class Data2VecAudioConfig(PreTrainedConfig): r""" final_dropout (`float`, *optional*, defaults to 0.1): @@ -125,83 +125,58 @@ class Data2VecAudioConfig(PreTrainedConfig): model_type = "data2vec-audio" - def __init__( - self, - vocab_size=32, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout=0.1, - activation_dropout=0.1, - attention_dropout=0.1, - feat_proj_dropout=0.0, - final_dropout=0.1, - layerdrop=0.1, - initializer_range=0.02, - layer_norm_eps=1e-5, - feat_extract_activation="gelu", - conv_dim=(512, 512, 512, 512, 512, 512, 512), - conv_stride=(5, 2, 2, 2, 2, 2, 2), - conv_kernel=(10, 3, 3, 3, 3, 2, 2), - conv_bias=False, - num_conv_pos_embedding_groups=16, - conv_pos_kernel_size=19, - 
num_conv_pos_embeddings=5, - mask_time_prob=0.05, - mask_time_length=10, - mask_time_min_masks=2, - mask_feature_prob=0.0, - mask_feature_length=10, - mask_feature_min_masks=0, - ctc_loss_reduction="sum", - ctc_zero_infinity=False, - use_weighted_layer_sum=False, - classifier_proj_size=256, - tdnn_dim=(512, 512, 512, 512, 1500), - tdnn_kernel=(5, 3, 3, 1, 1), - tdnn_dilation=(1, 2, 3, 1, 1), - xvector_output_dim=512, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - add_adapter=False, - adapter_kernel_size=3, - adapter_stride=2, - num_adapter_layers=3, - output_hidden_size=None, - **kwargs, - ): - super().__init__(**kwargs) - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.hidden_size = hidden_size - self.feat_extract_activation = feat_extract_activation - self.conv_dim = list(conv_dim) - self.conv_stride = list(conv_stride) - self.conv_kernel = list(conv_kernel) - self.conv_bias = conv_bias - self.num_conv_pos_embeddings = num_conv_pos_embeddings - self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups - self.conv_pos_kernel_size = conv_pos_kernel_size + vocab_size: int = 32 + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout: float | int = 0.1 + activation_dropout: float | int = 0.1 + attention_dropout: float | int = 0.1 + feat_proj_dropout: float | int = 0.0 + final_dropout: float | int = 0.1 + layerdrop: float | int = 0.1 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-5 + feat_extract_activation: str = "gelu" + conv_dim: list[int] | tuple[int, ...] = (512, 512, 512, 512, 512, 512, 512) + conv_stride: list[int] | tuple[int, ...] = (5, 2, 2, 2, 2, 2, 2) + conv_kernel: list[int] | tuple[int, ...] = (10, 3, 3, 3, 3, 2, 2) + conv_bias: bool = False + num_conv_pos_embedding_groups: int = 16 + conv_pos_kernel_size: int = 19 + num_conv_pos_embeddings: int = 5 + mask_time_prob: float = 0.05 + mask_time_length: int = 10 + mask_time_min_masks: int = 2 + mask_feature_prob: float = 0.0 + mask_feature_length: int = 10 + mask_feature_min_masks: int = 0 + ctc_loss_reduction: str = "sum" + ctc_zero_infinity: bool = False + use_weighted_layer_sum: bool = False + classifier_proj_size: int = 256 + tdnn_dim: list[int] | tuple[int, ...] = (512, 512, 512, 512, 1500) + tdnn_kernel: list[int] | tuple[int, ...] = (5, 3, 3, 1, 1) + tdnn_dilation: list[int] | tuple[int, ...] 
= (1, 2, 3, 1, 1) + xvector_output_dim: int = 512 + pad_token_id: int | None = 0 + bos_token_id: int | None = 1 + eos_token_id: int | list[int] | None = 2 + add_adapter: bool = False + adapter_kernel_size: int = 3 + adapter_stride: int = 2 + num_adapter_layers: int = 3 + output_hidden_size: int | None = None + + def __post_init__(self, **kwargs): + self.output_hidden_size = self.output_hidden_size or self.hidden_size self.num_feat_extract_layers = len(self.conv_dim) - self.num_hidden_layers = num_hidden_layers - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.num_attention_heads = num_attention_heads - self.hidden_dropout = hidden_dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.feat_proj_dropout = feat_proj_dropout - self.final_dropout = final_dropout - self.layerdrop = layerdrop - self.layer_norm_eps = layer_norm_eps - self.initializer_range = initializer_range - self.vocab_size = vocab_size - self.use_weighted_layer_sum = use_weighted_layer_sum + super().__post_init__(**kwargs) + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" if ( (len(self.conv_stride) != self.num_feat_extract_layers) or (len(self.conv_kernel) != self.num_feat_extract_layers) @@ -214,34 +189,6 @@ def __init__( f" `len(config.conv_kernel) = {len(self.conv_kernel)}`." ) - # fine-tuning config parameters for SpecAugment: https://huggingface.co/papers/1904.08779 - self.mask_time_prob = mask_time_prob - self.mask_time_length = mask_time_length - self.mask_time_min_masks = mask_time_min_masks - self.mask_feature_prob = mask_feature_prob - self.mask_feature_length = mask_feature_length - self.mask_feature_min_masks = mask_feature_min_masks - - # ctc loss - self.ctc_loss_reduction = ctc_loss_reduction - self.ctc_zero_infinity = ctc_zero_infinity - - # adapter - self.add_adapter = add_adapter - self.adapter_kernel_size = adapter_kernel_size - self.adapter_stride = adapter_stride - self.num_adapter_layers = num_adapter_layers - self.output_hidden_size = output_hidden_size or hidden_size - - # SequenceClassification-specific parameter. Feel free to ignore for other classes. - self.classifier_proj_size = classifier_proj_size - - # XVector-specific parameters. Feel free to ignore for other classes. - self.tdnn_dim = list(tdnn_dim) - self.tdnn_kernel = list(tdnn_kernel) - self.tdnn_dilation = list(tdnn_dilation) - self.xvector_output_dim = xvector_output_dim - @property def inputs_to_logits_ratio(self): return math.prod(self.conv_stride) diff --git a/src/transformers/models/data2vec/configuration_data2vec_text.py b/src/transformers/models/data2vec/configuration_data2vec_text.py index 13dad59f5d40..cb5284bc40dd 100644 --- a/src/transformers/models/data2vec/configuration_data2vec_text.py +++ b/src/transformers/models/data2vec/configuration_data2vec_text.py @@ -13,14 +13,14 @@ # limitations under the License. 
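Note on the Data2VecAudioConfig conversion above: a short sketch of the consistency check that moved into `validate_architecture`, assuming that hook runs automatically as part of the `@strict`-powered validation and that this PR's build is installed.

```python
from transformers import Data2VecAudioConfig

cfg = Data2VecAudioConfig()
assert cfg.num_feat_extract_layers == len(cfg.conv_dim)  # derived in __post_init__
assert cfg.output_hidden_size == cfg.hidden_size         # None falls back to hidden_size

# conv_dim / conv_stride / conv_kernel must all have the same length
try:
    Data2VecAudioConfig(conv_stride=(5, 2, 2))           # 3 strides vs. 7 conv dims
except ValueError as err:
    print(err)
```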
"""Data2VecText configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="facebook/data2vec-text-base") +@strict(accept_kwargs=True) class Data2VecTextConfig(PreTrainedConfig): r""" Examples: @@ -40,52 +40,26 @@ class Data2VecTextConfig(PreTrainedConfig): model_type = "data2vec-text" - def __init__( - self, - vocab_size=30522, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=2, - initializer_range=0.02, - layer_norm_eps=1e-12, - pad_token_id=1, - bos_token_id=0, - eos_token_id=2, - use_cache=True, - classifier_dropout=None, - is_decoder=False, - add_cross_attention=False, - tie_word_embeddings=True, - **kwargs, - ): - super().__init__(**kwargs) - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - - self.is_decoder = is_decoder - self.add_cross_attention = add_cross_attention - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.use_cache = use_cache - self.classifier_dropout = classifier_dropout + vocab_size: int = 30522 + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.1 + attention_probs_dropout_prob: float = 0.1 + max_position_embeddings: int = 512 + type_vocab_size: int = 2 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + pad_token_id: int | None = 1 + bos_token_id: int | None = 0 + eos_token_id: int | list[int] | None = 2 + use_cache: bool = True + classifier_dropout: float | int | None = None + is_decoder: bool = False + add_cross_attention: bool = False + tie_word_embeddings: bool = True __all__ = ["Data2VecTextConfig"] diff --git a/src/transformers/models/data2vec/configuration_data2vec_vision.py b/src/transformers/models/data2vec/configuration_data2vec_vision.py index babeb9a284b5..cbbbd6c2c3ec 100644 --- a/src/transformers/models/data2vec/configuration_data2vec_vision.py +++ b/src/transformers/models/data2vec/configuration_data2vec_vision.py @@ -13,14 +13,14 @@ # limitations under the License. 
"""Data2VecVision model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="facebook/data2vec-vision-base") +@strict(accept_kwargs=True) class Data2VecVisionConfig(PreTrainedConfig): r""" use_mask_token (`bool`, *optional*, defaults to `False`): @@ -60,69 +60,33 @@ class Data2VecVisionConfig(PreTrainedConfig): model_type = "data2vec-vision" - def __init__( - self, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - initializer_range=0.02, - layer_norm_eps=1e-12, - image_size=224, - patch_size=16, - num_channels=3, - use_mask_token=False, - use_absolute_position_embeddings=False, - use_relative_position_bias=False, - use_shared_relative_position_bias=False, - layer_scale_init_value=0.1, - drop_path_rate=0.1, - use_mean_pooling=True, - out_indices=[3, 5, 7, 11], - pool_scales=[1, 2, 3, 6], - use_auxiliary_head=True, - auxiliary_loss_weight=0.4, - auxiliary_channels=256, - auxiliary_num_convs=1, - auxiliary_concat_input=False, - semantic_loss_ignore_index=255, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.use_mask_token = use_mask_token - self.use_absolute_position_embeddings = use_absolute_position_embeddings - self.use_relative_position_bias = use_relative_position_bias - self.use_shared_relative_position_bias = use_shared_relative_position_bias - self.layer_scale_init_value = layer_scale_init_value - self.drop_path_rate = drop_path_rate - self.use_mean_pooling = use_mean_pooling - # decode head attributes (semantic segmentation) - self.out_indices = out_indices - self.pool_scales = pool_scales - # auxiliary head attributes (semantic segmentation) - self.use_auxiliary_head = use_auxiliary_head - self.auxiliary_loss_weight = auxiliary_loss_weight - self.auxiliary_channels = auxiliary_channels - self.auxiliary_num_convs = auxiliary_num_convs - self.auxiliary_concat_input = auxiliary_concat_input - self.semantic_loss_ignore_index = semantic_loss_ignore_index + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.0 + attention_probs_dropout_prob: float = 0.0 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + image_size: int | list[int] | tuple[int, int] = 224 + patch_size: int | list[int] | tuple[int, int] = 16 + num_channels: int = 3 + use_mask_token: bool = False + use_absolute_position_embeddings: bool = False + use_relative_position_bias: bool = False + use_shared_relative_position_bias: bool = False + layer_scale_init_value: float = 0.1 + drop_path_rate: float = 0.1 + use_mean_pooling: bool = True + out_indices: list[int] | tuple[int, ...] 
= (3, 5, 7, 11) + pool_scales: list[int] | tuple[int, ...] = (1, 2, 3, 6) + use_auxiliary_head: bool = True + auxiliary_loss_weight: float = 0.4 + auxiliary_channels: int = 256 + auxiliary_num_convs: int = 1 + auxiliary_concat_input: bool = False + semantic_loss_ignore_index: int = 255 __all__ = ["Data2VecVisionConfig"] diff --git a/src/transformers/models/data2vec/modeling_data2vec_audio.py b/src/transformers/models/data2vec/modeling_data2vec_audio.py index a61a44bd9401..180a75757f56 100755 --- a/src/transformers/models/data2vec/modeling_data2vec_audio.py +++ b/src/transformers/models/data2vec/modeling_data2vec_audio.py @@ -760,7 +760,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict extract_features = self.feature_extractor(input_values) extract_features = extract_features.transpose(1, 2) @@ -861,7 +861,7 @@ def forward( All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size - 1]`. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if labels is not None and labels.max() >= self.config.vocab_size: raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") @@ -979,7 +979,7 @@ def forward( `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states outputs = self.data2vec_audio( @@ -1083,7 +1083,7 @@ def forward( `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states outputs = self.data2vec_audio( @@ -1255,7 +1255,7 @@ def forward( `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states outputs = self.data2vec_audio( diff --git a/src/transformers/models/data2vec/modeling_data2vec_vision.py b/src/transformers/models/data2vec/modeling_data2vec_vision.py index b5a3e84b887e..8cac4be4da14 100644 --- a/src/transformers/models/data2vec/modeling_data2vec_vision.py +++ b/src/transformers/models/data2vec/modeling_data2vec_vision.py @@ -750,7 +750,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict embedding_output, _ = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos) resolution = pixel_values.shape[2:] @@ -836,7 +836,7 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.data2vec_vision( pixel_values, output_attentions=output_attentions, @@ -1201,7 +1201,7 @@ def forward( >>> # logits are of shape (batch_size, num_labels, height, width) >>> logits = outputs.logits ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) diff --git a/src/transformers/models/dbrx/configuration_dbrx.py b/src/transformers/models/dbrx/configuration_dbrx.py index f16b628d11f7..269a9696ec46 100644 --- a/src/transformers/models/dbrx/configuration_dbrx.py +++ b/src/transformers/models/dbrx/configuration_dbrx.py @@ -13,16 +13,14 @@ # limitations under the License. 
"""DBRX model configuration""" -from typing import Any +from huggingface_hub.dataclasses import strict from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters -from ...utils import auto_docstring, logging - - -logger = logging.get_logger(__name__) +from ...utils import auto_docstring +@strict(accept_kwargs=True) @auto_docstring( custom_intro="This config is used to instantiate attention layers.", checkpoint="transformers-community/dbrx-instruct", @@ -39,19 +37,12 @@ class DbrxAttentionConfig(PreTrainedConfig): base_config_key = "attn_config" - def __init__( - self, - attn_pdrop: float = 0.0, - clip_qkv: float | None = None, - kv_n_heads: int = 1, - **kwargs: Any, - ): - super().__init__(**kwargs) - self.attn_pdrop = attn_pdrop - self.clip_qkv = clip_qkv - self.kv_n_heads = kv_n_heads + attn_pdrop: float = 0.0 + clip_qkv: int | float | None = None + kv_n_heads: int = 1 +@strict(accept_kwargs=True) @auto_docstring( custom_intro="This config is used to instantiate feedforward layers.", checkpoint="transformers-community/dbrx-instruct", @@ -77,29 +68,18 @@ class DbrxFFNConfig(PreTrainedConfig): base_config_key = "ffn_config" - def __init__( - self, - hidden_size=6144, - ffn_act_fn: dict | None = None, - ffn_hidden_size: int = 3584, - moe_num_experts: int = 4, - moe_top_k: int = 1, - moe_jitter_eps: float | None = None, - moe_loss_weight: float = 0.01, - moe_normalize_expert_weights: float | None = 1.0, - **kwargs: Any, - ): - super().__init__() - if ffn_act_fn is None: - ffn_act_fn = {"name": "silu"} - self.hidden_size = hidden_size - self.ffn_act_fn = ffn_act_fn - self.ffn_hidden_size = ffn_hidden_size - self.moe_num_experts = moe_num_experts - self.moe_top_k = moe_top_k - self.moe_jitter_eps = moe_jitter_eps - self.moe_loss_weight = moe_loss_weight - self.moe_normalize_expert_weights = moe_normalize_expert_weights + hidden_size: int = 6144 + ffn_act_fn: dict | None = None + ffn_hidden_size: int = 3584 + moe_num_experts: int = 4 + moe_top_k: int = 1 + moe_jitter_eps: float | None = None + moe_loss_weight: float = 0.01 + moe_normalize_expert_weights: float | None = 1.0 + + def __post_init__(self, **kwargs): + if self.ffn_act_fn is None: + self.ffn_act_fn = {"name": "silu"} for k in [ "model_type", @@ -115,8 +95,11 @@ def __init__( if len(kwargs) != 0: raise ValueError(f"Found unknown {kwargs=}") + super().__post_init__(**kwargs) + @auto_docstring(checkpoint="transformers-community/dbrx-instruct") +@strict(accept_kwargs=True) class DbrxConfig(PreTrainedConfig): r""" max_seq_len (`int`, *optional*, defaults to 2048): @@ -150,62 +133,42 @@ class DbrxConfig(PreTrainedConfig): "max_position_embeddings": "max_seq_len", } - def __init__( - self, - d_model: int | None = 2048, - n_heads: int | None = 16, - n_layers: int | None = 24, - max_seq_len: int | None = 2048, - vocab_size: int | None = 32000, - resid_pdrop: float | None = 0.0, - emb_pdrop: float | None = 0.0, - attn_config: DbrxAttentionConfig | None = None, - ffn_config: DbrxFFNConfig | None = None, - use_cache: bool | None = True, - initializer_range: float | None = 0.02, - output_router_logits: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - pad_token_id: int | None = None, - bos_token_id: int | None = None, - eos_token_id: int | None = None, - tie_word_embeddings: bool | None = False, - **kwargs: Any, - ): - if attn_config is None: + d_model: int | None = 2048 + n_heads: int | None = 16 + n_layers: int | None = 24 + max_seq_len: int | None = 
2048 + vocab_size: int = 32000 + resid_pdrop: float | None = 0.0 + emb_pdrop: float | None = 0.0 + attn_config: DbrxAttentionConfig | dict | None = None + ffn_config: DbrxFFNConfig | dict | None = None + use_cache: bool = True + initializer_range: float = 0.02 + output_router_logits: bool | None = False + rope_parameters: RopeParameters | dict | None = None + pad_token_id: int | None = None + bos_token_id: int | None = None + eos_token_id: int | list[int] | None = None + tie_word_embeddings: bool = False + + def __post_init__(self, **kwargs): + if self.attn_config is None: self.attn_config = DbrxAttentionConfig() - elif isinstance(attn_config, dict): - self.attn_config = DbrxAttentionConfig(**attn_config) - else: - self.attn_config = attn_config + elif isinstance(self.attn_config, dict): + self.attn_config = DbrxAttentionConfig(**self.attn_config) - if ffn_config is None: + if self.ffn_config is None: self.ffn_config = DbrxFFNConfig() - elif isinstance(ffn_config, dict): - self.ffn_config = DbrxFFNConfig(**ffn_config) - else: - self.ffn_config = ffn_config - - self.d_model = d_model - self.n_heads = n_heads - self.n_layers = n_layers - self.max_seq_len = max_seq_len - self.vocab_size = vocab_size - self.resid_pdrop = resid_pdrop - self.emb_pdrop = emb_pdrop - self.use_cache = use_cache - self.initializer_range = initializer_range - self.output_router_logits = output_router_logits - self.num_key_value_heads = self.attn_config.kv_n_heads - if tie_word_embeddings: - raise ValueError("tie_word_embeddings is not supported for DBRX models.") + elif isinstance(self.ffn_config, dict): + self.ffn_config = DbrxFFNConfig(**self.ffn_config) - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - self.rope_parameters = rope_parameters + self.num_key_value_heads = self.attn_config.kv_n_heads + super().__post_init__(**kwargs) - super().__init__(**kwargs) + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + if self.tie_word_embeddings: + raise ValueError("tie_word_embeddings is not supported for DBRX models.") __all__ = ["DbrxConfig"] diff --git a/src/transformers/models/deberta/configuration_deberta.py b/src/transformers/models/deberta/configuration_deberta.py index 2efffc769187..7541e402d6d7 100644 --- a/src/transformers/models/deberta/configuration_deberta.py +++ b/src/transformers/models/deberta/configuration_deberta.py @@ -13,14 +13,14 @@ # limitations under the License. 
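(Illustrative usage sketch, not part of the diff: for DBRX, plain dicts passed as sub-configs are promoted to typed configs in `__post_init__`, which also mirrors `kv_n_heads` into `num_key_value_heads`.)

from transformers import DbrxConfig

config = DbrxConfig(attn_config={"kv_n_heads": 8})
assert config.attn_config.kv_n_heads == 8   # dict promoted to DbrxAttentionConfig
assert config.num_key_value_heads == 8      # mirrored from attn_config in __post_init__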
"""DeBERTa model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="microsoft/deberta-base") +@strict(accept_kwargs=True) class DebertaConfig(PreTrainedConfig): r""" max_relative_positions (`int`, *optional*, defaults to -1): @@ -58,65 +58,37 @@ class DebertaConfig(PreTrainedConfig): model_type = "deberta" - def __init__( - self, - vocab_size=50265, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=0, - initializer_range=0.02, - layer_norm_eps=1e-7, - relative_attention=False, - max_relative_positions=-1, - pad_token_id=0, - bos_token_id=None, - eos_token_id=None, - position_biased_input=True, - pos_att_type=None, - pooler_dropout=0, - pooler_hidden_act="gelu", - legacy=True, - tie_word_embeddings=True, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range - self.relative_attention = relative_attention - self.max_relative_positions = max_relative_positions - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - self.position_biased_input = position_biased_input - + vocab_size: int = 50265 + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.1 + attention_probs_dropout_prob: float = 0.1 + max_position_embeddings: int = 512 + type_vocab_size: int = 0 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-7 + relative_attention: bool = False + max_relative_positions: int = -1 + pad_token_id: int | None = 0 + bos_token_id: int | None = None + eos_token_id: int | list[int] | None = None + position_biased_input: bool = True + pos_att_type: str | list[str] | None = None + pooler_dropout: float | int = 0.0 + pooler_hidden_act: str = "gelu" + legacy: bool = True + tie_word_embeddings: bool = True + + def __post_init__(self, **kwargs): # Backwards compatibility - if isinstance(pos_att_type, str): - pos_att_type = [x.strip() for x in pos_att_type.lower().split("|")] - - self.pos_att_type = pos_att_type - self.vocab_size = vocab_size - self.layer_norm_eps = layer_norm_eps + if isinstance(self.pos_att_type, str): + self.pos_att_type = [x.strip() for x in self.pos_att_type.lower().split("|")] - self.pooler_hidden_size = kwargs.get("pooler_hidden_size", hidden_size) - self.pooler_dropout = pooler_dropout - self.pooler_hidden_act = pooler_hidden_act - self.legacy = legacy + self.pooler_hidden_size = kwargs.get("pooler_hidden_size", self.hidden_size) + super().__post_init__(**kwargs) __all__ = ["DebertaConfig"] diff --git a/src/transformers/models/deberta/modeling_deberta.py 
b/src/transformers/models/deberta/modeling_deberta.py index ac6db4d2bfbb..53d014d8ac86 100644 --- a/src/transformers/models/deberta/modeling_deberta.py +++ b/src/transformers/models/deberta/modeling_deberta.py @@ -660,7 +660,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -869,7 +869,7 @@ def forward( loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.deberta( input_ids, @@ -978,7 +978,7 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.deberta( input_ids, @@ -1071,7 +1071,7 @@ def forward( labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.deberta( input_ids, @@ -1130,7 +1130,7 @@ def forward( return_dict: bool | None = None, **kwargs, ) -> tuple | QuestionAnsweringModelOutput: - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.deberta( input_ids, diff --git a/src/transformers/models/deberta_v2/configuration_deberta_v2.py b/src/transformers/models/deberta_v2/configuration_deberta_v2.py index 2ec79bc8eb18..97d8e5852095 100644 --- a/src/transformers/models/deberta_v2/configuration_deberta_v2.py +++ b/src/transformers/models/deberta_v2/configuration_deberta_v2.py @@ -13,14 +13,14 @@ # limitations under the License. 
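(Illustrative sketch, not part of the diff: the DeBERTa conversion keeps the `pos_att_type` backwards-compatibility shim in `__post_init__`.)

from transformers import DebertaConfig

config = DebertaConfig(pos_att_type="c2p|p2c")
assert config.pos_att_type == ["c2p", "p2c"]  # legacy pipe-separated strings are still split into a list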
"""DeBERTa-v2 model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="microsoft/deberta-v2-xlarge") +@strict(accept_kwargs=True) class DebertaV2Config(PreTrainedConfig): r""" max_relative_positions (`int`, *optional*, defaults to -1): @@ -58,65 +58,37 @@ class DebertaV2Config(PreTrainedConfig): model_type = "deberta-v2" - def __init__( - self, - vocab_size=128100, - hidden_size=1536, - num_hidden_layers=24, - num_attention_heads=24, - intermediate_size=6144, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=0, - initializer_range=0.02, - layer_norm_eps=1e-7, - relative_attention=False, - max_relative_positions=-1, - pad_token_id=0, - bos_token_id=None, - eos_token_id=None, - position_biased_input=True, - pos_att_type=None, - pooler_dropout=0, - pooler_hidden_act="gelu", - legacy=True, - tie_word_embeddings=True, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range - self.relative_attention = relative_attention - self.max_relative_positions = max_relative_positions - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - self.position_biased_input = position_biased_input - + vocab_size: int = 128100 + hidden_size: int = 1536 + num_hidden_layers: int = 24 + num_attention_heads: int = 24 + intermediate_size: int = 6144 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.1 + attention_probs_dropout_prob: float = 0.1 + max_position_embeddings: int = 512 + type_vocab_size: int = 0 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-7 + relative_attention: bool = False + max_relative_positions: int = -1 + pad_token_id: int | None = 0 + bos_token_id: int | None = None + eos_token_id: int | list[int] | None = None + position_biased_input: bool = True + pos_att_type: str | list[str] | None = None + pooler_dropout: float | int = 0.0 + pooler_hidden_act: str = "gelu" + legacy: bool = True + tie_word_embeddings: bool = True + + def __post_init__(self, **kwargs): # Backwards compatibility - if isinstance(pos_att_type, str): - pos_att_type = [x.strip() for x in pos_att_type.lower().split("|")] - - self.pos_att_type = pos_att_type - self.vocab_size = vocab_size - self.layer_norm_eps = layer_norm_eps + if isinstance(self.pos_att_type, str): + self.pos_att_type = [x.strip() for x in self.pos_att_type.lower().split("|")] - self.pooler_hidden_size = kwargs.get("pooler_hidden_size", hidden_size) - self.pooler_dropout = pooler_dropout - self.pooler_hidden_act = pooler_hidden_act - self.legacy = legacy + self.pooler_hidden_size = kwargs.get("pooler_hidden_size", self.hidden_size) + super().__post_init__(**kwargs) __all__ = ["DebertaV2Config"] diff --git 
a/src/transformers/models/deberta_v2/modeling_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_deberta_v2.py index 82ac99b93d7f..f6b25925797d 100644 --- a/src/transformers/models/deberta_v2/modeling_deberta_v2.py +++ b/src/transformers/models/deberta_v2/modeling_deberta_v2.py @@ -738,7 +738,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -946,7 +946,7 @@ def forward( loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.deberta( input_ids, @@ -1057,7 +1057,7 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.deberta( input_ids, @@ -1151,7 +1151,7 @@ def forward( labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.deberta( input_ids, @@ -1211,7 +1211,7 @@ def forward( return_dict: bool | None = None, **kwargs, ) -> tuple | QuestionAnsweringModelOutput: - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.deberta( input_ids, @@ -1306,7 +1306,7 @@ def forward( num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above) """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None diff --git a/src/transformers/models/decision_transformer/configuration_decision_transformer.py b/src/transformers/models/decision_transformer/configuration_decision_transformer.py index 80faf092dfee..2cb8dbe50260 100644 --- a/src/transformers/models/decision_transformer/configuration_decision_transformer.py +++ b/src/transformers/models/decision_transformer/configuration_decision_transformer.py @@ -13,14 +13,14 @@ # limitations under the License. 
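(Illustrative sketch, not part of the diff: as with DeBERTa, the v2 conversion still sources `pooler_hidden_size` from leftover kwargs in `__post_init__`, falling back to `hidden_size`.)

from transformers import DebertaV2Config

config = DebertaV2Config(hidden_size=1024)
assert config.pooler_hidden_size == 1024  # follows hidden_size unless passed explicitly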
"""Decision Transformer model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="") +@strict(accept_kwargs=True) class DecisionTransformerConfig(PreTrainedConfig): """ state_dim (`int`, *optional*, defaults to 17): @@ -62,59 +62,29 @@ class DecisionTransformerConfig(PreTrainedConfig): "num_hidden_layers": "n_layer", } - def __init__( - self, - state_dim=17, - act_dim=4, - hidden_size=128, - max_ep_len=4096, - action_tanh=True, - vocab_size=1, - n_positions=1024, - n_layer=3, - n_head=1, - n_inner=None, - activation_function="relu", - resid_pdrop=0.1, - embd_pdrop=0.1, - attn_pdrop=0.1, - layer_norm_epsilon=1e-5, - initializer_range=0.02, - scale_attn_weights=True, - use_cache=True, - bos_token_id=50256, - eos_token_id=50256, - scale_attn_by_inverse_layer_idx=False, - reorder_and_upcast_attn=False, - add_cross_attention=False, - **kwargs, - ): - self.add_cross_attention = add_cross_attention - self.state_dim = state_dim - self.act_dim = act_dim - self.hidden_size = hidden_size - self.max_ep_len = max_ep_len - self.action_tanh = action_tanh - self.vocab_size = vocab_size - self.n_positions = n_positions - self.n_layer = n_layer - self.n_head = n_head - self.n_inner = n_inner - self.activation_function = activation_function - self.resid_pdrop = resid_pdrop - self.embd_pdrop = embd_pdrop - self.attn_pdrop = attn_pdrop - self.layer_norm_epsilon = layer_norm_epsilon - self.initializer_range = initializer_range - self.scale_attn_weights = scale_attn_weights - self.use_cache = use_cache - self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx - self.reorder_and_upcast_attn = reorder_and_upcast_attn - - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - - super().__init__(**kwargs) + state_dim: int = 17 + act_dim: int = 4 + hidden_size: int = 128 + max_ep_len: int = 4096 + action_tanh: bool = True + vocab_size: int = 1 + n_positions: int = 1024 + n_layer: int = 3 + n_head: int = 1 + n_inner: int | None = None + activation_function: str = "relu" + resid_pdrop: float = 0.1 + embd_pdrop: float = 0.1 + attn_pdrop: float = 0.1 + layer_norm_epsilon: float = 1e-5 + initializer_range: float = 0.02 + scale_attn_weights: bool = True + use_cache: bool = True + bos_token_id: int | None = 50256 + eos_token_id: int | list[int] | None = 50256 + scale_attn_by_inverse_layer_idx: bool = False + reorder_and_upcast_attn: bool = False + add_cross_attention: bool = False __all__ = ["DecisionTransformerConfig"] diff --git a/src/transformers/models/decision_transformer/modeling_decision_transformer.py b/src/transformers/models/decision_transformer/modeling_decision_transformer.py index a7647e1d9d0e..cb48d8fad8d2 100755 --- a/src/transformers/models/decision_transformer/modeling_decision_transformer.py +++ b/src/transformers/models/decision_transformer/modeling_decision_transformer.py @@ -599,7 +599,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict batch_size, seq_length = states.shape[0], states.shape[1] diff --git 
a/src/transformers/models/deepseek_v2/configuration_deepseek_v2.py b/src/transformers/models/deepseek_v2/configuration_deepseek_v2.py index 6f46d6f2bd7e..682b795401d5 100644 --- a/src/transformers/models/deepseek_v2/configuration_deepseek_v2.py +++ b/src/transformers/models/deepseek_v2/configuration_deepseek_v2.py @@ -18,12 +18,15 @@ # See the License for the specific language governing permissions and # limitations under the License. +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring @auto_docstring(checkpoint="deepseek-ai/DeepSeek-V2-Lite") +@strict(accept_kwargs=True) class DeepseekV2Config(PreTrainedConfig): r""" first_k_dense_replace (`int`, *optional*, defaults to 0): @@ -70,87 +73,59 @@ class DeepseekV2Config(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size: int | None = 32000, - hidden_size: int | None = 4096, - intermediate_size: int | None = 11008, - num_hidden_layers: int | None = 32, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = None, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 2048, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-6, - use_cache: bool | None = True, - pad_token_id: int | None = None, - bos_token_id: int | None = 1, - eos_token_id: int | None = 2, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - mlp_bias: bool | None = False, - first_k_dense_replace: int | None = 0, - kv_lora_rank: int | None = 512, - q_lora_rank: int | None = 1536, - n_group: int | None = None, - n_routed_experts: int | None = 64, - n_shared_experts: int | None = 2, - qk_nope_head_dim: int | None = 128, - qk_rope_head_dim: int | None = 64, - routed_scaling_factor: float | None = 1.0, - topk_group: int | None = None, - topk_method: str | None = "greedy", - norm_topk_prob: bool | None = False, - v_head_dim: int | None = 128, - num_experts_per_tok: int | None = None, - moe_intermediate_size: int | None = 1407, - **kwargs, - ): - self.first_k_dense_replace = first_k_dense_replace - self.kv_lora_rank = kv_lora_rank - self.q_lora_rank = q_lora_rank - self.n_group = n_group - self.n_routed_experts = n_routed_experts - self.n_shared_experts = n_shared_experts - self.qk_nope_head_dim = qk_nope_head_dim - self.qk_rope_head_dim = qk_rope_head_dim - self.routed_scaling_factor = routed_scaling_factor - self.topk_group = topk_group - self.topk_method = topk_method - self.norm_topk_prob = norm_topk_prob - self.v_head_dim = v_head_dim - self.num_experts_per_tok = num_experts_per_tok - self.moe_intermediate_size = moe_intermediate_size - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads + vocab_size: int = 32000 + hidden_size: int = 4096 + intermediate_size: int = 11008 + num_hidden_layers: int = 32 + num_attention_heads: int = 32 + num_key_value_heads: int | None = None + hidden_act: str = "silu" + max_position_embeddings: int = 2048 + initializer_range: float = 0.02 + 
rms_norm_eps: float = 1e-6 + use_cache: bool = True + pad_token_id: int | None = None + bos_token_id: int | None = 1 + eos_token_id: int | list[int] | None = 2 + pretraining_tp: int | None = 1 + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + attention_dropout: float | None = 0.0 + mlp_bias: bool = False + head_dim: int | None = None + first_k_dense_replace: int = 0 + kv_lora_rank: int = 512 + q_lora_rank: int = 1536 + n_group: int | None = None + n_routed_experts: int = 64 + n_shared_experts: int = 2 + qk_nope_head_dim: int = 128 + qk_rope_head_dim: int = 64 + routed_scaling_factor: float = 1.0 + topk_group: int | None = None + topk_method: str | None = "greedy" + norm_topk_prob: bool | None = False + v_head_dim: int = 128 + num_experts_per_tok: int | None = None + moe_intermediate_size: int = 1407 - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.mlp_bias = mlp_bias + def __post_init__(self, **kwargs): + self.head_dim = self.qk_rope_head_dim + if self.head_dim is None: + self.head_dim = self.hidden_size // self.num_attention_heads + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads - self.head_dim = qk_rope_head_dim - self.rope_parameters = rope_parameters + super().__post_init__(**kwargs) - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - super().__init__(**kwargs) + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + if self.hidden_size % self.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({self.hidden_size}) is not a multiple of the number of attention " + f"heads ({self.num_attention_heads})." + ) __all__ = ["DeepseekV2Config"] diff --git a/src/transformers/models/deepseek_v2/modular_deepseek_v2.py b/src/transformers/models/deepseek_v2/modular_deepseek_v2.py index 825bcfa24250..ef753629a936 100644 --- a/src/transformers/models/deepseek_v2/modular_deepseek_v2.py +++ b/src/transformers/models/deepseek_v2/modular_deepseek_v2.py @@ -16,6 +16,7 @@ import torch import torch.nn.functional as F +from huggingface_hub.dataclasses import strict from torch import nn from ... 
import initialization as init @@ -43,6 +44,7 @@ @auto_docstring(checkpoint="deepseek-ai/DeepSeek-V2-Lite") +@strict(accept_kwargs=True) class DeepseekV2Config(LlamaConfig): r""" first_k_dense_replace (`int`, *optional*, defaults to 0): @@ -84,64 +86,44 @@ class DeepseekV2Config(LlamaConfig): model_type = "deepseek_v2" keys_to_ignore_at_inference = ["past_key_values"] - def __init__( - self, - vocab_size: int | None = 32000, - hidden_size: int | None = 4096, - intermediate_size: int | None = 11008, - num_hidden_layers: int | None = 32, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = None, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 2048, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-6, - use_cache: bool | None = True, - pad_token_id: int | None = None, - bos_token_id: int | None = 1, - eos_token_id: int | None = 2, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - mlp_bias: bool | None = False, - first_k_dense_replace: int | None = 0, - kv_lora_rank: int | None = 512, - q_lora_rank: int | None = 1536, - n_group: int | None = None, - n_routed_experts: int | None = 64, - n_shared_experts: int | None = 2, - qk_nope_head_dim: int | None = 128, - qk_rope_head_dim: int | None = 64, - routed_scaling_factor: float | None = 1.0, - topk_group: int | None = None, - topk_method: str | None = "greedy", - norm_topk_prob: bool | None = False, - v_head_dim: int | None = 128, - num_experts_per_tok: int | None = None, - moe_intermediate_size: int | None = 1407, - **kwargs, - ): - self.first_k_dense_replace = first_k_dense_replace - self.kv_lora_rank = kv_lora_rank - self.q_lora_rank = q_lora_rank - self.n_group = n_group - self.n_routed_experts = n_routed_experts - self.n_shared_experts = n_shared_experts - self.qk_nope_head_dim = qk_nope_head_dim - self.qk_rope_head_dim = qk_rope_head_dim - self.routed_scaling_factor = routed_scaling_factor - self.topk_group = topk_group - self.topk_method = topk_method - self.norm_topk_prob = norm_topk_prob - self.v_head_dim = v_head_dim - self.num_experts_per_tok = num_experts_per_tok - self.moe_intermediate_size = moe_intermediate_size - - super().__init__(**kwargs) - - self.head_dim = qk_rope_head_dim - del self.pretraining_tp + vocab_size: int = 32000 + hidden_size: int = 4096 + intermediate_size: int = 11008 + num_hidden_layers: int = 32 + num_attention_heads: int = 32 + num_key_value_heads: int | None = None + hidden_act: str = "silu" + max_position_embeddings: int = 2048 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-6 + use_cache: bool = True + pad_token_id: int | None = None + bos_token_id: int | None = 1 + eos_token_id: int | list[int] | None = 2 + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + attention_dropout: float | None = 0.0 + mlp_bias: bool = False + first_k_dense_replace: int = 0 + kv_lora_rank: int = 512 + q_lora_rank: int = 1536 + n_group: int | None = None + n_routed_experts: int = 64 + n_shared_experts: int = 2 + qk_nope_head_dim: int = 128 + qk_rope_head_dim: int = 64 + routed_scaling_factor: float = 1.0 + topk_group: int | None = None + topk_method: str | None = "greedy" + norm_topk_prob: bool | None = False + v_head_dim: int = 128 + num_experts_per_tok: int | None = None + moe_intermediate_size: int = 1407 + + def 
__post_init__(self, **kwargs): + self.head_dim = self.qk_rope_head_dim + super().__post_init__(**kwargs) def apply_rotary_emb( diff --git a/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py b/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py index 6987ee716949..5504bf9ed76a 100644 --- a/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py +++ b/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py @@ -15,15 +15,15 @@ # limitations under the License. """DeepSeekV3 model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring -DEEPSEEK_PRETRAINED_CONFIG_ARCHIVE_MAP = {} - - @auto_docstring(checkpoint="bzantium/tiny-deepseek-v3") +@strict(accept_kwargs=True) class DeepseekV3Config(PreTrainedConfig): r""" n_group (`int`, *optional*, defaults to 8): @@ -67,89 +67,50 @@ class DeepseekV3Config(PreTrainedConfig): "num_local_experts": "n_routed_experts", } - def __init__( - self, - vocab_size: int | None = 129280, - hidden_size: int | None = 7168, - intermediate_size: int | None = 18432, - moe_intermediate_size: int | None = 2048, - num_hidden_layers: int | None = 61, - num_attention_heads: int | None = 128, - num_key_value_heads: int | None = 128, - n_shared_experts: int | None = 1, - n_routed_experts: int | None = 256, - routed_scaling_factor: float | None = 2.5, - kv_lora_rank: int | None = 512, - q_lora_rank: int | None = 1536, - qk_rope_head_dim: int | None = 64, - v_head_dim: int | None = 128, - qk_nope_head_dim: int | None = 128, - n_group: int | None = 8, - topk_group: int | None = 4, - num_experts_per_tok: int | None = 8, - first_k_dense_replace: int | None = 3, - norm_topk_prob: bool | None = True, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 4096, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-6, - use_cache: bool | None = True, - pad_token_id: int | None = None, - bos_token_id: int | None = 0, - eos_token_id: int | None = 1, - pretraining_tp: int | None = 1, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - rope_interleave: bool | None = True, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.moe_intermediate_size = moe_intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.n_shared_experts = n_shared_experts - self.n_routed_experts = n_routed_experts - self.routed_scaling_factor = routed_scaling_factor - self.kv_lora_rank = kv_lora_rank - self.q_lora_rank = q_lora_rank - self.qk_rope_head_dim = qk_rope_head_dim - self.v_head_dim = v_head_dim - self.qk_nope_head_dim = qk_nope_head_dim - self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim - self.head_dim = qk_rope_head_dim - self.n_group = n_group - self.topk_group = topk_group - self.num_experts_per_tok = num_experts_per_tok - self.first_k_dense_replace = first_k_dense_replace - self.norm_topk_prob = norm_topk_prob - self.rope_interleave = rope_interleave - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - 
self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.pretraining_tp = pretraining_tp - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.rope_parameters = rope_parameters - - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - super().__init__(**kwargs) - - def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation: set | None = None, **kwargs): + vocab_size: int = 129280 + hidden_size: int = 7168 + intermediate_size: int = 18432 + moe_intermediate_size: int = 2048 + num_hidden_layers: int = 61 + num_attention_heads: int = 128 + num_key_value_heads: int | None = 128 + n_shared_experts: int = 1 + n_routed_experts: int = 256 + routed_scaling_factor: float = 2.5 + kv_lora_rank: int = 512 + q_lora_rank: int = 1536 + qk_rope_head_dim: int = 64 + v_head_dim: int | None = 128 + qk_nope_head_dim: int = 128 + n_group: int | None = 8 + topk_group: int | None = 4 + num_experts_per_tok: int | None = 8 + first_k_dense_replace: int | None = 3 + norm_topk_prob: bool | None = True + hidden_act: str = "silu" + max_position_embeddings: int = 4096 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-6 + use_cache: bool = True + pad_token_id: int | None = None + bos_token_id: int | None = 0 + eos_token_id: int | list[int] | None = 1 + pretraining_tp: int | None = 1 + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + rope_interleave: bool | None = True + attention_bias: bool = False + attention_dropout: float | int | None = 0.0 + + def __post_init__(self, **kwargs): + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads + + self.qk_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim + self.head_dim = self.qk_rope_head_dim + super().__post_init__(**kwargs) + + def convert_rope_params_to_dict(self, **kwargs): rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or self.rope_parameters self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else {} @@ -157,7 +118,6 @@ def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation: set | None # Standardize and validate the correctness of rotary position embeddings parameters self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", self.default_theta)) self.standardize_rope_params() - self.validate_rope(ignore_keys=ignore_keys_at_rope_validation) # Convert to float because RoPE fn expect a float. Models on the hub were saved as int for key in ["beta_fast", "beta_slow", "factor"]: diff --git a/src/transformers/models/deepseek_vl/configuration_deepseek_vl.py b/src/transformers/models/deepseek_vl/configuration_deepseek_vl.py index 117f6f27976c..e1f9fbc017c0 100644 --- a/src/transformers/models/deepseek_vl/configuration_deepseek_vl.py +++ b/src/transformers/models/deepseek_vl/configuration_deepseek_vl.py @@ -19,6 +19,8 @@ # limitations under the License. 
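(Illustrative sketch, not part of the diff: the DeepSeek-V3 conversion moves the derived attention dimensions into `__post_init__`, re-establishing the same invariants as the old `__init__`.)

from transformers import DeepseekV3Config

config = DeepseekV3Config()
assert config.qk_head_dim == config.qk_nope_head_dim + config.qk_rope_head_dim  # 128 + 64
assert config.head_dim == config.qk_rope_head_dim                               # 64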
+from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring, logging from ..auto import CONFIG_MAPPING, AutoConfig @@ -28,6 +30,7 @@ @auto_docstring(checkpoint="deepseek-community/deepseek-vl-1.3b-chat") +@strict(accept_kwargs=True) class DeepseekVLConfig(PreTrainedConfig): r""" Example: @@ -48,35 +51,27 @@ class DeepseekVLConfig(PreTrainedConfig): model_type = "deepseek_vl" sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} - def __init__( - self, - text_config: AutoConfig | None = None, - vision_config: AutoConfig | None = None, - image_token_id: int = 100015, - tie_word_embeddings: bool | None = True, - **kwargs, - ): - if text_config is None: - text_config = {} + text_config: dict | PreTrainedConfig | None = None + vision_config: dict | PreTrainedConfig | None = None + image_token_id: int = 100015 + tie_word_embeddings: bool = True + + def __post_init__(self, **kwargs): + if self.text_config is None: + self.text_config = {} logger.info("`text_config` is `None`. Initializing the `LlamaConfig` with default values.") + if isinstance(self.text_config, dict): + self.text_config["model_type"] = self.text_config.get("model_type", "llama") + self.text_config = CONFIG_MAPPING[self.text_config["model_type"]](**self.text_config) - if vision_config is None: - vision_config = {} + if self.vision_config is None: + self.vision_config = {} logger.info("`vision_config` is `None`. Initializing the `SiglipVisionConfig` with default values.") + if isinstance(self.vision_config, dict): + self.vision_config["model_type"] = self.vision_config.get("model_type", "siglip_vision_model") + self.vision_config = CONFIG_MAPPING[self.vision_config["model_type"]](**self.vision_config) - if isinstance(text_config, dict): - text_config["model_type"] = text_config.get("model_type", "llama") - text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) - - if isinstance(vision_config, dict): - vision_config["model_type"] = vision_config.get("model_type", "siglip_vision_model") - vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) - - self.text_config = text_config - self.vision_config = vision_config - self.image_token_id = image_token_id - self.tie_word_embeddings = tie_word_embeddings - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["DeepseekVLConfig"] diff --git a/src/transformers/models/deepseek_vl/modular_deepseek_vl.py b/src/transformers/models/deepseek_vl/modular_deepseek_vl.py index 13d318ee7d71..5c5ea58a8c57 100644 --- a/src/transformers/models/deepseek_vl/modular_deepseek_vl.py +++ b/src/transformers/models/deepseek_vl/modular_deepseek_vl.py @@ -15,6 +15,7 @@ import torch import torch.nn as nn +from huggingface_hub.dataclasses import strict from ...configuration_utils import PreTrainedConfig from ...image_processing_utils import BatchFeature @@ -39,6 +40,7 @@ @auto_docstring(checkpoint="deepseek-community/deepseek-vl-1.3b-chat") +@strict(accept_kwargs=True) class DeepseekVLConfig(PreTrainedConfig): r""" Example: @@ -59,35 +61,27 @@ class DeepseekVLConfig(PreTrainedConfig): model_type = "deepseek_vl" sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} - def __init__( - self, - text_config: AutoConfig | None = None, - vision_config: AutoConfig | None = None, - image_token_id: int = 100015, - tie_word_embeddings: bool | None = True, - **kwargs, - ): - if text_config is None: - text_config = {} + text_config: dict | PreTrainedConfig | None = 
None + vision_config: dict | PreTrainedConfig | None = None + image_token_id: int = 100015 + tie_word_embeddings: bool = True + + def __post_init__(self, **kwargs): + if self.text_config is None: + self.text_config = {} logger.info("`text_config` is `None`. Initializing the `LlamaConfig` with default values.") + if isinstance(self.text_config, dict): + self.text_config["model_type"] = self.text_config.get("model_type", "llama") + self.text_config = CONFIG_MAPPING[self.text_config["model_type"]](**self.text_config) - if vision_config is None: - vision_config = {} + if self.vision_config is None: + self.vision_config = {} logger.info("`vision_config` is `None`. Initializing the `SiglipVisionConfig` with default values.") + if isinstance(self.vision_config, dict): + self.vision_config["model_type"] = self.vision_config.get("model_type", "siglip_vision_model") + self.vision_config = CONFIG_MAPPING[self.vision_config["model_type"]](**self.vision_config) - if isinstance(text_config, dict): - text_config["model_type"] = text_config.get("model_type", "llama") - text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) - - if isinstance(vision_config, dict): - vision_config["model_type"] = vision_config.get("model_type", "siglip_vision_model") - vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) - - self.text_config = text_config - self.vision_config = vision_config - self.image_token_id = image_token_id - self.tie_word_embeddings = tie_word_embeddings - super().__init__(**kwargs) + super().__post_init__(**kwargs) class DeepseekVLBaseModelOutputWithPast(IdeficsBaseModelOutputWithPast): diff --git a/src/transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py index f45f6ebfd5fc..a4af0b47fb6a 100644 --- a/src/transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py +++ b/src/transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py @@ -18,6 +18,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring, logging from ..auto import CONFIG_MAPPING, AutoConfig @@ -27,6 +29,7 @@ @auto_docstring(checkpoint="deepseek-community/deepseek-vl-7b-chat") +@strict(accept_kwargs=True) class DeepseekVLHybridConfig(PreTrainedConfig): r""" high_res_vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `SamVisionConfig`): @@ -50,45 +53,40 @@ class DeepseekVLHybridConfig(PreTrainedConfig): model_type = "deepseek_vl_hybrid" sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig, "high_res_vision_config": AutoConfig} - def __init__( - self, - text_config: AutoConfig | None = None, - vision_config: AutoConfig | None = None, - high_res_vision_config: AutoConfig | None = None, - image_token_id: int = 100015, - tie_word_embeddings: bool | None = True, - **kwargs, - ): - if high_res_vision_config is None: - high_res_vision_config = {} - logger.info("`high_res_vision_config` is `None`. 
Initializing the `SamVisionConfig` with default values.") + text_config: dict | PreTrainedConfig | None = None + vision_config: dict | PreTrainedConfig | None = None + image_token_id: int = 100015 + tie_word_embeddings: bool = True + + high_res_vision_config: dict | PreTrainedConfig | None = None - if isinstance(high_res_vision_config, dict): - high_res_vision_config["model_type"] = high_res_vision_config.get("model_type", "sam_vision_model") - high_res_vision_config = CONFIG_MAPPING[high_res_vision_config["model_type"]](**high_res_vision_config) + def __post_init__(self, **kwargs): + if self.high_res_vision_config is None: + self.high_res_vision_config = {} + logger.info("`high_res_vision_config` is `None`. Initializing the `SamVisionConfig` with default values.") - self.high_res_vision_config = high_res_vision_config - if text_config is None: - text_config = {} + if isinstance(self.high_res_vision_config, dict): + self.high_res_vision_config["model_type"] = self.high_res_vision_config.get( + "model_type", "sam_vision_model" + ) + self.high_res_vision_config = CONFIG_MAPPING[self.high_res_vision_config["model_type"]]( + **self.high_res_vision_config + ) + if self.text_config is None: + self.text_config = {} logger.info("`text_config` is `None`. Initializing the `LlamaConfig` with default values.") + if isinstance(self.text_config, dict): + self.text_config["model_type"] = self.text_config.get("model_type", "llama") + self.text_config = CONFIG_MAPPING[self.text_config["model_type"]](**self.text_config) - if vision_config is None: - vision_config = {} + if self.vision_config is None: + self.vision_config = {} logger.info("`vision_config` is `None`. Initializing the `SiglipVisionConfig` with default values.") + if isinstance(self.vision_config, dict): + self.vision_config["model_type"] = self.vision_config.get("model_type", "siglip_vision_model") + self.vision_config = CONFIG_MAPPING[self.vision_config["model_type"]](**self.vision_config) - if isinstance(text_config, dict): - text_config["model_type"] = text_config.get("model_type", "llama") - text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) - - if isinstance(vision_config, dict): - vision_config["model_type"] = vision_config.get("model_type", "siglip_vision_model") - vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) - - self.text_config = text_config - self.vision_config = vision_config - self.image_token_id = image_token_id - self.tie_word_embeddings = tie_word_embeddings - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["DeepseekVLHybridConfig"] diff --git a/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py index 2f1663575f21..43330a13ef5d 100644 --- a/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +++ b/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py @@ -18,9 +18,11 @@ import torch import torch.nn as nn import torchvision.transforms.v2.functional as tvF +from huggingface_hub.dataclasses import strict from ... 
import initialization as init from ...cache_utils import Cache +from ...configuration_utils import PreTrainedConfig from ...image_processing_utils_fast import ( BaseImageProcessorFast, BatchFeature, @@ -83,6 +85,7 @@ @auto_docstring(checkpoint="deepseek-community/deepseek-vl-7b-chat") +@strict(accept_kwargs=True) class DeepseekVLHybridConfig(DeepseekVLConfig): r""" high_res_vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `SamVisionConfig`): @@ -106,32 +109,22 @@ class DeepseekVLHybridConfig(DeepseekVLConfig): model_type = "deepseek_vl_hybrid" sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig, "high_res_vision_config": AutoConfig} - def __init__( - self, - text_config: AutoConfig | None = None, - vision_config: AutoConfig | None = None, - high_res_vision_config: AutoConfig | None = None, - image_token_id: int = 100015, - tie_word_embeddings: bool | None = True, - **kwargs, - ): - if high_res_vision_config is None: - high_res_vision_config = {} - logger.info("`high_res_vision_config` is `None`. Initializing the `SamVisionConfig` with default values.") + high_res_vision_config: dict | PreTrainedConfig | None = None - if isinstance(high_res_vision_config, dict): - high_res_vision_config["model_type"] = high_res_vision_config.get("model_type", "sam_vision_model") - high_res_vision_config = CONFIG_MAPPING[high_res_vision_config["model_type"]](**high_res_vision_config) + def __post_init__(self, **kwargs): + if self.high_res_vision_config is None: + self.high_res_vision_config = {} + logger.info("`high_res_vision_config` is `None`. Initializing the `SamVisionConfig` with default values.") - self.high_res_vision_config = high_res_vision_config + if isinstance(self.high_res_vision_config, dict): + self.high_res_vision_config["model_type"] = self.high_res_vision_config.get( + "model_type", "sam_vision_model" + ) + self.high_res_vision_config = CONFIG_MAPPING[self.high_res_vision_config["model_type"]]( + **self.high_res_vision_config + ) - super().__init__( - text_config=text_config, - vision_config=vision_config, - image_token_id=image_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) + super().__post_init__(**kwargs) @dataclass diff --git a/src/transformers/models/deformable_detr/configuration_deformable_detr.py b/src/transformers/models/deformable_detr/configuration_deformable_detr.py index f6d464caa739..a793ac71346b 100644 --- a/src/transformers/models/deformable_detr/configuration_deformable_detr.py +++ b/src/transformers/models/deformable_detr/configuration_deformable_detr.py @@ -13,16 +13,16 @@ # limitations under the License. """Deformable DETR model configuration""" +from huggingface_hub.dataclasses import strict + from ...backbone_utils import consolidate_backbone_kwargs_to_config from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging +from ...utils import auto_docstring from ..auto import AutoConfig -logger = logging.get_logger(__name__) - - @auto_docstring(checkpoint="SenseTime/deformable-detr") +@strict(accept_kwargs=True) class DeformableDetrConfig(PreTrainedConfig): r""" num_queries (`int`, *optional*, defaults to 300): @@ -51,6 +51,8 @@ class DeformableDetrConfig(PreTrainedConfig): with_box_refine (`bool`, *optional*, defaults to `False`): Whether to apply iterative bounding box refinement, where each decoder layer refines the bounding boxes based on the predictions from the previous layer. 
+ return_intermediate (`bool`, *optional*, defaults to True): + Whether to return the intermediate state or not Examples: @@ -74,61 +76,60 @@ class DeformableDetrConfig(PreTrainedConfig): "num_attention_heads": "encoder_attention_heads", } - def __init__( - self, - backbone_config=None, - num_channels=3, - num_queries=300, - max_position_embeddings=1024, - encoder_layers=6, - encoder_ffn_dim=1024, - encoder_attention_heads=8, - decoder_layers=6, - decoder_ffn_dim=1024, - decoder_attention_heads=8, - encoder_layerdrop=0.0, - is_encoder_decoder=True, - activation_function="relu", - d_model=256, - dropout=0.1, - attention_dropout=0.0, - activation_dropout=0.0, - init_std=0.02, - init_xavier_std=1.0, - auxiliary_loss=False, - position_embedding_type="sine", - dilation=False, - num_feature_levels=4, - encoder_n_points=4, - decoder_n_points=4, - two_stage=False, - two_stage_num_proposals=300, - with_box_refine=False, - class_cost=1, - bbox_cost=5, - giou_cost=2, - mask_loss_coefficient=1, - dice_loss_coefficient=1, - bbox_loss_coefficient=5, - giou_loss_coefficient=2, - eos_coefficient=0.1, - focal_alpha=0.25, - disable_custom_kernels=False, - tie_word_embeddings=True, - **kwargs, - ): + backbone_config: dict | PreTrainedConfig | None = None + num_channels: int = 3 + num_queries: int = 300 + max_position_embeddings: int = 1024 + encoder_layers: int = 6 + encoder_ffn_dim: int = 1024 + encoder_attention_heads: int = 8 + decoder_layers: int = 6 + decoder_ffn_dim: int = 1024 + decoder_attention_heads: int = 8 + encoder_layerdrop: float | int = 0.0 + is_encoder_decoder: bool = True + activation_function: str = "relu" + d_model: int = 256 + dropout: float | int = 0.1 + attention_dropout: float | int = 0.0 + activation_dropout: float | int = 0.0 + init_std: float = 0.02 + init_xavier_std: float = 1.0 + return_intermediate: bool = True + auxiliary_loss: bool = False + position_embedding_type: str = "sine" + dilation: bool = False + num_feature_levels: int = 4 + encoder_n_points: int = 4 + decoder_n_points: int = 4 + two_stage: bool = False + two_stage_num_proposals: int = 300 + with_box_refine: bool = False + class_cost: int = 1 + bbox_cost: int = 5 + giou_cost: int = 2 + mask_loss_coefficient: int = 1 + dice_loss_coefficient: int = 1 + bbox_loss_coefficient: int = 5 + giou_loss_coefficient: int = 2 + eos_coefficient: float = 0.1 + focal_alpha: float = 0.25 + disable_custom_kernels: bool = False + tie_word_embeddings: bool = True + + def __post_init__(self, **kwargs): # Init timm backbone with hardcoded values for BC timm_default_kwargs = { "num_channels": 3, "features_only": True, "use_pretrained_backbone": False, - "out_indices": [2, 3, 4] if num_feature_levels > 1 else [4], + "out_indices": [2, 3, 4] if self.num_feature_levels > 1 else [4], } - if dilation: + if self.dilation: timm_default_kwargs["output_stride"] = 16 - backbone_config, kwargs = consolidate_backbone_kwargs_to_config( - backbone_config=backbone_config, + self.backbone_config, kwargs = consolidate_backbone_kwargs_to_config( + backbone_config=self.backbone_config, default_backbone="resnet50", default_config_type="resnet50", default_config_kwargs={"out_features": ["stage4"]}, @@ -136,51 +137,12 @@ def __init__( **kwargs, ) - self.backbone_config = backbone_config - self.num_channels = num_channels - self.num_queries = num_queries - self.max_position_embeddings = max_position_embeddings - self.d_model = d_model - self.encoder_ffn_dim = encoder_ffn_dim - self.encoder_layers = encoder_layers - self.encoder_attention_heads = 
encoder_attention_heads - self.decoder_ffn_dim = decoder_ffn_dim - self.decoder_layers = decoder_layers - self.decoder_attention_heads = decoder_attention_heads - self.dropout = dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.activation_function = activation_function - self.init_std = init_std - self.init_xavier_std = init_xavier_std - self.encoder_layerdrop = encoder_layerdrop - self.auxiliary_loss = auxiliary_loss - self.position_embedding_type = position_embedding_type - self.dilation = dilation - # deformable attributes - self.num_feature_levels = num_feature_levels - self.encoder_n_points = encoder_n_points - self.decoder_n_points = decoder_n_points - self.two_stage = two_stage - self.two_stage_num_proposals = two_stage_num_proposals - self.with_box_refine = with_box_refine - if two_stage is True and with_box_refine is False: + super().__post_init__(**kwargs) + + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + if self.two_stage is True and self.with_box_refine is False: raise ValueError("If two_stage is True, with_box_refine must be True.") - # Hungarian matcher - self.class_cost = class_cost - self.bbox_cost = bbox_cost - self.giou_cost = giou_cost - # Loss coefficients - self.mask_loss_coefficient = mask_loss_coefficient - self.dice_loss_coefficient = dice_loss_coefficient - self.bbox_loss_coefficient = bbox_loss_coefficient - self.giou_loss_coefficient = giou_loss_coefficient - self.eos_coefficient = eos_coefficient - self.focal_alpha = focal_alpha - self.disable_custom_kernels = disable_custom_kernels - self.tie_word_embeddings = tie_word_embeddings - - super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) __all__ = ["DeformableDetrConfig"] diff --git a/src/transformers/models/deit/configuration_deit.py b/src/transformers/models/deit/configuration_deit.py index ec002a5f2234..9a4140748079 100644 --- a/src/transformers/models/deit/configuration_deit.py +++ b/src/transformers/models/deit/configuration_deit.py @@ -13,14 +13,14 @@ # limitations under the License. 
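The DeformableDetr conversion above is representative of the pattern this diff applies throughout: typed class-level fields replace the hand-written `__init__`, derived defaults move into `__post_init__`, and cross-field checks move into a `validate_architecture` hook that the `@strict` decorator is expected to run. A minimal, self-contained sketch of that shape, using only the standard library; `ToyDetrLikeConfig` is a hypothetical stand-in, not a transformers class:

from dataclasses import dataclass, field


@dataclass
class ToyDetrLikeConfig:
    two_stage: bool = False
    with_box_refine: bool = False
    num_feature_levels: int = 4
    out_indices: list[int] = field(default_factory=list)

    def __post_init__(self):
        # derived default, mirroring how the real configs fill values from other fields
        if not self.out_indices:
            self.out_indices = [2, 3, 4] if self.num_feature_levels > 1 else [4]
        # in the real classes this check is part of the @strict-powered validation;
        # the toy calls it explicitly so the sketch stays dependency-free
        self.validate_architecture()

    def validate_architecture(self):
        if self.two_stage and not self.with_box_refine:
            raise ValueError("If two_stage is True, with_box_refine must be True.")


config = ToyDetrLikeConfig(num_feature_levels=1)
print(config.out_indices)  # [4]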
"""DeiT model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="facebook/deit-base-distilled-patch16-224") +@strict(accept_kwargs=True) class DeiTConfig(PreTrainedConfig): r""" encoder_stride (`int`, *optional*, defaults to 16): @@ -47,44 +47,26 @@ class DeiTConfig(PreTrainedConfig): model_type = "deit" - def __init__( - self, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - initializer_range=0.02, - layer_norm_eps=1e-12, - image_size=224, - patch_size=16, - num_channels=3, - qkv_bias=True, - encoder_stride=16, - pooler_output_size=None, - pooler_act="tanh", - **kwargs, - ): - super().__init__(**kwargs) + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.0 + attention_probs_dropout_prob: float = 0.0 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + image_size: int | list[int] | tuple[int, int] = 224 + patch_size: int | list[int] | tuple[int, int] = 16 + num_channels: int = 3 + qkv_bias: bool = True + encoder_stride: int = 16 + pooler_output_size: int | None = None + pooler_act: str = "tanh" - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.qkv_bias = qkv_bias - self.encoder_stride = encoder_stride - self.pooler_output_size = pooler_output_size if pooler_output_size else hidden_size - self.pooler_act = pooler_act + def __post_init__(self, **kwargs): + self.pooler_output_size = self.pooler_output_size if self.pooler_output_size else self.hidden_size + super().__post_init__(**kwargs) __all__ = ["DeiTConfig"] diff --git a/src/transformers/models/depth_anything/configuration_depth_anything.py b/src/transformers/models/depth_anything/configuration_depth_anything.py index 2a3988b469c7..8f5da6ca0700 100644 --- a/src/transformers/models/depth_anything/configuration_depth_anything.py +++ b/src/transformers/models/depth_anything/configuration_depth_anything.py @@ -13,16 +13,16 @@ # limitations under the License. 
"""DepthAnything model configuration""" +from huggingface_hub.dataclasses import strict + from ...backbone_utils import consolidate_backbone_kwargs_to_config from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging +from ...utils import auto_docstring from ..auto.configuration_auto import AutoConfig -logger = logging.get_logger(__name__) - - @auto_docstring(checkpoint="LiheYoung/depth-anything-small-hf") +@strict(accept_kwargs=True) class DepthAnythingConfig(PreTrainedConfig): r""" reassemble_hidden_size (`int`, *optional*, defaults to 384): @@ -61,23 +61,21 @@ class DepthAnythingConfig(PreTrainedConfig): model_type = "depth_anything" sub_configs = {"backbone_config": AutoConfig} - def __init__( - self, - backbone_config=None, - patch_size=14, - initializer_range=0.02, - reassemble_hidden_size=384, - reassemble_factors=[4, 2, 1, 0.5], - neck_hidden_sizes=[48, 96, 192, 384], - fusion_hidden_size=64, - head_in_index=-1, - head_hidden_size=32, - depth_estimation_type="relative", - max_depth=None, - **kwargs, - ): - backbone_config, kwargs = consolidate_backbone_kwargs_to_config( - backbone_config=backbone_config, + backbone_config: dict | PreTrainedConfig | None = None + patch_size: int | list[int] | tuple[int, int] = 14 + initializer_range: float = 0.02 + reassemble_hidden_size: int = 384 + reassemble_factors: list[int | float] | tuple[int | float, ...] = (4, 2, 1, 0.5) + neck_hidden_sizes: list[int] | tuple[int, ...] = (48, 96, 192, 384) + fusion_hidden_size: int = 64 + head_in_index: int = -1 + head_hidden_size: int = 32 + depth_estimation_type: str = "relative" + max_depth: int | None = None + + def __post_init__(self, **kwargs): + self.backbone_config, kwargs = consolidate_backbone_kwargs_to_config( + backbone_config=self.backbone_config, default_config_type="dinov2", default_config_kwargs={ "image_size": 518, @@ -89,21 +87,13 @@ def __init__( **kwargs, ) - self.backbone_config = backbone_config - self.reassemble_hidden_size = reassemble_hidden_size - self.patch_size = patch_size - self.initializer_range = initializer_range - self.reassemble_factors = reassemble_factors - self.neck_hidden_sizes = neck_hidden_sizes - self.fusion_hidden_size = fusion_hidden_size - self.head_in_index = head_in_index - self.head_hidden_size = head_hidden_size - if depth_estimation_type not in ["relative", "metric"]: - raise ValueError("depth_estimation_type must be one of ['relative', 'metric']") - self.depth_estimation_type = depth_estimation_type - self.max_depth = max_depth if max_depth else 1 + self.max_depth = self.max_depth if self.max_depth else 1 + super().__post_init__(**kwargs) - super().__init__(**kwargs) + def validate_architecture(self): + """Part of `@strict`-powered validation. 
Validates the architecture of the config.""" + if self.depth_estimation_type not in ["relative", "metric"]: + raise ValueError("depth_estimation_type must be one of ['relative', 'metric']") __all__ = ["DepthAnythingConfig"] diff --git a/src/transformers/models/depth_anything/modeling_depth_anything.py b/src/transformers/models/depth_anything/modeling_depth_anything.py index 16e1e3c0319c..3da9cef51e4a 100644 --- a/src/transformers/models/depth_anything/modeling_depth_anything.py +++ b/src/transformers/models/depth_anything/modeling_depth_anything.py @@ -378,7 +378,7 @@ def forward( if labels is not None: raise NotImplementedError("Training is not implemented yet") - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index b07514a8e718..698fd36cecdf 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -15,6 +15,8 @@ from copy import deepcopy +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring, logging from ..auto.configuration_auto import CONFIG_MAPPING, AutoConfig @@ -24,6 +26,7 @@ @auto_docstring(checkpoint="apple/DepthPro") +@strict(accept_kwargs=True) class DepthProConfig(PreTrainedConfig): r""" fusion_hidden_size (`int`, *optional*, defaults to 256): @@ -76,78 +79,29 @@ class DepthProConfig(PreTrainedConfig): model_type = "depth_pro" sub_configs = {"image_model_config": AutoConfig, "patch_model_config": AutoConfig, "fov_model_config": AutoConfig} - def __init__( - self, - fusion_hidden_size=256, - patch_size=384, - initializer_range=0.02, - intermediate_hook_ids=[11, 5], - intermediate_feature_dims=[256, 256], - scaled_images_ratios=[0.25, 0.5, 1], - scaled_images_overlap_ratios=[0.0, 0.5, 0.25], - scaled_images_feature_dims=[1024, 1024, 512], - merge_padding_value=3, - use_batch_norm_in_fusion_residual=False, - use_bias_in_fusion_residual=True, - use_fov_model=False, - num_fov_head_layers=2, - image_model_config=None, - patch_model_config=None, - fov_model_config=None, - **kwargs, - ): - # scaled_images_ratios is sorted - if scaled_images_ratios != sorted(scaled_images_ratios): - raise ValueError( - f"Values in scaled_images_ratios={scaled_images_ratios} should be sorted from low to high" - ) - - # scaled_images_ratios, scaled_images_overlap_ratios, scaled_images_feature_dims should be consistent - if not (len(scaled_images_ratios) == len(scaled_images_overlap_ratios) == len(scaled_images_feature_dims)): - raise ValueError( - f"len(scaled_images_ratios)={len(scaled_images_ratios)} and " - f"len(scaled_images_overlap_ratios)={len(scaled_images_overlap_ratios)} and " - f"len(scaled_images_feature_dims)={len(scaled_images_feature_dims)}, " - f"should match in config." - ) - - # intermediate_hook_ids, intermediate_feature_dims should be consistent - if not (len(intermediate_hook_ids) == len(intermediate_feature_dims)): - raise ValueError( - f"len(intermediate_hook_ids)={len(intermediate_hook_ids)} and " - f"len(intermediate_feature_dims)={len(intermediate_feature_dims)}, " - f"should match in config." 
- ) - - # fusion_hidden_size should be consistent with num_fov_head_layers - if fusion_hidden_size // 2**num_fov_head_layers == 0: - raise ValueError( - f"fusion_hidden_size={fusion_hidden_size} should be consistent with num_fov_head_layers={num_fov_head_layers} " - "i.e fusion_hidden_size // 2**num_fov_head_layers > 0" - ) - - self.fusion_hidden_size = fusion_hidden_size - self.patch_size = patch_size - self.initializer_range = initializer_range - self.use_batch_norm_in_fusion_residual = use_batch_norm_in_fusion_residual - self.use_bias_in_fusion_residual = use_bias_in_fusion_residual - self.use_fov_model = use_fov_model - self.num_fov_head_layers = num_fov_head_layers - self.intermediate_hook_ids = intermediate_hook_ids - self.intermediate_feature_dims = intermediate_feature_dims - self.scaled_images_ratios = scaled_images_ratios - self.scaled_images_overlap_ratios = scaled_images_overlap_ratios - self.scaled_images_feature_dims = scaled_images_feature_dims - self.merge_padding_value = merge_padding_value - self.image_model_config = image_model_config - self.patch_model_config = patch_model_config - self.fov_model_config = fov_model_config - + fusion_hidden_size: int = 256 + patch_size: int | list[int] | tuple[int, int] = 384 + initializer_range: float = 0.02 + intermediate_hook_ids: list[int] | tuple[int, ...] = (11, 5) + intermediate_feature_dims: list[int] | tuple[int, ...] = (256, 256) + scaled_images_ratios: list[int | float] | tuple[int | float, ...] = (0.25, 0.5, 1) + scaled_images_overlap_ratios: list[float] | tuple[float, ...] = (0.0, 0.5, 0.25) + scaled_images_feature_dims: list[int] | tuple[int, ...] = (1024, 1024, 512) + merge_padding_value: int = 3 + use_batch_norm_in_fusion_residual: bool = False + use_bias_in_fusion_residual: bool = True + use_fov_model: bool = False + num_fov_head_layers: int = 2 + image_model_config: dict | PreTrainedConfig | None = None + patch_model_config: dict | PreTrainedConfig | None = None + fov_model_config: dict | PreTrainedConfig | None = None + + def __post_init__(self, **kwargs): for sub_config_key in self.sub_configs: sub_config = getattr(self, sub_config_key) if sub_config is None: - sub_config = CONFIG_MAPPING["dinov2"](image_size=patch_size) + sub_config = CONFIG_MAPPING["dinov2"](image_size=self.patch_size) logger.info( f"`{sub_config_key}` is `None`. Initializing `{sub_config_key}` with the `Dinov2Config` " f"with default values except `{sub_config_key}.image_size` is set to `config.patch_size`." @@ -163,20 +117,20 @@ def __init__( f"The model type `{sub_config['model_type']}` in `{sub_config_key}` is not supported. Please provide a valid model type." ) image_size = sub_config.get("image_size") - if image_size != patch_size: + if image_size != self.patch_size: logger.info( f"The `image_size` in `{sub_config_key}` is set to `{image_size}`, " - f"but it does not match the required `patch_size` of `{patch_size}`. " - f"Updating `image_size` to `{patch_size}` for consistency. " + f"but it does not match the required `patch_size` of `{self.patch_size}`. " + f"Updating `image_size` to `{self.patch_size}` for consistency. " f"Ensure that `image_size` aligns with `patch_size` in the configuration." 
) - sub_config.update({"image_size": patch_size}) + sub_config.update({"image_size": self.patch_size}) sub_config = CONFIG_MAPPING[sub_config["model_type"]](**sub_config) elif isinstance(sub_config, PreTrainedConfig): image_size = getattr(sub_config, "image_size", None) - if image_size != patch_size: + if image_size != self.patch_size: raise ValueError( - f"`config.{sub_config_key}.image_size={image_size}` should match `config.patch_size={patch_size}`." + f"`config.{sub_config_key}.image_size={image_size}` should match `config.patch_size={self.patch_size}`." ) else: raise TypeError( @@ -185,7 +139,43 @@ def __init__( setattr(self, sub_config_key, sub_config) - super().__init__(**kwargs) + super().__post_init__(**kwargs) + + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + # scaled_images_ratios is sorted + if list(self.scaled_images_ratios) != sorted(self.scaled_images_ratios): + raise ValueError( + f"Values in scaled_images_ratios={self.scaled_images_ratios} should be sorted from low to high" + ) + + # scaled_images_ratios, scaled_images_overlap_ratios, scaled_images_feature_dims should be consistent + if not ( + len(self.scaled_images_ratios) + == len(self.scaled_images_overlap_ratios) + == len(self.scaled_images_feature_dims) + ): + raise ValueError( + f"len(scaled_images_ratios)={len(self.scaled_images_ratios)} and " + f"len(scaled_images_overlap_ratios)={len(self.scaled_images_overlap_ratios)} and " + f"len(scaled_images_feature_dims)={len(self.scaled_images_feature_dims)}, " + f"should match in config." + ) + + # intermediate_hook_ids, intermediate_feature_dims should be consistent + if not (len(self.intermediate_hook_ids) == len(self.intermediate_feature_dims)): + raise ValueError( + f"len(intermediate_hook_ids)={len(self.intermediate_hook_ids)} and " + f"len(intermediate_feature_dims)={len(self.intermediate_feature_dims)}, " + f"should match in config." 
+ ) + + # fusion_hidden_size should be consistent with num_fov_head_layers + if self.fusion_hidden_size // 2**self.num_fov_head_layers == 0: + raise ValueError( + f"fusion_hidden_size={self.fusion_hidden_size} should be consistent with num_fov_head_layers={self.num_fov_head_layers} " + "i.e fusion_hidden_size // 2**num_fov_head_layers > 0" + ) __all__ = ["DepthProConfig"] diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index dfa9fb5d5f79..f8ee3c84b716 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -679,7 +679,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict encodings = self.encoder( pixel_values, @@ -1082,7 +1082,7 @@ def forward( if labels is not None: raise NotImplementedError("Training is not implemented yet") - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) diff --git a/src/transformers/models/detr/configuration_detr.py b/src/transformers/models/detr/configuration_detr.py index 403082b44fa5..0dfe92d06e91 100644 --- a/src/transformers/models/detr/configuration_detr.py +++ b/src/transformers/models/detr/configuration_detr.py @@ -13,16 +13,16 @@ # limitations under the License. """DETR model configuration""" +from huggingface_hub.dataclasses import strict + from ...backbone_utils import consolidate_backbone_kwargs_to_config from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging +from ...utils import auto_docstring from ..auto import AutoConfig -logger = logging.get_logger(__name__) - - @auto_docstring(checkpoint="facebook/detr-resnet-50") +@strict(accept_kwargs=True) class DetrConfig(PreTrainedConfig): r""" num_queries (`int`, *optional*, defaults to 100): @@ -55,93 +55,60 @@ class DetrConfig(PreTrainedConfig): attribute_map = { "hidden_size": "d_model", "num_attention_heads": "encoder_attention_heads", + "num_hidden_layers": "encoder_layers", } - def __init__( - self, - backbone_config=None, - num_channels=3, - num_queries=100, - encoder_layers=6, - encoder_ffn_dim=2048, - encoder_attention_heads=8, - decoder_layers=6, - decoder_ffn_dim=2048, - decoder_attention_heads=8, - encoder_layerdrop=0.0, - decoder_layerdrop=0.0, - is_encoder_decoder=True, - activation_function="relu", - d_model=256, - dropout=0.1, - attention_dropout=0.0, - activation_dropout=0.0, - init_std=0.02, - init_xavier_std=1.0, - auxiliary_loss=False, - position_embedding_type="sine", - dilation=False, - class_cost=1, - bbox_cost=5, - giou_cost=2, - mask_loss_coefficient=1, - dice_loss_coefficient=1, - bbox_loss_coefficient=5, - giou_loss_coefficient=2, - eos_coefficient=0.1, - **kwargs, - ): + backbone_config: dict | PreTrainedConfig | None = None + num_channels: int = 3 + num_queries: int = 100 + encoder_layers: int = 6 + encoder_ffn_dim: int = 2048 + encoder_attention_heads: int = 8 + decoder_layers: int = 6 + decoder_ffn_dim: int = 2048 + decoder_attention_heads: int = 8 + encoder_layerdrop: float | int = 0.0 + 
decoder_layerdrop: float | int = 0.0 + is_encoder_decoder: bool = True + activation_function: str = "relu" + d_model: int = 256 + dropout: float | int = 0.1 + attention_dropout: float | int = 0.0 + activation_dropout: float | int = 0.0 + init_std: float = 0.02 + init_xavier_std: float = 1.0 + auxiliary_loss: bool = False + position_embedding_type: str = "sine" + dilation: bool = False + class_cost: int = 1 + bbox_cost: int = 5 + giou_cost: int = 2 + mask_loss_coefficient: int = 1 + dice_loss_coefficient: int = 1 + bbox_loss_coefficient: int = 5 + giou_loss_coefficient: int = 2 + eos_coefficient: float = 0.1 + + def __post_init__(self, **kwargs): backbone_kwargs = kwargs.get("backbone_kwargs", {}) timm_default_kwargs = { - "num_channels": backbone_kwargs.get("num_channels", num_channels), + "num_channels": backbone_kwargs.get("num_channels", self.num_channels), "features_only": True, "use_pretrained_backbone": False, "out_indices": backbone_kwargs.get("out_indices", [1, 2, 3, 4]), } - if dilation: + if self.dilation: timm_default_kwargs["output_stride"] = backbone_kwargs.get("output_stride", 16) - backbone_config, kwargs = consolidate_backbone_kwargs_to_config( - backbone_config=backbone_config, + self.backbone_config, kwargs = consolidate_backbone_kwargs_to_config( + backbone_config=self.backbone_config, default_backbone="resnet50", default_config_type="resnet", default_config_kwargs={"out_features": ["stage4"]}, timm_default_kwargs=timm_default_kwargs, **kwargs, ) - - self.backbone_config = backbone_config - self.num_channels = num_channels - self.num_queries = num_queries - self.d_model = d_model - self.encoder_ffn_dim = encoder_ffn_dim - self.encoder_layers = encoder_layers - self.encoder_attention_heads = encoder_attention_heads - self.decoder_ffn_dim = decoder_ffn_dim - self.decoder_layers = decoder_layers - self.decoder_attention_heads = decoder_attention_heads - self.dropout = dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.activation_function = activation_function - self.init_std = init_std - self.init_xavier_std = init_xavier_std - self.encoder_layerdrop = encoder_layerdrop - self.decoder_layerdrop = decoder_layerdrop - self.num_hidden_layers = encoder_layers - self.auxiliary_loss = auxiliary_loss - self.position_embedding_type = position_embedding_type - # Hungarian matcher - self.class_cost = class_cost - self.bbox_cost = bbox_cost - self.giou_cost = giou_cost - # Loss coefficients - self.mask_loss_coefficient = mask_loss_coefficient - self.dice_loss_coefficient = dice_loss_coefficient - self.bbox_loss_coefficient = bbox_loss_coefficient - self.giou_loss_coefficient = giou_loss_coefficient - self.eos_coefficient = eos_coefficient - super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + super().__post_init__(**kwargs) __all__ = ["DetrConfig"] diff --git a/src/transformers/models/dia/configuration_dia.py b/src/transformers/models/dia/configuration_dia.py index 7d32cfcf316d..3bfdc0efc3e3 100644 --- a/src/transformers/models/dia/configuration_dia.py +++ b/src/transformers/models/dia/configuration_dia.py @@ -13,6 +13,8 @@ # limitations under the License. 
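Note that DETR drops the old `self.num_hidden_layers = encoder_layers` assignment in favour of a new `attribute_map` entry, so the value is resolved as an alias on access rather than stored separately. A usage sketch, assuming `attribute_map` aliasing keeps behaving as it did before the refactor:

from transformers import DetrConfig

config = DetrConfig(encoder_layers=4)
print(config.encoder_layers)     # 4
print(config.num_hidden_layers)  # 4 -- read through the attribute_map alias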
"""Dia model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring, logging @@ -22,42 +24,26 @@ @auto_docstring(checkpoint="nari-labs/Dia-1.6B") +@strict(accept_kwargs=True) class DiaEncoderConfig(PreTrainedConfig): model_type = "dia_encoder" - def __init__( - self, - max_position_embeddings: int = 1024, - num_hidden_layers: int = 12, - hidden_size: int = 1024, - num_attention_heads: int = 16, - num_key_value_heads: int = 16, - head_dim: int = 128, - intermediate_size: int = 4096, - norm_eps: float = 1e-5, - vocab_size: int = 256, - hidden_act: str = "silu", - rope_parameters: RopeParameters | None = None, - initializer_range: float = 0.02, - **kwargs, - ): - self.max_position_embeddings = max_position_embeddings - self.num_hidden_layers = num_hidden_layers - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_attention_heads = num_attention_heads - self.head_dim = head_dim - self.norm_eps = norm_eps - self.vocab_size = vocab_size - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rope_parameters = rope_parameters - - super().__init__(**kwargs) + max_position_embeddings: int = 1024 + num_hidden_layers: int = 12 + hidden_size: int = 1024 + num_attention_heads: int = 16 + num_key_value_heads: int = 16 + head_dim: int = 128 + intermediate_size: int = 4096 + norm_eps: float = 1e-5 + vocab_size: int = 256 + hidden_act: str = "silu" + rope_parameters: dict | None = None + initializer_range: float = 0.02 @auto_docstring(checkpoint="nari-labs/Dia-1.6B") +@strict(accept_kwargs=True) class DiaDecoderConfig(PreTrainedConfig): r""" cross_num_attention_heads (`int`, *optional*, defaults to 16): @@ -72,58 +58,32 @@ class DiaDecoderConfig(PreTrainedConfig): model_type = "dia_decoder" - def __init__( - self, - max_position_embeddings: int = 3072, - num_hidden_layers: int = 18, - hidden_size: int = 2048, - intermediate_size: int = 8192, - num_attention_heads: int = 16, - num_key_value_heads: int = 4, - head_dim: int = 128, - cross_num_attention_heads: int = 16, - cross_head_dim: int = 128, - cross_num_key_value_heads: int = 16, - cross_hidden_size: int = 1024, - norm_eps: float = 1e-5, - vocab_size: int = 1028, - hidden_act: str = "silu", - num_channels: int = 9, - rope_parameters: RopeParameters | None = None, - initializer_range: float = 0.02, - use_cache: bool = True, - is_encoder_decoder: bool = True, - pad_token_id: int = 1025, - eos_token_id: int = 1024, - bos_token_id: int = 1026, - **kwargs, - ): - self.max_position_embeddings = max_position_embeddings - self.num_hidden_layers = num_hidden_layers - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_attention_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads - self.head_dim = head_dim - self.cross_num_key_value_heads = cross_num_key_value_heads - self.cross_num_attention_heads = cross_num_attention_heads - self.cross_head_dim = cross_head_dim - self.cross_hidden_size = cross_hidden_size - self.norm_eps = norm_eps - self.vocab_size = vocab_size - self.hidden_act = hidden_act - self.num_channels = num_channels - self.initializer_range = initializer_range - self.use_cache = use_cache - self.rope_parameters = rope_parameters - self.pad_token_id = pad_token_id - self.eos_token_id = eos_token_id - self.bos_token_id = bos_token_id - 
- super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + max_position_embeddings: int = 3072 + num_hidden_layers: int = 18 + hidden_size: int = 2048 + intermediate_size: int = 8192 + num_attention_heads: int = 16 + num_key_value_heads: int = 4 + head_dim: int = 128 + cross_num_attention_heads: int = 16 + cross_head_dim: int = 128 + cross_num_key_value_heads: int = 16 + cross_hidden_size: int = 1024 + norm_eps: float = 1e-5 + vocab_size: int = 1028 + hidden_act: str = "silu" + num_channels: int = 9 + rope_parameters: RopeParameters | dict | None = None + initializer_range: float = 0.02 + use_cache: bool = True + is_encoder_decoder: bool = True + pad_token_id: int | None = 1025 + eos_token_id: int | None = 1024 + bos_token_id: int | None = 1026 @auto_docstring(checkpoint="nari-labs/Dia-1.6B") +@strict(accept_kwargs=True) class DiaConfig(PreTrainedConfig): r""" delay_pattern (`list[int]`, *optional*, defaults to `[0, 8, 9, 10, 11, 12, 13, 14, 15]`): @@ -149,57 +109,57 @@ class DiaConfig(PreTrainedConfig): keys_to_ignore_at_inference = ["past_key_values"] sub_configs = {"encoder_config": DiaEncoderConfig, "decoder_config": DiaDecoderConfig} - def __init__( - self, - encoder_config: DiaEncoderConfig | None = None, - decoder_config: DiaDecoderConfig | None = None, - norm_eps: float = 1e-5, - is_encoder_decoder: bool = True, - pad_token_id: int | None = None, - eos_token_id: int | None = None, - bos_token_id: int | None = None, - delay_pattern: list[int] | None = None, - initializer_range: float = 0.02, - use_cache: bool = True, - **kwargs, - ): - if isinstance(encoder_config, dict): - encoder_config = DiaEncoderConfig(**encoder_config) - if isinstance(decoder_config, dict): - decoder_config = DiaDecoderConfig(**decoder_config) - self.encoder_config = encoder_config if encoder_config is not None else DiaEncoderConfig() - self.decoder_config = decoder_config if decoder_config is not None else DiaDecoderConfig() - self.norm_eps = norm_eps - self.delay_pattern = delay_pattern if delay_pattern is not None else [0, 8, 9, 10, 11, 12, 13, 14, 15] - self.initializer_range = initializer_range - self.use_cache = use_cache - - # TODO: Remove token ID forwarding once the `nari-labs/Dia-1.6B` - # checkpoint is updated - if pad_token_id is not None: + encoder_config: DiaEncoderConfig | dict | None = None + decoder_config: DiaDecoderConfig | dict | None = None + norm_eps: float = 1e-5 + is_encoder_decoder: bool = True + pad_token_id: int | None = None + eos_token_id: int | list[int] | None = None + bos_token_id: int | None = None + delay_pattern: list[int] | None = None + initializer_range: float = 0.02 + use_cache: bool = True + + def __post_init__(self, **kwargs): + if isinstance(self.encoder_config, dict): + self.encoder_config = DiaEncoderConfig(**self.encoder_config) + if isinstance(self.decoder_config, dict): + self.decoder_config = DiaDecoderConfig(**self.decoder_config) + + self.encoder_config = self.encoder_config if self.encoder_config is not None else DiaEncoderConfig() + self.decoder_config = self.decoder_config if self.decoder_config is not None else DiaDecoderConfig() + self.delay_pattern = ( + self.delay_pattern if self.delay_pattern is not None else [0, 8, 9, 10, 11, 12, 13, 14, 15] + ) + + # TODO: Remove token ID forwarding once the `nari-labs/Dia-1.6B` checkpoint is updated + if self.pad_token_id is not None: logger.warning_once( "Passing `pad_token_id` to `DiaConfig` is deprecated. " "Please set it directly on `DiaDecoderConfig` instead." 
) - self.decoder_config.pad_token_id = pad_token_id - if eos_token_id is not None: + self.decoder_config.pad_token_id = self.pad_token_id + + if self.eos_token_id is not None: logger.warning_once( "Passing `eos_token_id` to `DiaConfig` is deprecated. " "Please set it directly on `DiaDecoderConfig` instead." ) - self.decoder_config.eos_token_id = eos_token_id - if bos_token_id is not None: + self.decoder_config.eos_token_id = self.eos_token_id + + if self.bos_token_id is not None: logger.warning_once( "Passing `bos_token_id` to `DiaConfig` is deprecated. " "Please set it directly on `DiaDecoderConfig` instead." ) - self.decoder_config.bos_token_id = bos_token_id + self.decoder_config.bos_token_id = self.bos_token_id - assert self.decoder_config.num_channels == len(self.delay_pattern), ( - "Number of channels must match delay pattern length." - ) + super().__post_init__(**kwargs) - super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + if self.decoder_config.num_channels != len(self.delay_pattern): + raise ValueError("Number of channels must match delay pattern length.") def get_text_config(self, *args, **kwargs): """Defaulting to audio config as it's the decoder in this case which is usually the text backbone""" diff --git a/src/transformers/models/diffllama/configuration_diffllama.py b/src/transformers/models/diffllama/configuration_diffllama.py index d11bf227cc78..ae8daa030609 100644 --- a/src/transformers/models/diffllama/configuration_diffllama.py +++ b/src/transformers/models/diffllama/configuration_diffllama.py @@ -16,12 +16,15 @@ # limitations under the License. """DiffLlama model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring @auto_docstring(checkpoint="kajuma/DiffLlama-0.3B-handcut") +@strict(accept_kwargs=True) class DiffLlamaConfig(PreTrainedConfig): r""" lambda_std_dev (`float`, *optional*, defaults to 0.1): @@ -43,57 +46,34 @@ class DiffLlamaConfig(PreTrainedConfig): model_type = "diffllama" keys_to_ignore_at_inference = ["past_key_values"] - def __init__( - self, - vocab_size: int | None = 32000, - hidden_size: int | None = 2048, - intermediate_size: int | None = 8192, - num_hidden_layers: int | None = 16, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = None, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 2048, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-5, - use_cache: bool | None = True, - pad_token_id: int | None = None, - bos_token_id: int | None = 1, - eos_token_id: int | None = 2, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - lambda_std_dev: float | None = 0.1, - head_dim: int | None = None, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads + vocab_size: int = 32000 + hidden_size: int = 2048 + intermediate_size: int = 8192 + num_hidden_layers: int = 16 + num_attention_heads: int = 32 + num_key_value_heads: int | None = 
None + hidden_act: str = "silu" + max_position_embeddings: int = 2048 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-5 + use_cache: bool = True + pad_token_id: int | None = None + bos_token_id: int | None = 1 + eos_token_id: int | list[int] | None = 2 + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + attention_dropout: float | int | None = 0.0 + lambda_std_dev: float | None = 0.1 + head_dim: int | None = None + def __post_init__(self, **kwargs): # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.lambda_std_dev = lambda_std_dev - self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads - self.rope_parameters = rope_parameters + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - super().__init__(**kwargs) + self.head_dim = self.head_dim if self.head_dim is not None else self.hidden_size // self.num_attention_heads + super().__post_init__(**kwargs) __all__ = ["DiffLlamaConfig"] diff --git a/src/transformers/models/dinat/configuration_dinat.py b/src/transformers/models/dinat/configuration_dinat.py index bbfe9557f114..727048ebde01 100644 --- a/src/transformers/models/dinat/configuration_dinat.py +++ b/src/transformers/models/dinat/configuration_dinat.py @@ -13,15 +13,15 @@ # limitations under the License. 
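DiffLlama keeps its two backward-compatibility fallbacks, now located in `__post_init__`. A usage sketch of the derived defaults, assuming a build that includes this change:

from transformers import DiffLlamaConfig

config = DiffLlamaConfig(hidden_size=2048, num_attention_heads=32)
print(config.num_key_value_heads)  # 32 -- defaults to num_attention_heads
print(config.head_dim)             # 64 -- defaults to hidden_size // num_attention_heads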
"""Dilated Neighborhood Attention Transformer model configuration""" +from huggingface_hub.dataclasses import strict + from ...backbone_utils import BackboneConfigMixin from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - - -logger = logging.get_logger(__name__) +from ...utils import auto_docstring @auto_docstring(checkpoint="shi-labs/dinat-mini-in1k-224") +@strict(accept_kwargs=True) class DinatConfig(BackboneConfigMixin, PreTrainedConfig): r""" dilations (`list[list[int]]`, *optional*, defaults to `[[1, 8, 1], [1, 4, 1, 4], [1, 2, 1, 2, 1, 2], [1, 1, 1, 1, 1]]`): @@ -49,52 +49,37 @@ class DinatConfig(BackboneConfigMixin, PreTrainedConfig): "num_hidden_layers": "num_layers", } - def __init__( - self, - patch_size=4, - num_channels=3, - embed_dim=64, - depths=[3, 4, 6, 5], - num_heads=[2, 4, 8, 16], - kernel_size=7, - dilations=[[1, 8, 1], [1, 4, 1, 4], [1, 2, 1, 2, 1, 2], [1, 1, 1, 1, 1]], - mlp_ratio=3.0, - qkv_bias=True, - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - drop_path_rate=0.1, - hidden_act="gelu", - initializer_range=0.02, - layer_norm_eps=1e-5, - layer_scale_init_value=0.0, - out_features=None, - out_indices=None, - **kwargs, - ): - super().__init__(**kwargs) - - self.patch_size = patch_size - self.num_channels = num_channels - self.embed_dim = embed_dim - self.depths = depths - self.num_layers = len(depths) - self.num_heads = num_heads - self.kernel_size = kernel_size - self.dilations = dilations - self.mlp_ratio = mlp_ratio - self.qkv_bias = qkv_bias - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.drop_path_rate = drop_path_rate - self.hidden_act = hidden_act - self.layer_norm_eps = layer_norm_eps - self.initializer_range = initializer_range + patch_size: int | list[int] | tuple[int, int] = 4 + num_channels: int = 3 + embed_dim: int = 64 + depths: list[int] | tuple[int, ...] = (3, 4, 6, 5) + num_heads: list[int] | tuple[int, ...] 
= (2, 4, 8, 16) + kernel_size: int = 7 + dilations: list | tuple | None = None + mlp_ratio: float = 3.0 + qkv_bias: bool = True + hidden_dropout_prob: float = 0.0 + attention_probs_dropout_prob: float = 0.0 + drop_path_rate: float = 0.1 + hidden_act: str = "gelu" + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-5 + layer_scale_init_value: float = 0.0 + _out_features: list[str] | None = None + _out_indices: list[int] | None = None + + def __post_init__(self, **kwargs): + self.num_layers = len(self.depths) + self.dilations = self.dilations or [[1, 8, 1], [1, 4, 1, 4], [1, 2, 1, 2, 1, 2], [1, 1, 1, 1, 1]] + # we set the hidden_size attribute in order to make Dinat work with VisionEncoderDecoderModel # this indicates the channel dimension after the last stage of the model - self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1)) - self.layer_scale_init_value = layer_scale_init_value - self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)] - self.set_output_features_output_indices(out_indices=out_indices, out_features=out_features) + self.hidden_size = int(self.embed_dim * 2 ** (len(self.depths) - 1)) + self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(self.depths) + 1)] + self.set_output_features_output_indices( + out_indices=kwargs.pop("out_indices", None), out_features=kwargs.pop("out_features", None) + ) + super().__post_init__(**kwargs) __all__ = ["DinatConfig"] diff --git a/src/transformers/models/dinat/modeling_dinat.py b/src/transformers/models/dinat/modeling_dinat.py index 731e10a07db8..783231ff0192 100644 --- a/src/transformers/models/dinat/modeling_dinat.py +++ b/src/transformers/models/dinat/modeling_dinat.py @@ -601,7 +601,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -676,7 +676,7 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.dinat( pixel_values, @@ -771,7 +771,7 @@ def forward( >>> list(feature_maps[-1].shape) [1, 512, 7, 7] ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) diff --git a/src/transformers/models/dinov2/configuration_dinov2.py b/src/transformers/models/dinov2/configuration_dinov2.py index c24d99c2d1eb..ed37163e76fd 100644 --- a/src/transformers/models/dinov2/configuration_dinov2.py +++ b/src/transformers/models/dinov2/configuration_dinov2.py @@ -13,15 +13,15 @@ # limitations under the License. 
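The backbone-style configs (Dinat here, DINOv2 below) also move their derived bookkeeping (`num_layers`, `hidden_size`, `stage_names`, and the `out_features`/`out_indices` resolution) into `__post_init__`. A usage sketch for Dinat, assuming the derived values are still computed at construction time as before:

from transformers import DinatConfig

config = DinatConfig(embed_dim=64, depths=[3, 4, 6, 5])
print(config.num_layers)   # 4
print(config.hidden_size)  # 512 -- embed_dim * 2 ** (len(depths) - 1)
print(config.stage_names)  # ['stem', 'stage1', 'stage2', 'stage3', 'stage4']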
"""DINOv2 model configuration""" +from huggingface_hub.dataclasses import strict + from ...backbone_utils import BackboneConfigMixin from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - - -logger = logging.get_logger(__name__) +from ...utils import auto_docstring @auto_docstring(checkpoint="google/dinov2-base-patch16-224") +@strict(accept_kwargs=True) class Dinov2Config(BackboneConfigMixin, PreTrainedConfig): r""" layerscale_value (`float`, *optional*, defaults to 1.0): @@ -54,54 +54,34 @@ class Dinov2Config(BackboneConfigMixin, PreTrainedConfig): model_type = "dinov2" - def __init__( - self, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - mlp_ratio=4, - hidden_act="gelu", - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - initializer_range=0.02, - layer_norm_eps=1e-6, - image_size=224, - patch_size=14, - num_channels=3, - qkv_bias=True, - layerscale_value=1.0, - drop_path_rate=0.0, - use_swiglu_ffn=False, - out_features=None, - out_indices=None, - apply_layernorm=True, - reshape_hidden_states=True, - use_mask_token=True, - **kwargs, - ): - super().__init__(**kwargs) + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + mlp_ratio: int = 4 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.0 + attention_probs_dropout_prob: float = 0.0 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-6 + image_size: int | list[int] | tuple[int, int] = 224 + patch_size: int | list[int] | tuple[int, int] = 14 + num_channels: int = 3 + qkv_bias: bool = True + layerscale_value: float = 1.0 + drop_path_rate: float = 0.0 + use_swiglu_ffn: bool = False + _out_features: list[str] | None = None + _out_indices: list[int] | None = None + apply_layernorm: bool = True + reshape_hidden_states: bool = True + use_mask_token: bool = True - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.mlp_ratio = mlp_ratio - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.qkv_bias = qkv_bias - self.layerscale_value = layerscale_value - self.drop_path_rate = drop_path_rate - self.use_swiglu_ffn = use_swiglu_ffn - self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, num_hidden_layers + 1)] - self.set_output_features_output_indices(out_indices=out_indices, out_features=out_features) - self.apply_layernorm = apply_layernorm - self.reshape_hidden_states = reshape_hidden_states - self.use_mask_token = use_mask_token + def __post_init__(self, **kwargs): + self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, self.num_hidden_layers + 1)] + self.set_output_features_output_indices( + out_indices=kwargs.pop("out_indices", None), out_features=kwargs.pop("out_features", None) + ) + super().__post_init__(**kwargs) __all__ = ["Dinov2Config"] diff --git a/src/transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py b/src/transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py index 90933fe40018..8cb573af1f3d 100644 --- a/src/transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py +++ 
b/src/transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py @@ -20,12 +20,15 @@ # limitations under the License. +from huggingface_hub.dataclasses import strict + from ...backbone_utils import BackboneConfigMixin from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring @auto_docstring(checkpoint="facebook/dinov2-with-registers-base") +@strict(accept_kwargs=True) class Dinov2WithRegistersConfig(BackboneConfigMixin, PreTrainedConfig): r""" layerscale_value (`float`, *optional*, defaults to 1.0): @@ -58,54 +61,34 @@ class Dinov2WithRegistersConfig(BackboneConfigMixin, PreTrainedConfig): model_type = "dinov2_with_registers" - def __init__( - self, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - mlp_ratio=4, - hidden_act="gelu", - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - initializer_range=0.02, - layer_norm_eps=1e-6, - image_size=224, - patch_size=16, - num_channels=3, - qkv_bias=True, - layerscale_value=1.0, - drop_path_rate=0.0, - use_swiglu_ffn=False, - num_register_tokens=4, - out_features=None, - out_indices=None, - apply_layernorm=True, - reshape_hidden_states=True, - **kwargs, - ): - super().__init__(**kwargs) + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + mlp_ratio: int = 4 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.0 + attention_probs_dropout_prob: float = 0.0 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-6 + image_size: int | list[int] | tuple[int, int] = 224 + patch_size: int | list[int] | tuple[int, int] = 16 + num_channels: int = 3 + qkv_bias: bool = True + layerscale_value: float = 1.0 + drop_path_rate: float = 0.0 + use_swiglu_ffn: bool = False + num_register_tokens: int = 4 + _out_features: list[str] | None = None + _out_indices: list[int] | None = None + apply_layernorm: bool = True + reshape_hidden_states: bool = True - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.mlp_ratio = mlp_ratio - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.qkv_bias = qkv_bias - self.layerscale_value = layerscale_value - self.drop_path_rate = drop_path_rate - self.use_swiglu_ffn = use_swiglu_ffn - self.num_register_tokens = num_register_tokens - self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, num_hidden_layers + 1)] - self.set_output_features_output_indices(out_indices=out_indices, out_features=out_features) - self.apply_layernorm = apply_layernorm - self.reshape_hidden_states = reshape_hidden_states + def __post_init__(self, **kwargs): + self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, self.num_hidden_layers + 1)] + self.set_output_features_output_indices( + out_indices=kwargs.pop("out_indices", None), out_features=kwargs.pop("out_features", None) + ) + super().__post_init__(**kwargs) __all__ = ["Dinov2WithRegistersConfig"] diff --git a/src/transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py b/src/transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py index 9067b7ff0b7a..e99ebf2d4587 100644 --- a/src/transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py +++ 
b/src/transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py @@ -15,6 +15,7 @@ import torch +from huggingface_hub.dataclasses import strict from torch import nn from ....transformers.models.dinov2.modeling_dinov2 import ( @@ -37,6 +38,7 @@ @auto_docstring(checkpoint="facebook/dinov2-with-registers-base") +@strict(accept_kwargs=True) class Dinov2WithRegistersConfig(BackboneConfigMixin, PreTrainedConfig): r""" layerscale_value (`float`, *optional*, defaults to 1.0): @@ -69,54 +71,34 @@ class Dinov2WithRegistersConfig(BackboneConfigMixin, PreTrainedConfig): model_type = "dinov2_with_registers" - def __init__( - self, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - mlp_ratio=4, - hidden_act="gelu", - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - initializer_range=0.02, - layer_norm_eps=1e-6, - image_size=224, - patch_size=16, - num_channels=3, - qkv_bias=True, - layerscale_value=1.0, - drop_path_rate=0.0, - use_swiglu_ffn=False, - num_register_tokens=4, - out_features=None, - out_indices=None, - apply_layernorm=True, - reshape_hidden_states=True, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.mlp_ratio = mlp_ratio - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.qkv_bias = qkv_bias - self.layerscale_value = layerscale_value - self.drop_path_rate = drop_path_rate - self.use_swiglu_ffn = use_swiglu_ffn - self.num_register_tokens = num_register_tokens - self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, num_hidden_layers + 1)] - self.set_output_features_output_indices(out_indices=out_indices, out_features=out_features) - self.apply_layernorm = apply_layernorm - self.reshape_hidden_states = reshape_hidden_states + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + mlp_ratio: int = 4 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.0 + attention_probs_dropout_prob: float = 0.0 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-6 + image_size: int | list[int] | tuple[int, int] = 224 + patch_size: int | list[int] | tuple[int, int] = 16 + num_channels: int = 3 + qkv_bias: bool = True + layerscale_value: float = 1.0 + drop_path_rate: float = 0.0 + use_swiglu_ffn: bool = False + num_register_tokens: int = 4 + _out_features: list[str] | None = None + _out_indices: list[int] | None = None + apply_layernorm: bool = True + reshape_hidden_states: bool = True + + def __post_init__(self, **kwargs): + self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, self.num_hidden_layers + 1)] + self.set_output_features_output_indices( + out_indices=kwargs.pop("out_indices", None), out_features=kwargs.pop("out_features", None) + ) + super().__post_init__(**kwargs) class Dinov2WithRegistersPatchEmbeddings(Dinov2PatchEmbeddings): diff --git a/src/transformers/models/dinov3_convnext/configuration_dinov3_convnext.py b/src/transformers/models/dinov3_convnext/configuration_dinov3_convnext.py index fb8a3e50a6ef..6cd667d55fd2 100644 --- a/src/transformers/models/dinov3_convnext/configuration_dinov3_convnext.py +++ 
b/src/transformers/models/dinov3_convnext/configuration_dinov3_convnext.py @@ -13,15 +13,15 @@ # limitations under the License. """ConvNeXT model configuration""" +from huggingface_hub.dataclasses import strict + from ...backbone_utils import BackboneConfigMixin from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - - -logger = logging.get_logger(__name__) +from ...utils import auto_docstring @auto_docstring(checkpoint="facebook/dinov3-convnext-tiny-pretrain-lvd1689m") +@strict(accept_kwargs=True) class DINOv3ConvNextConfig(BackboneConfigMixin, PreTrainedConfig): r""" Example: @@ -40,34 +40,26 @@ class DINOv3ConvNextConfig(BackboneConfigMixin, PreTrainedConfig): model_type = "dinov3_convnext" - def __init__( - self, - num_channels: int = 3, - hidden_sizes: list[int] | None = None, - depths: list[int] | None = None, - hidden_act: str = "gelu", - initializer_range: float = 0.02, - layer_norm_eps: float = 1e-6, - layer_scale_init_value: float = 1e-6, - drop_path_rate: float = 0.0, - image_size: int = 224, - out_features: list[str] | None = None, - out_indices: list[int] | None = None, - **kwargs, - ): - super().__init__(**kwargs) + num_channels: int = 3 + hidden_sizes: list[int] | None = None + depths: list[int] | None = None + hidden_act: str = "gelu" + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-6 + layer_scale_init_value: float = 1e-6 + drop_path_rate: float = 0.0 + image_size: int | list[int] | tuple[int, int] = 224 + _out_features: list[str] | None = None + _out_indices: list[int] | None = None - self.num_channels = num_channels - self.hidden_sizes = [96, 192, 384, 768] if hidden_sizes is None else hidden_sizes - self.depths = [3, 3, 9, 3] if depths is None else depths - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.layer_scale_init_value = layer_scale_init_value - self.drop_path_rate = drop_path_rate - self.image_size = image_size + def __post_init__(self, **kwargs): + self.hidden_sizes = [96, 192, 384, 768] if self.hidden_sizes is None else self.hidden_sizes + self.depths = [3, 3, 9, 3] if self.depths is None else self.depths self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(self.depths) + 1)] - self.set_output_features_output_indices(out_indices=out_indices, out_features=out_features) + self.set_output_features_output_indices( + out_indices=kwargs.pop("out_indices", None), out_features=kwargs.pop("out_features", None) + ) + super().__post_init__(**kwargs) @property def num_stages(self) -> int: diff --git a/src/transformers/models/dinov3_vit/configuration_dinov3_vit.py b/src/transformers/models/dinov3_vit/configuration_dinov3_vit.py index a29ecfa80189..084a49b606fc 100644 --- a/src/transformers/models/dinov3_vit/configuration_dinov3_vit.py +++ b/src/transformers/models/dinov3_vit/configuration_dinov3_vit.py @@ -13,15 +13,15 @@ # limitations under the License. 
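DINOv3 ConvNeXt shows how mutable list defaults are handled under the dataclass form: `hidden_sizes` and `depths` default to `None` at class level and are filled in `__post_init__`, which avoids sharing a mutable default between instances. A usage sketch, assuming a build that includes this change:

from transformers import DINOv3ConvNextConfig

config = DINOv3ConvNextConfig()
print(config.hidden_sizes)  # [96, 192, 384, 768]
print(config.depths)        # [3, 3, 9, 3]
print(config.stage_names)   # ['stem', 'stage1', 'stage2', 'stage3', 'stage4']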
"""DINOv3 model configuration""" +from huggingface_hub.dataclasses import strict + from ...backbone_utils import BackboneConfigMixin from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - - -logger = logging.get_logger(__name__) +from ...utils import auto_docstring @auto_docstring(checkpoint="facebook/dinov3-vits16-pretrain-lvd1689m") +@strict(accept_kwargs=True) class DINOv3ViTConfig(BackboneConfigMixin, PreTrainedConfig): r""" rope_theta (`float`, *optional*, defaults to 100.0): @@ -75,79 +75,41 @@ class DINOv3ViTConfig(BackboneConfigMixin, PreTrainedConfig): model_type = "dinov3_vit" - def __init__( - self, - patch_size: int = 16, - hidden_size: int = 384, - intermediate_size: int = 1536, - num_hidden_layers: int = 12, - num_attention_heads: int = 6, - hidden_act: str = "gelu", - attention_dropout: float = 0.0, - initializer_range: float = 0.02, - layer_norm_eps: float = 1e-5, - rope_theta: float = 100.0, - image_size: int = 224, - num_channels: int = 3, - query_bias: bool = True, - key_bias: bool = False, - value_bias: bool = True, - proj_bias: bool = True, - mlp_bias: bool = True, - layerscale_value: float = 1.0, - drop_path_rate: float = 0.0, - use_gated_mlp: bool = False, - num_register_tokens: int = 0, - # train augs - pos_embed_shift: float | None = None, - pos_embed_jitter: float | None = None, - pos_embed_rescale: float | None = 2.0, - out_features: list[str] | None = None, - out_indices: list[int] | None = None, - apply_layernorm: bool = True, - reshape_hidden_states: bool = True, - return_class_token: bool = False, - **kwargs, - ): - super().__init__(**kwargs) - - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.attention_dropout = attention_dropout - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.layerscale_value = layerscale_value - self.drop_path_rate = drop_path_rate - self.use_gated_mlp = use_gated_mlp - self.rope_theta = rope_theta - self.query_bias = query_bias - self.key_bias = key_bias - self.value_bias = value_bias - self.proj_bias = proj_bias - self.mlp_bias = mlp_bias - self.num_register_tokens = num_register_tokens - - # train augs - self.pos_embed_shift = pos_embed_shift - self.pos_embed_jitter = pos_embed_jitter - self.pos_embed_rescale = pos_embed_rescale - # Initialize backbone-specific configuration - self.apply_layernorm = apply_layernorm - self.reshape_hidden_states = reshape_hidden_states - self.return_class_token = return_class_token - - # Initialize backbone stage names - stage_names = ["stem"] + [f"stage{i}" for i in range(1, num_hidden_layers + 1)] - self.stage_names = stage_names - - # Initialize backbone features/indices - self.set_output_features_output_indices(out_indices=out_indices, out_features=out_features) + patch_size: int | list[int] | tuple[int, int] = 16 + hidden_size: int = 384 + intermediate_size: int = 1536 + num_hidden_layers: int = 12 + num_attention_heads: int = 6 + hidden_act: str = "gelu" + attention_dropout: float | int = 0.0 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-5 + rope_theta: float = 100.0 + image_size: int | list[int] | tuple[int, int] = 224 + num_channels: int = 3 + query_bias: bool = True + key_bias: bool = False + value_bias: bool = True + proj_bias: bool = True 
+ mlp_bias: bool = True + layerscale_value: float = 1.0 + drop_path_rate: float = 0.0 + use_gated_mlp: bool = False + num_register_tokens: int = 0 + pos_embed_shift: float | None = None + pos_embed_jitter: float | None = None + pos_embed_rescale: float | None = 2.0 + _out_features: list[str] | None = None + _out_indices: list[int] | None = None + apply_layernorm: bool = True + reshape_hidden_states: bool = True + + def __post_init__(self, **kwargs): + self.stage_names = ["stem"] + [f"stage{i}" for i in range(1, self.num_hidden_layers + 1)] + self.set_output_features_output_indices( + out_indices=kwargs.pop("out_indices", None), out_features=kwargs.pop("out_features", None) + ) + super().__post_init__(**kwargs) __all__ = ["DINOv3ViTConfig"] diff --git a/src/transformers/models/distilbert/configuration_distilbert.py b/src/transformers/models/distilbert/configuration_distilbert.py index d303ff401779..d9ac640471fa 100644 --- a/src/transformers/models/distilbert/configuration_distilbert.py +++ b/src/transformers/models/distilbert/configuration_distilbert.py @@ -13,14 +13,14 @@ # limitations under the License. """DistilBERT model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="google/distilbert-base-uncased") +@strict(accept_kwargs=True) class DistilBertConfig(PreTrainedConfig): r""" sinusoidal_pos_embds (`boolean`, *optional*, defaults to `False`): @@ -55,45 +55,23 @@ class DistilBertConfig(PreTrainedConfig): "num_hidden_layers": "n_layers", } - def __init__( - self, - vocab_size=30522, - max_position_embeddings=512, - sinusoidal_pos_embds=False, - n_layers=6, - n_heads=12, - dim=768, - hidden_dim=4 * 768, - dropout=0.1, - attention_dropout=0.1, - activation="gelu", - initializer_range=0.02, - qa_dropout=0.1, - seq_classif_dropout=0.2, - pad_token_id=0, - eos_token_id=None, - bos_token_id=None, - tie_word_embeddings=True, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.sinusoidal_pos_embds = sinusoidal_pos_embds - self.n_layers = n_layers - self.n_heads = n_heads - self.dim = dim - self.hidden_dim = hidden_dim - self.dropout = dropout - self.attention_dropout = attention_dropout - self.activation = activation - self.initializer_range = initializer_range - self.qa_dropout = qa_dropout - self.seq_classif_dropout = seq_classif_dropout - self.pad_token_id = pad_token_id - self.eos_token_id = eos_token_id - self.bos_token_id = bos_token_id - self.tie_word_embeddings = tie_word_embeddings - super().__init__(**kwargs) + vocab_size: int = 30522 + max_position_embeddings: int = 512 + sinusoidal_pos_embds: bool = False + n_layers: int = 6 + n_heads: int = 12 + dim: int = 768 + hidden_dim: int = 4 * 768 + dropout: float | int = 0.1 + attention_dropout: float | int = 0.1 + activation: str = "gelu" + initializer_range: float = 0.02 + qa_dropout: float | int = 0.1 + seq_classif_dropout: float | int = 0.2 + pad_token_id: int | None = 0 + eos_token_id: int | list[int] | None = None + bos_token_id: int | None = None + tie_word_embeddings: bool = True __all__ = ["DistilBertConfig"] diff --git a/src/transformers/models/doge/configuration_doge.py b/src/transformers/models/doge/configuration_doge.py index 5acec77f6b40..cf861f30942c 100644 --- 
a/src/transformers/models/doge/configuration_doge.py +++ b/src/transformers/models/doge/configuration_doge.py @@ -19,12 +19,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring @auto_docstring(checkpoint="SmallDoge/Doge-320M") +@strict(accept_kwargs=True) class DogeConfig(PreTrainedConfig): r""" keep_window_size (`int`, *optional*, defaults to 2048): @@ -67,74 +70,41 @@ class DogeConfig(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size: int | None = 32768, - hidden_size: int | None = 1024, - intermediate_size: int | None = 2048, - num_hidden_layers: int | None = 32, - hidden_dropout: float | None = 0.0, - hidden_act: str | None = "silu", - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-06, - use_cache: bool | None = True, - tie_word_embeddings: bool | None = False, - max_position_embeddings: int | None = 2048, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - num_attention_heads: int | None = 8, - num_key_value_heads: int | None = None, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - mlp_bias: bool | None = False, - sliding_window: int | None = None, - keep_window_size: int | None = 2048, - is_moe: bool | None = False, - num_experts: int | None = 16384, - num_experts_per_tok: int | None = 64, - norm_topk_prob: bool | None = False, - output_router_logits: bool | None = False, - router_aux_loss_coef: float | None = 0.001, - pad_token_id: int | None = None, - bos_token_id: int | None = None, - eos_token_id: int | None = None, - **kwargs, - ): - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - - self.hidden_dropout = hidden_dropout - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - - self.max_position_embeddings = max_position_embeddings - self.num_attention_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.mlp_bias = mlp_bias - self.sliding_window = sliding_window - self.keep_window_size = keep_window_size - self.is_moe = is_moe - self.num_experts = num_experts - self.num_experts_per_tok = num_experts_per_tok - self.norm_topk_prob = norm_topk_prob - self.output_router_logits = output_router_logits - self.router_aux_loss_coef = router_aux_loss_coef - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.rope_parameters = rope_parameters + vocab_size: int = 32768 + hidden_size: int = 1024 + intermediate_size: int = 2048 + num_hidden_layers: int = 32 + hidden_dropout: float | int = 0.0 + hidden_act: str = "silu" + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-06 + use_cache: bool = True + tie_word_embeddings: bool = False + max_position_embeddings: int = 2048 + rope_parameters: RopeParameters | dict | None = None + num_attention_heads: int = 8 + num_key_value_heads: int | None = None + attention_bias: bool = False + 
attention_dropout: float | None = 0.0 + mlp_bias: bool = False + sliding_window: int | None = None + keep_window_size: int = 2048 + is_moe: bool = False + num_experts: int = 16384 + num_experts_per_tok: int = 64 + norm_topk_prob: bool = False + output_router_logits: bool = False + router_aux_loss_coef: float = 0.001 + pad_token_id: int | None = None + bos_token_id: int | None = None + eos_token_id: int | list[int] | None = None + def __post_init__(self, **kwargs): # for backward compatibility - if num_key_value_heads is None: - self.num_key_value_heads = num_attention_heads + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["DogeConfig"] diff --git a/src/transformers/models/doge/modular_doge.py b/src/transformers/models/doge/modular_doge.py index e1ca0b071fd1..2b75c5635101 100644 --- a/src/transformers/models/doge/modular_doge.py +++ b/src/transformers/models/doge/modular_doge.py @@ -21,6 +21,7 @@ import torch import torch.nn.functional as F +from huggingface_hub.dataclasses import strict from torch import nn from ... import initialization as init @@ -55,6 +56,7 @@ @auto_docstring(checkpoint="SmallDoge/Doge-320M") +@strict(accept_kwargs=True) class DogeConfig(PreTrainedConfig): r""" keep_window_size (`int`, *optional*, defaults to 2048): @@ -97,74 +99,41 @@ class DogeConfig(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size: int | None = 32768, - hidden_size: int | None = 1024, - intermediate_size: int | None = 2048, - num_hidden_layers: int | None = 32, - hidden_dropout: float | None = 0.0, - hidden_act: str | None = "silu", - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-06, - use_cache: bool | None = True, - tie_word_embeddings: bool | None = False, - max_position_embeddings: int | None = 2048, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - num_attention_heads: int | None = 8, - num_key_value_heads: int | None = None, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - mlp_bias: bool | None = False, - sliding_window: int | None = None, - keep_window_size: int | None = 2048, - is_moe: bool | None = False, - num_experts: int | None = 16384, - num_experts_per_tok: int | None = 64, - norm_topk_prob: bool | None = False, - output_router_logits: bool | None = False, - router_aux_loss_coef: float | None = 0.001, - pad_token_id: int | None = None, - bos_token_id: int | None = None, - eos_token_id: int | None = None, - **kwargs, - ): - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - - self.hidden_dropout = hidden_dropout - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - - self.max_position_embeddings = max_position_embeddings - self.num_attention_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.mlp_bias = mlp_bias - self.sliding_window = sliding_window - self.keep_window_size = keep_window_size - self.is_moe = is_moe - self.num_experts = num_experts - self.num_experts_per_tok = num_experts_per_tok - self.norm_topk_prob = norm_topk_prob - self.output_router_logits = output_router_logits - self.router_aux_loss_coef = 
router_aux_loss_coef - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.rope_parameters = rope_parameters - + vocab_size: int = 32768 + hidden_size: int = 1024 + intermediate_size: int = 2048 + num_hidden_layers: int = 32 + hidden_dropout: float | int = 0.0 + hidden_act: str = "silu" + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-06 + use_cache: bool = True + tie_word_embeddings: bool = False + max_position_embeddings: int = 2048 + rope_parameters: RopeParameters | dict | None = None + num_attention_heads: int = 8 + num_key_value_heads: int | None = None + attention_bias: bool = False + attention_dropout: float | None = 0.0 + mlp_bias: bool = False + sliding_window: int | None = None + keep_window_size: int = 2048 + is_moe: bool = False + num_experts: int = 16384 + num_experts_per_tok: int = 64 + norm_topk_prob: bool = False + output_router_logits: bool = False + router_aux_loss_coef: float = 0.001 + pad_token_id: int | None = None + bos_token_id: int | None = None + eos_token_id: int | list[int] | None = None + + def __post_init__(self, **kwargs): # for backward compatibility - if num_key_value_heads is None: - self.num_key_value_heads = num_attention_heads + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads - super().__init__(**kwargs) + super().__post_init__(**kwargs) class DogeRMSNorm(LlamaRMSNorm): diff --git a/src/transformers/models/donut/configuration_donut_swin.py b/src/transformers/models/donut/configuration_donut_swin.py index f941213c9072..08dc54796e1a 100644 --- a/src/transformers/models/donut/configuration_donut_swin.py +++ b/src/transformers/models/donut/configuration_donut_swin.py @@ -13,14 +13,14 @@ # limitations under the License. """Donut Swin Transformer model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="naver-clova-ix/donut-base") +@strict(accept_kwargs=True) class DonutSwinConfig(PreTrainedConfig): r""" window_size (`int`, *optional*, defaults to 7): Size of windows. 
@@ -47,48 +47,29 @@ class DonutSwinConfig(PreTrainedConfig): "num_hidden_layers": "num_layers", } - def __init__( - self, - image_size=224, - patch_size=4, - num_channels=3, - embed_dim=96, - depths=[2, 2, 6, 2], - num_heads=[3, 6, 12, 24], - window_size=7, - mlp_ratio=4.0, - qkv_bias=True, - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - drop_path_rate=0.1, - hidden_act="gelu", - use_absolute_embeddings=False, - initializer_range=0.02, - layer_norm_eps=1e-5, - **kwargs, - ): - super().__init__(**kwargs) - - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.embed_dim = embed_dim - self.depths = depths - self.num_layers = len(depths) - self.num_heads = num_heads - self.window_size = window_size - self.mlp_ratio = mlp_ratio - self.qkv_bias = qkv_bias - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.drop_path_rate = drop_path_rate - self.hidden_act = hidden_act - self.use_absolute_embeddings = use_absolute_embeddings - self.layer_norm_eps = layer_norm_eps - self.initializer_range = initializer_range + image_size: int | list[int] | tuple[int, int] = 224 + patch_size: int | list[int] | tuple[int, int] = 4 + num_channels: int = 3 + embed_dim: int = 96 + depths: list[int] | tuple[int, ...] = (2, 2, 6, 2) + num_heads: list[int] | tuple[int, ...] = (3, 6, 12, 24) + window_size: int = 7 + mlp_ratio: float = 4.0 + qkv_bias: bool = True + hidden_dropout_prob: float = 0.0 + attention_probs_dropout_prob: float = 0.0 + drop_path_rate: float = 0.1 + hidden_act: str = "gelu" + use_absolute_embeddings: bool = False + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-5 + + def __post_init__(self, **kwargs): + self.num_layers = len(self.depths) # we set the hidden_size attribute in order to make Swin work with VisionEncoderDecoderModel # this indicates the channel dimension after the last stage of the model - self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1)) + self.hidden_size = int(self.embed_dim * 2 ** (len(self.depths) - 1)) + super().__post_init__(**kwargs) __all__ = ["DonutSwinConfig"] diff --git a/src/transformers/models/donut/modeling_donut_swin.py b/src/transformers/models/donut/modeling_donut_swin.py index e1e67df36b6a..b3521146938a 100644 --- a/src/transformers/models/donut/modeling_donut_swin.py +++ b/src/transformers/models/donut/modeling_donut_swin.py @@ -848,7 +848,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -933,7 +933,7 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.donut( pixel_values, diff --git a/src/transformers/models/dots1/configuration_dots1.py b/src/transformers/models/dots1/configuration_dots1.py index d9ea018ecdca..ef9ec8407528 100644 --- a/src/transformers/models/dots1/configuration_dots1.py +++ b/src/transformers/models/dots1/configuration_dots1.py @@ -12,15 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters -from ...utils import auto_docstring, logging +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...modeling_rope_utils import RopeParameters +from ...utils import auto_docstring @auto_docstring(checkpoint="rednote-hilab/dots.llm1.base") +@strict(accept_kwargs=True) class Dots1Config(PreTrainedConfig): r""" n_group (`int`, *optional*, defaults to 1): @@ -69,68 +70,41 @@ class Dots1Config(PreTrainedConfig): "num_local_experts": "n_routed_experts", } - def __init__( - self, - vocab_size: int | None = 152064, - hidden_size: int | None = 4608, - intermediate_size: int | None = 10944, - moe_intermediate_size: int | None = 1408, - num_hidden_layers: int | None = 62, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = 32, - n_shared_experts: int | None = None, - n_routed_experts: int | None = None, - n_group: int | None = 1, - topk_group: int | None = 1, - num_experts_per_tok: int | None = None, - first_k_dense_replace: int | None = 0, - norm_topk_prob: bool | None = False, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 2048, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-6, - use_cache: bool | None = True, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - routed_scaling_factor: float | None = 1.0, - sliding_window: int | None = 4096, - max_window_layers: int | None = 62, - layer_types: list[str] | None = None, - pad_token_id: int | None = None, - bos_token_id: int | None = None, - eos_token_id: int | None = None, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.moe_intermediate_size = moe_intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.n_shared_experts = n_shared_experts - self.n_routed_experts = n_routed_experts - self.num_experts_per_tok = num_experts_per_tok - self.first_k_dense_replace = first_k_dense_replace - self.norm_topk_prob = norm_topk_prob - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - self.n_group = n_group - self.topk_group = topk_group - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.routed_scaling_factor = routed_scaling_factor - self.sliding_window = sliding_window 
- self.max_window_layers = max_window_layers - - self.layer_types = layer_types + vocab_size: int = 152064 + hidden_size: int = 4608 + intermediate_size: int = 10944 + moe_intermediate_size: int = 1408 + num_hidden_layers: int = 62 + num_attention_heads: int = 32 + num_key_value_heads: int | None = 32 + n_shared_experts: int | None = None + n_routed_experts: int | None = None + n_group: int | None = 1 + topk_group: int | None = 1 + num_experts_per_tok: int | None = None + first_k_dense_replace: int | None = 0 + norm_topk_prob: bool | None = False + hidden_act: str = "silu" + max_position_embeddings: int = 2048 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-6 + use_cache: bool = True + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + attention_dropout: float | int | None = 0.0 + routed_scaling_factor: float = 1.0 + sliding_window: int | None = 4096 + max_window_layers: int | None = 62 + layer_types: list[str] | None = None + pad_token_id: int | None = None + bos_token_id: int | None = None + eos_token_id: int | list[int] | None = None + + def __post_init__(self, **kwargs): + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads + if self.layer_types is None: self.layer_types = [ "sliding_attention" @@ -138,15 +112,8 @@ def __init__( else "full_attention" for i in range(self.num_hidden_layers) ] - layer_type_validation(self.layer_types, self.num_hidden_layers) - - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.rope_parameters = rope_parameters - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["Dots1Config"] diff --git a/src/transformers/models/dpr/configuration_dpr.py b/src/transformers/models/dpr/configuration_dpr.py index f2b601924760..8915686e8790 100644 --- a/src/transformers/models/dpr/configuration_dpr.py +++ b/src/transformers/models/dpr/configuration_dpr.py @@ -13,14 +13,14 @@ # limitations under the License. 
"""DPR model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="facebook/dpr-ctx_encoder-single-nq-base") +@strict(accept_kwargs=True) class DPRConfig(PreTrainedConfig): r""" Example: @@ -40,48 +40,24 @@ class DPRConfig(PreTrainedConfig): model_type = "dpr" - def __init__( - self, - vocab_size=30522, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=2, - initializer_range=0.02, - layer_norm_eps=1e-12, - pad_token_id=0, - bos_token_id=None, - eos_token_id=None, - projection_dim: int = 0, - is_decoder=False, - add_cross_attention=False, - **kwargs, - ): - super().__init__(**kwargs) - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.is_decoder = is_decoder - self.add_cross_attention = add_cross_attention - - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.projection_dim = projection_dim + vocab_size: int = 30522 + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.1 + attention_probs_dropout_prob: float = 0.1 + max_position_embeddings: int = 512 + type_vocab_size: int = 2 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + pad_token_id: int | None = 0 + bos_token_id: int | None = None + eos_token_id: int | list[int] | None = None + projection_dim: int = 0 + is_decoder: bool = False + add_cross_attention: bool = False __all__ = ["DPRConfig"] diff --git a/src/transformers/models/dpr/modeling_dpr.py b/src/transformers/models/dpr/modeling_dpr.py index f8a3ce861d84..3ed9a1759db6 100644 --- a/src/transformers/models/dpr/modeling_dpr.py +++ b/src/transformers/models/dpr/modeling_dpr.py @@ -326,7 +326,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -432,7 +432,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and 
inputs_embeds at the same time") @@ -538,7 +538,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") diff --git a/src/transformers/models/dpt/configuration_dpt.py b/src/transformers/models/dpt/configuration_dpt.py index c9e0c8ca9e2b..f3e86eb4d735 100644 --- a/src/transformers/models/dpt/configuration_dpt.py +++ b/src/transformers/models/dpt/configuration_dpt.py @@ -13,16 +13,16 @@ # limitations under the License. """DPT model configuration""" +from huggingface_hub.dataclasses import strict + from ...backbone_utils import consolidate_backbone_kwargs_to_config from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging +from ...utils import auto_docstring from ..auto.configuration_auto import AutoConfig -logger = logging.get_logger(__name__) - - @auto_docstring(checkpoint="Intel/dpt-large") +@strict(accept_kwargs=True) class DPTConfig(PreTrainedConfig): r""" is_hybrid (`bool`, *optional*, defaults to `False`): @@ -84,54 +84,52 @@ class DPTConfig(PreTrainedConfig): model_type = "dpt" sub_configs = {"backbone_config": AutoConfig} - def __init__( - self, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - initializer_range=0.02, - layer_norm_eps=1e-12, - image_size=384, - patch_size=16, - num_channels=3, - is_hybrid=False, - qkv_bias=True, - backbone_out_indices=[2, 5, 8, 11], - readout_type="project", - reassemble_factors=[4, 2, 1, 0.5], - neck_hidden_sizes=[96, 192, 384, 768], - fusion_hidden_size=256, - head_in_index=-1, - use_batch_norm_in_fusion_residual=False, - use_bias_in_fusion_residual=None, - add_projection=False, - use_auxiliary_head=True, - auxiliary_loss_weight=0.4, - semantic_loss_ignore_index=255, - semantic_classifier_dropout=0.1, - backbone_featmap_shape=[1, 1024, 24, 24], - neck_ignore_stages=[0, 1], - backbone_config=None, - pooler_output_size=None, - pooler_act="tanh", - **kwargs, - ): - self.hidden_size = hidden_size - self.is_hybrid = is_hybrid - - if readout_type not in ["ignore", "add", "project"]: + # NOTE: some values are typed as `None` on purpose + # DPT creates one of: backbone or the general model only + # so official checkpoint saved them as `None` + hidden_size: int = 768 + num_hidden_layers: None | int = 12 + num_attention_heads: int | None = 12 + intermediate_size: int | None = 3072 + hidden_act: str = "gelu" + hidden_dropout_prob: float | int | None = 0.0 + attention_probs_dropout_prob: float | int | None = 0.0 + initializer_range: float = 0.02 + layer_norm_eps: float | None = 1e-12 + image_size: int | list[int] | tuple[int, int] | None = 384 + patch_size: int | list[int] | tuple[int, int] | None = 16 + num_channels: int | None = 3 + is_hybrid: bool = False + qkv_bias: bool | None = True + backbone_out_indices: list[int] | tuple[int, ...] | None = (2, 5, 8, 11) + readout_type: str = "project" + reassemble_factors: list[int | float] | tuple[int | float, ...] = (4, 2, 1, 0.5) + neck_hidden_sizes: list[int] | tuple[int, ...] 
= (96, 192, 384, 768) + fusion_hidden_size: int = 256 + head_in_index: int = -1 + use_batch_norm_in_fusion_residual: bool | None = False + use_bias_in_fusion_residual: bool | None = None + add_projection: bool = False + use_auxiliary_head: bool | None = True + auxiliary_loss_weight: float = 0.4 + semantic_loss_ignore_index: int = 255 + semantic_classifier_dropout: float | int = 0.1 + backbone_featmap_shape: list[int] | tuple[int, ...] | None = (1, 1024, 24, 24) + neck_ignore_stages: list[int] | tuple[int, ...] = (0, 1) + backbone_config: dict | PreTrainedConfig | None = None + pooler_output_size: int | None = None + pooler_act: str = "tanh" + + def __post_init__(self, **kwargs): + if self.readout_type not in ["ignore", "add", "project"]: raise ValueError("Readout_type must be one of ['ignore', 'add', 'project']") if self.is_hybrid: - if isinstance(backbone_config, dict): - backbone_config.setdefault("model_type", "bit") + if isinstance(self.backbone_config, dict): + self.backbone_config.setdefault("model_type", "bit") - backbone_config, kwargs = consolidate_backbone_kwargs_to_config( - backbone_config=backbone_config, + self.backbone_config, kwargs = consolidate_backbone_kwargs_to_config( + backbone_config=self.backbone_config, default_config_type="bit", default_config_kwargs={ "global_padding": "same", @@ -142,51 +140,19 @@ def __init__( }, **kwargs, ) - if readout_type != "project": + if self.readout_type != "project": raise ValueError("Readout type must be 'project' when using `DPT-hybrid` mode.") - elif kwargs.get("backbone") is not None or backbone_config is not None: - backbone_config, kwargs = consolidate_backbone_kwargs_to_config( - backbone_config=backbone_config, + elif kwargs.get("backbone") is not None or self.backbone_config is not None: + self.backbone_config, kwargs = consolidate_backbone_kwargs_to_config( + backbone_config=self.backbone_config, **kwargs, ) - backbone_out_indices = None - - self.backbone_config = backbone_config - - # ViT parameters used if not using a hybrid backbone - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.layer_norm_eps = layer_norm_eps - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.qkv_bias = qkv_bias - self.backbone_out_indices = backbone_out_indices - self.backbone_featmap_shape = backbone_featmap_shape if is_hybrid else None - self.neck_ignore_stages = neck_ignore_stages if is_hybrid else [] - - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.readout_type = readout_type - self.reassemble_factors = reassemble_factors - self.neck_hidden_sizes = neck_hidden_sizes - self.fusion_hidden_size = fusion_hidden_size - self.head_in_index = head_in_index - self.use_batch_norm_in_fusion_residual = use_batch_norm_in_fusion_residual - self.use_bias_in_fusion_residual = use_bias_in_fusion_residual - self.add_projection = add_projection - - # auxiliary head attributes (semantic segmentation) - self.use_auxiliary_head = use_auxiliary_head - self.auxiliary_loss_weight = auxiliary_loss_weight - self.semantic_loss_ignore_index = semantic_loss_ignore_index - self.semantic_classifier_dropout = semantic_classifier_dropout - self.pooler_output_size = pooler_output_size if pooler_output_size else hidden_size - self.pooler_act = pooler_act - super().__init__(**kwargs) + 
self.backbone_out_indices = None + + self.backbone_featmap_shape = self.backbone_featmap_shape if self.is_hybrid else None + self.neck_ignore_stages = self.neck_ignore_stages if self.is_hybrid else [] + self.pooler_output_size = self.pooler_output_size if self.pooler_output_size else self.hidden_size + super().__post_init__(**kwargs) __all__ = ["DPTConfig"] diff --git a/src/transformers/models/edgetam/configuration_edgetam.py b/src/transformers/models/edgetam/configuration_edgetam.py index 84811636abbc..bf23b76e2ef7 100644 --- a/src/transformers/models/edgetam/configuration_edgetam.py +++ b/src/transformers/models/edgetam/configuration_edgetam.py @@ -17,12 +17,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring from ..auto import CONFIG_MAPPING, AutoConfig @auto_docstring(checkpoint="yonigozlan/EdgeTAM-hf") +@strict(accept_kwargs=True) class EdgeTamVisionConfig(PreTrainedConfig): r""" backbone_channel_list (`List[int]`, *optional*, defaults to `[384, 192, 96, 48]`): @@ -49,56 +52,41 @@ class EdgeTamVisionConfig(PreTrainedConfig): "backbone_config": AutoConfig, } - def __init__( - self, - backbone_config=None, - backbone_channel_list=None, - backbone_feature_sizes=None, - fpn_hidden_size=256, - fpn_kernel_size=1, - fpn_stride=1, - fpn_padding=0, - fpn_top_down_levels=None, - num_feature_levels=3, - hidden_act="gelu", - layer_norm_eps=1e-6, - initializer_range=0.02, - **kwargs, - ): - backbone_channel_list = [384, 192, 96, 48] if backbone_channel_list is None else backbone_channel_list - backbone_feature_sizes = ( - [[256, 256], [128, 128], [64, 64]] if backbone_feature_sizes is None else backbone_feature_sizes + backbone_config: dict | PreTrainedConfig | None = None + backbone_channel_list: list[int] | None = None + backbone_feature_sizes: list | None = None + fpn_hidden_size: int = 256 + fpn_kernel_size: int = 1 + fpn_stride: int = 1 + fpn_padding: int = 0 + fpn_top_down_levels: list[int] | None = None + num_feature_levels: int = 3 + hidden_act: str = "gelu" + layer_norm_eps: float = 1e-6 + initializer_range: float = 0.02 + + def __post_init__(self, **kwargs): + self.backbone_channel_list = ( + [384, 192, 96, 48] if self.backbone_channel_list is None else self.backbone_channel_list + ) + self.backbone_feature_sizes = ( + [[256, 256], [128, 128], [64, 64]] if self.backbone_feature_sizes is None else self.backbone_feature_sizes ) - fpn_top_down_levels = [2, 3] if fpn_top_down_levels is None else fpn_top_down_levels + self.fpn_top_down_levels = [2, 3] if self.fpn_top_down_levels is None else self.fpn_top_down_levels - if isinstance(backbone_config, dict): - backbone_config["model_type"] = backbone_config.get("model_type", "timm_wrapper") - backbone_config = CONFIG_MAPPING[backbone_config["model_type"]](**backbone_config) - elif backbone_config is None: - backbone_config = AutoConfig.from_pretrained( + if isinstance(self.backbone_config, dict): + self.backbone_config["model_type"] = self.backbone_config.get("model_type", "timm_wrapper") + self.backbone_config = CONFIG_MAPPING[self.backbone_config["model_type"]](**self.backbone_config) + elif self.backbone_config is None: + self.backbone_config = AutoConfig.from_pretrained( "timm/repvit_m1.dist_in1k", model_args={"in_chans": 3, "features_only": True, "out_indices": [0, 1, 2, 3]}, 
) - - self.backbone_config = backbone_config - - # Neck - self.backbone_channel_list = backbone_channel_list - self.backbone_feature_sizes = backbone_feature_sizes - self.fpn_hidden_size = fpn_hidden_size - self.fpn_kernel_size = fpn_kernel_size - self.fpn_stride = fpn_stride - self.fpn_padding = fpn_padding - self.fpn_top_down_levels = fpn_top_down_levels - self.num_feature_levels = num_feature_levels - - self.hidden_act = hidden_act - self.layer_norm_eps = layer_norm_eps - self.initializer_range = initializer_range - super().__init__(**kwargs) + super().__post_init__(**kwargs) @auto_docstring(checkpoint="yonigozlan/EdgeTAM-hf") +@strict(accept_kwargs=True) class EdgeTamPromptEncoderConfig(PreTrainedConfig): r""" mask_input_channels (`int`, *optional*, defaults to 16): @@ -111,30 +99,18 @@ class EdgeTamPromptEncoderConfig(PreTrainedConfig): base_config_key = "prompt_encoder_config" - def __init__( - self, - hidden_size=256, - image_size=1024, - patch_size=16, - mask_input_channels=16, - num_point_embeddings=4, - hidden_act="gelu", - layer_norm_eps=1e-6, - scale=1, - **kwargs, - ): - super().__init__(**kwargs) - self.hidden_size = hidden_size - self.image_size = image_size - self.patch_size = patch_size - self.mask_input_channels = mask_input_channels - self.num_point_embeddings = num_point_embeddings - self.hidden_act = hidden_act - self.layer_norm_eps = layer_norm_eps - self.scale = scale + hidden_size: int = 256 + image_size: int | list[int] | tuple[int, int] = 1024 + patch_size: int | list[int] | tuple[int, int] = 16 + mask_input_channels: int = 16 + num_point_embeddings: int = 4 + hidden_act: str = "gelu" + layer_norm_eps: float = 1e-6 + scale: int = 1 @auto_docstring(checkpoint="yonigozlan/EdgeTAM-hf") +@strict(accept_kwargs=True) class EdgeTamMaskDecoderConfig(PreTrainedConfig): r""" mlp_dim (`int`, *optional*, defaults to 2048): @@ -157,42 +133,22 @@ class EdgeTamMaskDecoderConfig(PreTrainedConfig): base_config_key = "mask_decoder_config" - def __init__( - self, - hidden_size=256, - hidden_act="gelu", - mlp_dim=2048, - num_hidden_layers=2, - num_attention_heads=8, - attention_downsample_rate=2, - num_multimask_outputs=3, - iou_head_depth=3, - iou_head_hidden_dim=256, - dynamic_multimask_via_stability=True, - dynamic_multimask_stability_delta=0.05, - dynamic_multimask_stability_thresh=0.98, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.num_multimask_outputs = num_multimask_outputs - self.hidden_act = hidden_act - self.iou_head_depth = iou_head_depth - self.iou_head_hidden_dim = iou_head_hidden_dim - self.dynamic_multimask_via_stability = dynamic_multimask_via_stability - self.dynamic_multimask_stability_delta = dynamic_multimask_stability_delta - self.dynamic_multimask_stability_thresh = dynamic_multimask_stability_thresh - - # TwoWayTransformer configuration - self.num_hidden_layers = num_hidden_layers - self.hidden_size = hidden_size - self.num_attention_heads = num_attention_heads - self.mlp_dim = mlp_dim - self.attention_downsample_rate = attention_downsample_rate + hidden_size: int = 256 + hidden_act: str = "gelu" + mlp_dim: int = 2048 + num_hidden_layers: int = 2 + num_attention_heads: int = 8 + attention_downsample_rate: int = 2 + num_multimask_outputs: int = 3 + iou_head_depth: int = 3 + iou_head_hidden_dim: int = 256 + dynamic_multimask_via_stability: bool = True + dynamic_multimask_stability_delta: float = 0.05 + dynamic_multimask_stability_thresh: float = 0.98 @auto_docstring(checkpoint="yonigozlan/EdgeTAM-hf") 
+@strict(accept_kwargs=True) class EdgeTamConfig(PreTrainedConfig): r""" prompt_encoder_config (Union[`dict`, `EdgeTamPromptEncoderConfig`], *optional*): @@ -236,32 +192,29 @@ class EdgeTamConfig(PreTrainedConfig): "mask_decoder_config": EdgeTamMaskDecoderConfig, } - def __init__( - self, - vision_config=None, - prompt_encoder_config=None, - mask_decoder_config=None, - initializer_range=0.02, - **kwargs, - ): - vision_config = vision_config if vision_config is not None else {} - prompt_encoder_config = prompt_encoder_config if prompt_encoder_config is not None else {} - mask_decoder_config = mask_decoder_config if mask_decoder_config is not None else {} - - if isinstance(vision_config, dict): - vision_config["model_type"] = vision_config.get("model_type", "edgetam_vision_model") - vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) - if isinstance(prompt_encoder_config, EdgeTamPromptEncoderConfig): - prompt_encoder_config = prompt_encoder_config.to_dict() - if isinstance(mask_decoder_config, EdgeTamMaskDecoderConfig): - mask_decoder_config = mask_decoder_config.to_dict() - - self.vision_config = vision_config - self.prompt_encoder_config = EdgeTamPromptEncoderConfig(**prompt_encoder_config) - self.mask_decoder_config = EdgeTamMaskDecoderConfig(**mask_decoder_config) - - self.initializer_range = initializer_range - super().__init__(**kwargs) + vision_config: dict | PreTrainedConfig | None = None + prompt_encoder_config: dict | PreTrainedConfig | None = None + mask_decoder_config: dict | PreTrainedConfig | None = None + initializer_range: float = 0.02 + + def __post_init__(self, **kwargs): + if isinstance(self.vision_config, dict): + self.vision_config["model_type"] = self.vision_config.get("model_type", "edgetam_vision_model") + self.vision_config = CONFIG_MAPPING[self.vision_config["model_type"]](**self.vision_config) + elif self.vision_config is None: + self.vision_config = CONFIG_MAPPING["edgetam_vision_model"]() + + if isinstance(self.prompt_encoder_config, dict): + self.prompt_encoder_config = EdgeTamPromptEncoderConfig(**self.prompt_encoder_config) + elif self.prompt_encoder_config is None: + self.prompt_encoder_config = EdgeTamPromptEncoderConfig() + + if isinstance(self.mask_decoder_config, dict): + self.mask_decoder_config = EdgeTamMaskDecoderConfig(**self.mask_decoder_config) + elif self.mask_decoder_config is None: + self.mask_decoder_config = EdgeTamMaskDecoderConfig() + + super().__post_init__(**kwargs) __all__ = ["EdgeTamConfig", "EdgeTamVisionConfig", "EdgeTamPromptEncoderConfig", "EdgeTamMaskDecoderConfig"] diff --git a/src/transformers/models/edgetam/modular_edgetam.py b/src/transformers/models/edgetam/modular_edgetam.py index 242d872809d0..61f9f123beb1 100644 --- a/src/transformers/models/edgetam/modular_edgetam.py +++ b/src/transformers/models/edgetam/modular_edgetam.py @@ -14,6 +14,7 @@ """PyTorch SAM 2 model.""" import torch +from huggingface_hub.dataclasses import strict from ... 
import initialization as init from ...configuration_utils import PreTrainedConfig @@ -37,6 +38,7 @@ @auto_docstring(checkpoint="yonigozlan/EdgeTAM-hf") +@strict(accept_kwargs=True) class EdgeTamVisionConfig(PreTrainedConfig): r""" backbone_channel_list (`List[int]`, *optional*, defaults to `[384, 192, 96, 48]`): @@ -63,66 +65,53 @@ class EdgeTamVisionConfig(PreTrainedConfig): "backbone_config": AutoConfig, } - def __init__( - self, - backbone_config=None, - backbone_channel_list=None, - backbone_feature_sizes=None, - fpn_hidden_size=256, - fpn_kernel_size=1, - fpn_stride=1, - fpn_padding=0, - fpn_top_down_levels=None, - num_feature_levels=3, - hidden_act="gelu", - layer_norm_eps=1e-6, - initializer_range=0.02, - **kwargs, - ): - backbone_channel_list = [384, 192, 96, 48] if backbone_channel_list is None else backbone_channel_list - backbone_feature_sizes = ( - [[256, 256], [128, 128], [64, 64]] if backbone_feature_sizes is None else backbone_feature_sizes + backbone_config: dict | PreTrainedConfig | None = None + backbone_channel_list: list[int] | None = None + backbone_feature_sizes: list | None = None + fpn_hidden_size: int = 256 + fpn_kernel_size: int = 1 + fpn_stride: int = 1 + fpn_padding: int = 0 + fpn_top_down_levels: list[int] | None = None + num_feature_levels: int = 3 + hidden_act: str = "gelu" + layer_norm_eps: float = 1e-6 + initializer_range: float = 0.02 + + def __post_init__(self, **kwargs): + self.backbone_channel_list = ( + [384, 192, 96, 48] if self.backbone_channel_list is None else self.backbone_channel_list + ) + self.backbone_feature_sizes = ( + [[256, 256], [128, 128], [64, 64]] if self.backbone_feature_sizes is None else self.backbone_feature_sizes ) - fpn_top_down_levels = [2, 3] if fpn_top_down_levels is None else fpn_top_down_levels + self.fpn_top_down_levels = [2, 3] if self.fpn_top_down_levels is None else self.fpn_top_down_levels - if isinstance(backbone_config, dict): - backbone_config["model_type"] = backbone_config.get("model_type", "timm_wrapper") - backbone_config = CONFIG_MAPPING[backbone_config["model_type"]](**backbone_config) - elif backbone_config is None: - backbone_config = AutoConfig.from_pretrained( + if isinstance(self.backbone_config, dict): + self.backbone_config["model_type"] = self.backbone_config.get("model_type", "timm_wrapper") + self.backbone_config = CONFIG_MAPPING[self.backbone_config["model_type"]](**self.backbone_config) + elif self.backbone_config is None: + self.backbone_config = AutoConfig.from_pretrained( "timm/repvit_m1.dist_in1k", model_args={"in_chans": 3, "features_only": True, "out_indices": [0, 1, 2, 3]}, ) - - self.backbone_config = backbone_config - - # Neck - self.backbone_channel_list = backbone_channel_list - self.backbone_feature_sizes = backbone_feature_sizes - self.fpn_hidden_size = fpn_hidden_size - self.fpn_kernel_size = fpn_kernel_size - self.fpn_stride = fpn_stride - self.fpn_padding = fpn_padding - self.fpn_top_down_levels = fpn_top_down_levels - self.num_feature_levels = num_feature_levels - - self.hidden_act = hidden_act - self.layer_norm_eps = layer_norm_eps - self.initializer_range = initializer_range - super().__init__(**kwargs) + super().__post_init__(**kwargs) @auto_docstring(checkpoint="yonigozlan/EdgeTAM-hf") +@strict(accept_kwargs=True) class EdgeTamPromptEncoderConfig(Sam2PromptEncoderConfig): pass @auto_docstring(checkpoint="yonigozlan/EdgeTAM-hf") +@strict(accept_kwargs=True) class EdgeTamMaskDecoderConfig(Sam2MaskDecoderConfig): pass @auto_docstring(checkpoint="yonigozlan/EdgeTAM-hf") 
+@strict(accept_kwargs=True) class EdgeTamConfig(Sam2Config): r""" prompt_encoder_config (Union[`dict`, `EdgeTamPromptEncoderConfig`], *optional*): diff --git a/src/transformers/models/edgetam_video/configuration_edgetam_video.py b/src/transformers/models/edgetam_video/configuration_edgetam_video.py index 5bb24a873fa4..8e93ef4a03fa 100644 --- a/src/transformers/models/edgetam_video/configuration_edgetam_video.py +++ b/src/transformers/models/edgetam_video/configuration_edgetam_video.py @@ -18,12 +18,15 @@ # See the License for the specific language governing permissions and # limitations under the License. +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring from ..auto import CONFIG_MAPPING, AutoConfig @auto_docstring(checkpoint="yonigozlan/EdgeTAM-hf") +@strict(accept_kwargs=True) class EdgeTamVideoPromptEncoderConfig(PreTrainedConfig): r""" mask_input_channels (`int`, *optional*, defaults to 16): @@ -36,30 +39,18 @@ class EdgeTamVideoPromptEncoderConfig(PreTrainedConfig): base_config_key = "prompt_encoder_config" - def __init__( - self, - hidden_size=256, - image_size=1024, - patch_size=16, - mask_input_channels=16, - num_point_embeddings=4, - hidden_act="gelu", - layer_norm_eps=1e-6, - scale=1, - **kwargs, - ): - super().__init__(**kwargs) - self.hidden_size = hidden_size - self.image_size = image_size - self.patch_size = patch_size - self.mask_input_channels = mask_input_channels - self.num_point_embeddings = num_point_embeddings - self.hidden_act = hidden_act - self.layer_norm_eps = layer_norm_eps - self.scale = scale + hidden_size: int = 256 + image_size: int | list[int] | tuple[int, int] = 1024 + patch_size: int | list[int] | tuple[int, int] = 16 + mask_input_channels: int = 16 + num_point_embeddings: int = 4 + hidden_act: str = "gelu" + layer_norm_eps: float = 1e-6 + scale: int = 1 @auto_docstring(checkpoint="yonigozlan/EdgeTAM-hf") +@strict(accept_kwargs=True) class EdgeTamVideoMaskDecoderConfig(PreTrainedConfig): r""" mlp_dim (`int`, *optional*, defaults to 2048): @@ -82,42 +73,22 @@ class EdgeTamVideoMaskDecoderConfig(PreTrainedConfig): base_config_key = "mask_decoder_config" - def __init__( - self, - hidden_size=256, - hidden_act="gelu", - mlp_dim=2048, - num_hidden_layers=2, - num_attention_heads=8, - attention_downsample_rate=2, - num_multimask_outputs=3, - iou_head_depth=3, - iou_head_hidden_dim=256, - dynamic_multimask_via_stability=True, - dynamic_multimask_stability_delta=0.05, - dynamic_multimask_stability_thresh=0.98, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.num_multimask_outputs = num_multimask_outputs - self.hidden_act = hidden_act - self.iou_head_depth = iou_head_depth - self.iou_head_hidden_dim = iou_head_hidden_dim - self.dynamic_multimask_via_stability = dynamic_multimask_via_stability - self.dynamic_multimask_stability_delta = dynamic_multimask_stability_delta - self.dynamic_multimask_stability_thresh = dynamic_multimask_stability_thresh - - # TwoWayTransformer configuration - self.num_hidden_layers = num_hidden_layers - self.hidden_size = hidden_size - self.num_attention_heads = num_attention_heads - self.mlp_dim = mlp_dim - self.attention_downsample_rate = attention_downsample_rate + hidden_size: int = 256 + hidden_act: str = "gelu" + mlp_dim: int = 2048 + num_hidden_layers: int = 2 + num_attention_heads: int = 8 + attention_downsample_rate: int = 2 + num_multimask_outputs: int = 3 + iou_head_depth: int = 3 + iou_head_hidden_dim: 
int = 256 + dynamic_multimask_via_stability: bool = True + dynamic_multimask_stability_delta: float = 0.05 + dynamic_multimask_stability_thresh: float = 0.98 @auto_docstring(checkpoint="yonigozlan/EdgeTAM-hf") +@strict(accept_kwargs=True) class EdgeTamVideoConfig(PreTrainedConfig): r""" prompt_encoder_config (Union[`dict`, `EdgeTamVideoPromptEncoderConfig`], *optional*): @@ -254,141 +225,90 @@ class EdgeTamVideoConfig(PreTrainedConfig): "mask_decoder_config": EdgeTamVideoMaskDecoderConfig, } - def __init__( - self, - vision_config=None, - prompt_encoder_config=None, - mask_decoder_config=None, - initializer_range=0.02, - num_maskmem=7, - image_size=1024, - sigmoid_scale_for_mem_enc=20.0, - sigmoid_bias_for_mem_enc=-10.0, - enable_occlusion_spatial_embedding=True, - multimask_output_in_sam=True, - multimask_min_pt_num=0, - multimask_max_pt_num=1, - multimask_output_for_tracking=True, - max_object_pointers_in_encoder=16, - max_cond_frame_num=-1, - enable_temporal_pos_encoding_for_object_pointers=True, - # memory attention - memory_attention_hidden_size=256, - memory_attention_num_layers=2, - memory_attention_num_attention_heads=1, - memory_attention_downsample_rate=1, - memory_attention_mlp_hidden_size=2048, - memory_attention_mlp_hidden_act="relu", - memory_attention_dropout=0.1, - memory_attention_rope_theta=10000, - memory_attention_rope_feat_sizes=None, - memory_attention_rope_k_sizes=None, - memory_attention_rope_dropout=0.1, - # spatial perceiver resampler - perceiver_resampler_num_latents=256, - perceiver_resampler_num_latents_2d=256, - perceiver_resampler_hidden_size=64, - perceiver_resampler_mlp_intermediate_size=256, - perceiver_resampler_num_attention_heads=1, - perceiver_resampler_attention_head_dim=64, - perceiver_resampler_num_layers=2, - perceiver_resampler_hidden_dropout=0.0, - perceiver_resampler_attention_dropout=0.0, - # memory encoder - memory_encoder_hidden_size=256, - memory_encoder_output_channels=64, - mask_downsampler_embed_dim=256, - memory_fuser_intermediate_dim=1024, - mask_downsampler_kernel_size=3, - mask_downsampler_stride=2, - mask_downsampler_padding=1, - mask_downsampler_total_stride=16, - mask_downsampler_hidden_act="gelu", - memory_fuser_num_layers=2, - memory_fuser_embed_dim=256, - memory_fuser_kernel_size=7, - memory_fuser_padding=3, - memory_fuser_layer_scale_init_value=1e-6, - memory_fuser_hidden_act="gelu", - **kwargs, - ): - super().__init__(**kwargs) - vision_config = vision_config if vision_config is not None else {} - prompt_encoder_config = prompt_encoder_config if prompt_encoder_config is not None else {} - mask_decoder_config = mask_decoder_config if mask_decoder_config is not None else {} - memory_attention_rope_feat_sizes = ( - [64, 64] if memory_attention_rope_feat_sizes is None else memory_attention_rope_feat_sizes - ) - memory_attention_rope_k_sizes = ( - [16, 16] if memory_attention_rope_k_sizes is None else memory_attention_rope_k_sizes - ) + vision_config: dict | PreTrainedConfig | None = None + prompt_encoder_config: dict | PreTrainedConfig | None = None + mask_decoder_config: dict | PreTrainedConfig | None = None + initializer_range: float = 0.02 + num_maskmem: int = 7 + image_size: int | list[int] | tuple[int, int] = 1024 + sigmoid_scale_for_mem_enc: float = 20.0 + sigmoid_bias_for_mem_enc: float = -10.0 + enable_occlusion_spatial_embedding: bool = True + multimask_output_in_sam: bool = True + multimask_min_pt_num: int = 0 + multimask_max_pt_num: int = 1 + multimask_output_for_tracking: bool = True + max_object_pointers_in_encoder: 
int = 16 + max_cond_frame_num: int = -1 + enable_temporal_pos_encoding_for_object_pointers: bool = True + + # memory attention + memory_attention_hidden_size: int = 256 + memory_attention_num_layers: int = 2 + memory_attention_num_attention_heads: int = 1 + memory_attention_downsample_rate: int = 1 + memory_attention_mlp_hidden_size: int = 2048 + memory_attention_mlp_hidden_act: str = "relu" + memory_attention_dropout: float | int = 0.1 + memory_attention_rope_theta: float | int = 10000 + memory_attention_rope_feat_sizes: list | None = None + memory_attention_rope_k_sizes: list | None = None + memory_attention_rope_dropout: float | int = 0.1 - if isinstance(vision_config, dict): - vision_config["model_type"] = vision_config.get("model_type", "sam2_vision_model") - vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) - if isinstance(prompt_encoder_config, EdgeTamVideoPromptEncoderConfig): - prompt_encoder_config = prompt_encoder_config.to_dict() - if isinstance(mask_decoder_config, EdgeTamVideoMaskDecoderConfig): - mask_decoder_config = mask_decoder_config.to_dict() + # spatial perceiver resampler + perceiver_resampler_num_latents: int = 256 + perceiver_resampler_num_latents_2d: int = 256 + perceiver_resampler_hidden_size: int = 64 + perceiver_resampler_mlp_intermediate_size: int = 256 + perceiver_resampler_num_attention_heads: int = 1 + perceiver_resampler_attention_head_dim: int = 64 + perceiver_resampler_num_layers: int = 2 + perceiver_resampler_hidden_dropout: float | int = 0.0 + perceiver_resampler_attention_dropout: float | int = 0.0 - self.vision_config = vision_config - self.prompt_encoder_config = EdgeTamVideoPromptEncoderConfig(**prompt_encoder_config) - self.mask_decoder_config = EdgeTamVideoMaskDecoderConfig(**mask_decoder_config) + # memory encoder + memory_encoder_hidden_size: int = 256 + memory_encoder_output_channels: int = 64 + mask_downsampler_embed_dim: int = 256 + memory_fuser_intermediate_dim: int = 1024 + mask_downsampler_kernel_size: int = 3 + mask_downsampler_stride: int = 2 + mask_downsampler_padding: int = 1 + mask_downsampler_total_stride: int = 16 + mask_downsampler_hidden_act: str = "gelu" + memory_fuser_num_layers: int = 2 + memory_fuser_embed_dim: int = 256 + memory_fuser_kernel_size: int = 7 + memory_fuser_padding: int = 3 + memory_fuser_layer_scale_init_value: float = 1e-6 + memory_fuser_hidden_act: str = "gelu" - self.initializer_range = initializer_range - self.num_maskmem = num_maskmem # default 1 input frame + 6 previous frames - self.image_size = image_size - self.sigmoid_scale_for_mem_enc = sigmoid_scale_for_mem_enc # scale factor for mask sigmoid prob - self.sigmoid_bias_for_mem_enc = sigmoid_bias_for_mem_enc # bias factor for mask sigmoid prob - self.enable_occlusion_spatial_embedding = enable_occlusion_spatial_embedding - self.multimask_output_in_sam = multimask_output_in_sam - self.multimask_min_pt_num = multimask_min_pt_num - self.multimask_max_pt_num = multimask_max_pt_num - self.multimask_output_for_tracking = multimask_output_for_tracking - self.max_object_pointers_in_encoder = max_object_pointers_in_encoder - self.max_cond_frame_num = max_cond_frame_num - self.enable_temporal_pos_encoding_for_object_pointers = enable_temporal_pos_encoding_for_object_pointers + def __post_init__(self, **kwargs): + self.prompt_encoder_config = self.prompt_encoder_config if self.prompt_encoder_config is not None else {} + self.mask_decoder_config = self.mask_decoder_config if self.mask_decoder_config is not None else {} + 
self.memory_attention_rope_feat_sizes = (
+            [64, 64] if self.memory_attention_rope_feat_sizes is None else self.memory_attention_rope_feat_sizes
+        )
+        self.memory_attention_rope_k_sizes = (
+            [16, 16] if self.memory_attention_rope_k_sizes is None else self.memory_attention_rope_k_sizes
+        )
-        # memory attention
-        self.memory_attention_hidden_size = memory_attention_hidden_size
-        self.memory_attention_num_layers = memory_attention_num_layers
-        self.memory_attention_num_attention_heads = memory_attention_num_attention_heads
-        self.memory_attention_downsample_rate = memory_attention_downsample_rate
-        self.memory_attention_mlp_hidden_size = memory_attention_mlp_hidden_size
-        self.memory_attention_mlp_hidden_act = memory_attention_mlp_hidden_act
-        self.memory_attention_dropout = memory_attention_dropout
-        self.memory_attention_rope_theta = memory_attention_rope_theta
-        self.memory_attention_rope_feat_sizes = memory_attention_rope_feat_sizes
-        self.memory_attention_rope_k_sizes = memory_attention_rope_k_sizes
-        self.memory_attention_rope_dropout = memory_attention_rope_dropout
+        if isinstance(self.vision_config, dict):
+            self.vision_config["model_type"] = self.vision_config.get("model_type", "sam2_vision_model")
+            self.vision_config = CONFIG_MAPPING[self.vision_config["model_type"]](**self.vision_config)
+        elif self.vision_config is None:
+            self.vision_config = CONFIG_MAPPING["sam2_vision_model"]()
-        # spatial perceiver resampler
-        self.perceiver_resampler_num_latents = perceiver_resampler_num_latents
-        self.perceiver_resampler_num_latents_2d = perceiver_resampler_num_latents_2d
-        self.perceiver_resampler_hidden_size = perceiver_resampler_hidden_size
-        self.perceiver_resampler_mlp_intermediate_size = perceiver_resampler_mlp_intermediate_size
-        self.perceiver_resampler_attention_head_dim = perceiver_resampler_attention_head_dim
-        self.perceiver_resampler_num_attention_heads = perceiver_resampler_num_attention_heads
-        self.perceiver_resampler_num_layers = perceiver_resampler_num_layers
-        self.perceiver_resampler_hidden_dropout = perceiver_resampler_hidden_dropout
-        self.perceiver_resampler_attention_dropout = perceiver_resampler_attention_dropout
+        if isinstance(self.prompt_encoder_config, dict):
+            self.prompt_encoder_config = EdgeTamVideoPromptEncoderConfig(**self.prompt_encoder_config)
+        elif self.prompt_encoder_config is None:
+            self.prompt_encoder_config = EdgeTamVideoPromptEncoderConfig()
-        # memory encoder
-        self.memory_encoder_hidden_size = memory_encoder_hidden_size
-        self.memory_encoder_output_channels = memory_encoder_output_channels
-        self.mask_downsampler_embed_dim = mask_downsampler_embed_dim
-        self.mask_downsampler_kernel_size = mask_downsampler_kernel_size
-        self.mask_downsampler_stride = mask_downsampler_stride
-        self.mask_downsampler_padding = mask_downsampler_padding
-        self.mask_downsampler_total_stride = mask_downsampler_total_stride
-        self.mask_downsampler_hidden_act = mask_downsampler_hidden_act
-        self.memory_fuser_num_layers = memory_fuser_num_layers
-        self.memory_fuser_embed_dim = memory_fuser_embed_dim
-        self.memory_fuser_intermediate_dim = memory_fuser_intermediate_dim
-        self.memory_fuser_kernel_size = memory_fuser_kernel_size
-        self.memory_fuser_padding = memory_fuser_padding
-        self.memory_fuser_layer_scale_init_value = memory_fuser_layer_scale_init_value
-        self.memory_fuser_hidden_act = memory_fuser_hidden_act
+        if isinstance(self.mask_decoder_config, dict):
+            self.mask_decoder_config = EdgeTamVideoMaskDecoderConfig(**self.mask_decoder_config)
+        elif
self.mask_decoder_config is None: + self.mask_decoder_config = EdgeTamVideoMaskDecoderConfig() + super().__post_init__(**kwargs) __all__ = ["EdgeTamVideoMaskDecoderConfig", "EdgeTamVideoPromptEncoderConfig", "EdgeTamVideoConfig"] diff --git a/src/transformers/models/edgetam_video/modular_edgetam_video.py b/src/transformers/models/edgetam_video/modular_edgetam_video.py index 2418783f4e29..d1a1298e7907 100644 --- a/src/transformers/models/edgetam_video/modular_edgetam_video.py +++ b/src/transformers/models/edgetam_video/modular_edgetam_video.py @@ -19,6 +19,7 @@ import torch import torch.nn as nn import torch.nn.functional as F +from huggingface_hub.dataclasses import strict from torch import Tensor from ... import initialization as init @@ -33,7 +34,6 @@ from ..auto import CONFIG_MAPPING, AutoConfig from ..sam2.modeling_sam2 import eager_attention_forward, window_partition from ..sam2_video.configuration_sam2_video import ( - Sam2VideoConfig, Sam2VideoMaskDecoderConfig, Sam2VideoPromptEncoderConfig, ) @@ -58,17 +58,20 @@ @auto_docstring(checkpoint="yonigozlan/EdgeTAM-hf") +@strict(accept_kwargs=True) class EdgeTamVideoPromptEncoderConfig(Sam2VideoPromptEncoderConfig): pass @auto_docstring(checkpoint="yonigozlan/EdgeTAM-hf") +@strict(accept_kwargs=True) class EdgeTamVideoMaskDecoderConfig(Sam2VideoMaskDecoderConfig): pass @auto_docstring(checkpoint="yonigozlan/EdgeTAM-hf") -class EdgeTamVideoConfig(Sam2VideoConfig): +@strict(accept_kwargs=True) +class EdgeTamVideoConfig(PreTrainedConfig): r""" prompt_encoder_config (Union[`dict`, `EdgeTamVideoPromptEncoderConfig`], *optional*): Dictionary of configuration options used to initialize [`EdgeTamVideoPromptEncoderConfig`]. @@ -204,141 +207,90 @@ class EdgeTamVideoConfig(Sam2VideoConfig): "mask_decoder_config": EdgeTamVideoMaskDecoderConfig, } - def __init__( - self, - vision_config=None, - prompt_encoder_config=None, - mask_decoder_config=None, - initializer_range=0.02, - num_maskmem=7, - image_size=1024, - sigmoid_scale_for_mem_enc=20.0, - sigmoid_bias_for_mem_enc=-10.0, - enable_occlusion_spatial_embedding=True, - multimask_output_in_sam=True, - multimask_min_pt_num=0, - multimask_max_pt_num=1, - multimask_output_for_tracking=True, - max_object_pointers_in_encoder=16, - max_cond_frame_num=-1, - enable_temporal_pos_encoding_for_object_pointers=True, - # memory attention - memory_attention_hidden_size=256, - memory_attention_num_layers=2, - memory_attention_num_attention_heads=1, - memory_attention_downsample_rate=1, - memory_attention_mlp_hidden_size=2048, - memory_attention_mlp_hidden_act="relu", - memory_attention_dropout=0.1, - memory_attention_rope_theta=10000, - memory_attention_rope_feat_sizes=None, - memory_attention_rope_k_sizes=None, - memory_attention_rope_dropout=0.1, - # spatial perceiver resampler - perceiver_resampler_num_latents=256, - perceiver_resampler_num_latents_2d=256, - perceiver_resampler_hidden_size=64, - perceiver_resampler_mlp_intermediate_size=256, - perceiver_resampler_num_attention_heads=1, - perceiver_resampler_attention_head_dim=64, - perceiver_resampler_num_layers=2, - perceiver_resampler_hidden_dropout=0.0, - perceiver_resampler_attention_dropout=0.0, - # memory encoder - memory_encoder_hidden_size=256, - memory_encoder_output_channels=64, - mask_downsampler_embed_dim=256, - memory_fuser_intermediate_dim=1024, - mask_downsampler_kernel_size=3, - mask_downsampler_stride=2, - mask_downsampler_padding=1, - mask_downsampler_total_stride=16, - mask_downsampler_hidden_act="gelu", - memory_fuser_num_layers=2, - 
memory_fuser_embed_dim=256, - memory_fuser_kernel_size=7, - memory_fuser_padding=3, - memory_fuser_layer_scale_init_value=1e-6, - memory_fuser_hidden_act="gelu", - **kwargs, - ): - PreTrainedConfig.__init__(**kwargs) - vision_config = vision_config if vision_config is not None else {} - prompt_encoder_config = prompt_encoder_config if prompt_encoder_config is not None else {} - mask_decoder_config = mask_decoder_config if mask_decoder_config is not None else {} - memory_attention_rope_feat_sizes = ( - [64, 64] if memory_attention_rope_feat_sizes is None else memory_attention_rope_feat_sizes + vision_config: dict | PreTrainedConfig | None = None + prompt_encoder_config: dict | PreTrainedConfig | None = None + mask_decoder_config: dict | PreTrainedConfig | None = None + initializer_range: float = 0.02 + num_maskmem: int = 7 + image_size: int | list[int] | tuple[int, int] = 1024 + sigmoid_scale_for_mem_enc: float = 20.0 + sigmoid_bias_for_mem_enc: float = -10.0 + enable_occlusion_spatial_embedding: bool = True + multimask_output_in_sam: bool = True + multimask_min_pt_num: int = 0 + multimask_max_pt_num: int = 1 + multimask_output_for_tracking: bool = True + max_object_pointers_in_encoder: int = 16 + max_cond_frame_num: int = -1 + enable_temporal_pos_encoding_for_object_pointers: bool = True + + # memory attention + memory_attention_hidden_size: int = 256 + memory_attention_num_layers: int = 2 + memory_attention_num_attention_heads: int = 1 + memory_attention_downsample_rate: int = 1 + memory_attention_mlp_hidden_size: int = 2048 + memory_attention_mlp_hidden_act: str = "relu" + memory_attention_dropout: float | int = 0.1 + memory_attention_rope_theta: float | int = 10000 + memory_attention_rope_feat_sizes: list | None = None + memory_attention_rope_k_sizes: list | None = None + memory_attention_rope_dropout: float | int = 0.1 + + # spatial perceiver resampler + perceiver_resampler_num_latents: int = 256 + perceiver_resampler_num_latents_2d: int = 256 + perceiver_resampler_hidden_size: int = 64 + perceiver_resampler_mlp_intermediate_size: int = 256 + perceiver_resampler_num_attention_heads: int = 1 + perceiver_resampler_attention_head_dim: int = 64 + perceiver_resampler_num_layers: int = 2 + perceiver_resampler_hidden_dropout: float | int = 0.0 + perceiver_resampler_attention_dropout: float | int = 0.0 + + # memory encoder + memory_encoder_hidden_size: int = 256 + memory_encoder_output_channels: int = 64 + mask_downsampler_embed_dim: int = 256 + memory_fuser_intermediate_dim: int = 1024 + mask_downsampler_kernel_size: int = 3 + mask_downsampler_stride: int = 2 + mask_downsampler_padding: int = 1 + mask_downsampler_total_stride: int = 16 + mask_downsampler_hidden_act: str = "gelu" + memory_fuser_num_layers: int = 2 + memory_fuser_embed_dim: int = 256 + memory_fuser_kernel_size: int = 7 + memory_fuser_padding: int = 3 + memory_fuser_layer_scale_init_value: float = 1e-6 + memory_fuser_hidden_act: str = "gelu" + + def __post_init__(self, **kwargs): + self.prompt_encoder_config = self.prompt_encoder_config if self.prompt_encoder_config is not None else {} + self.mask_decoder_config = self.mask_decoder_config if self.mask_decoder_config is not None else {} + self.memory_attention_rope_feat_sizes = ( + [64, 64] if self.memory_attention_rope_feat_sizes is None else self.memory_attention_rope_feat_sizes ) - memory_attention_rope_k_sizes = ( - [16, 16] if memory_attention_rope_k_sizes is None else memory_attention_rope_k_sizes + self.memory_attention_rope_k_sizes = ( + [16, 16] if 
self.memory_attention_rope_k_sizes is None else self.memory_attention_rope_k_sizes ) - if isinstance(vision_config, dict): - vision_config["model_type"] = vision_config.get("model_type", "sam2_vision_model") - vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) - if isinstance(prompt_encoder_config, EdgeTamVideoPromptEncoderConfig): - prompt_encoder_config = prompt_encoder_config.to_dict() - if isinstance(mask_decoder_config, EdgeTamVideoMaskDecoderConfig): - mask_decoder_config = mask_decoder_config.to_dict() - - self.vision_config = vision_config - self.prompt_encoder_config = EdgeTamVideoPromptEncoderConfig(**prompt_encoder_config) - self.mask_decoder_config = EdgeTamVideoMaskDecoderConfig(**mask_decoder_config) - - self.initializer_range = initializer_range - self.num_maskmem = num_maskmem # default 1 input frame + 6 previous frames - self.image_size = image_size - self.sigmoid_scale_for_mem_enc = sigmoid_scale_for_mem_enc # scale factor for mask sigmoid prob - self.sigmoid_bias_for_mem_enc = sigmoid_bias_for_mem_enc # bias factor for mask sigmoid prob - self.enable_occlusion_spatial_embedding = enable_occlusion_spatial_embedding - self.multimask_output_in_sam = multimask_output_in_sam - self.multimask_min_pt_num = multimask_min_pt_num - self.multimask_max_pt_num = multimask_max_pt_num - self.multimask_output_for_tracking = multimask_output_for_tracking - self.max_object_pointers_in_encoder = max_object_pointers_in_encoder - self.max_cond_frame_num = max_cond_frame_num - self.enable_temporal_pos_encoding_for_object_pointers = enable_temporal_pos_encoding_for_object_pointers - - # memory attention - self.memory_attention_hidden_size = memory_attention_hidden_size - self.memory_attention_num_layers = memory_attention_num_layers - self.memory_attention_num_attention_heads = memory_attention_num_attention_heads - self.memory_attention_downsample_rate = memory_attention_downsample_rate - self.memory_attention_mlp_hidden_size = memory_attention_mlp_hidden_size - self.memory_attention_mlp_hidden_act = memory_attention_mlp_hidden_act - self.memory_attention_dropout = memory_attention_dropout - self.memory_attention_rope_theta = memory_attention_rope_theta - self.memory_attention_rope_feat_sizes = memory_attention_rope_feat_sizes - self.memory_attention_rope_k_sizes = memory_attention_rope_k_sizes - self.memory_attention_rope_dropout = memory_attention_rope_dropout - - # spatial perceiver resampler - self.perceiver_resampler_num_latents = perceiver_resampler_num_latents - self.perceiver_resampler_num_latents_2d = perceiver_resampler_num_latents_2d - self.perceiver_resampler_hidden_size = perceiver_resampler_hidden_size - self.perceiver_resampler_mlp_intermediate_size = perceiver_resampler_mlp_intermediate_size - self.perceiver_resampler_attention_head_dim = perceiver_resampler_attention_head_dim - self.perceiver_resampler_num_attention_heads = perceiver_resampler_num_attention_heads - self.perceiver_resampler_num_layers = perceiver_resampler_num_layers - self.perceiver_resampler_hidden_dropout = perceiver_resampler_hidden_dropout - self.perceiver_resampler_attention_dropout = perceiver_resampler_attention_dropout - - # memory encoder - self.memory_encoder_hidden_size = memory_encoder_hidden_size - self.memory_encoder_output_channels = memory_encoder_output_channels - self.mask_downsampler_embed_dim = mask_downsampler_embed_dim - self.mask_downsampler_kernel_size = mask_downsampler_kernel_size - self.mask_downsampler_stride = mask_downsampler_stride - 
self.mask_downsampler_padding = mask_downsampler_padding
-        self.mask_downsampler_total_stride = mask_downsampler_total_stride
-        self.mask_downsampler_hidden_act = mask_downsampler_hidden_act
-        self.memory_fuser_num_layers = memory_fuser_num_layers
-        self.memory_fuser_embed_dim = memory_fuser_embed_dim
-        self.memory_fuser_intermediate_dim = memory_fuser_intermediate_dim
-        self.memory_fuser_kernel_size = memory_fuser_kernel_size
-        self.memory_fuser_padding = memory_fuser_padding
-        self.memory_fuser_layer_scale_init_value = memory_fuser_layer_scale_init_value
-        self.memory_fuser_hidden_act = memory_fuser_hidden_act
+        if isinstance(self.vision_config, dict):
+            self.vision_config["model_type"] = self.vision_config.get("model_type", "sam2_vision_model")
+            self.vision_config = CONFIG_MAPPING[self.vision_config["model_type"]](**self.vision_config)
+        elif self.vision_config is None:
+            self.vision_config = CONFIG_MAPPING["sam2_vision_model"]()
+
+        if isinstance(self.prompt_encoder_config, dict):
+            self.prompt_encoder_config = EdgeTamVideoPromptEncoderConfig(**self.prompt_encoder_config)
+        elif self.prompt_encoder_config is None:
+            self.prompt_encoder_config = EdgeTamVideoPromptEncoderConfig()
+
+        if isinstance(self.mask_decoder_config, dict):
+            self.mask_decoder_config = EdgeTamVideoMaskDecoderConfig(**self.mask_decoder_config)
+        elif self.mask_decoder_config is None:
+            self.mask_decoder_config = EdgeTamVideoMaskDecoderConfig()
+        super().__post_init__(**kwargs)
 class EdgeTamVideoLayerNorm(Sam2VideoLayerNorm):
diff --git a/src/transformers/models/efficientloftr/configuration_efficientloftr.py b/src/transformers/models/efficientloftr/configuration_efficientloftr.py
index f4cf99eccc5b..2c59e69f9fc7 100644
--- a/src/transformers/models/efficientloftr/configuration_efficientloftr.py
+++ b/src/transformers/models/efficientloftr/configuration_efficientloftr.py
@@ -12,11 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+ +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring @auto_docstring(checkpoint="zju-community/efficientloftr") +@strict(accept_kwargs=True) class EfficientLoFTRConfig(PreTrainedConfig): r""" stage_num_blocks (`List`, *optional*, defaults to [1, 2, 4, 14]): @@ -69,38 +73,36 @@ class EfficientLoFTRConfig(PreTrainedConfig): model_type = "efficientloftr" - def __init__( - self, - stage_num_blocks: list[int] | None = None, - out_features: list[int] | None = None, - stage_stride: list[int] | None = None, - hidden_size: int = 256, - activation_function: str = "relu", - q_aggregation_kernel_size: int = 4, - kv_aggregation_kernel_size: int = 4, - q_aggregation_stride: int = 4, - kv_aggregation_stride: int = 4, - num_attention_layers: int = 4, - num_attention_heads: int = 8, - attention_dropout: float = 0.0, - attention_bias: bool = False, - mlp_activation_function: str = "leaky_relu", - coarse_matching_skip_softmax: bool = False, - coarse_matching_threshold: float = 0.2, - coarse_matching_temperature: float = 0.1, - coarse_matching_border_removal: int = 2, - fine_kernel_size: int = 8, - batch_norm_eps: float = 1e-5, - rope_parameters: dict | None = None, - fine_matching_slice_dim: int = 8, - fine_matching_regress_temperature: float = 10.0, - initializer_range: float = 0.02, - **kwargs, - ): + stage_num_blocks: list[int] | None = None + out_features: list[int] | None = None + stage_stride: list[int] | None = None + hidden_size: int = 256 + activation_function: str = "relu" + q_aggregation_kernel_size: int = 4 + kv_aggregation_kernel_size: int = 4 + q_aggregation_stride: int = 4 + kv_aggregation_stride: int = 4 + num_attention_layers: int = 4 + num_attention_heads: int = 8 + attention_dropout: float | int = 0.0 + attention_bias: bool = False + mlp_activation_function: str = "leaky_relu" + coarse_matching_skip_softmax: bool = False + coarse_matching_threshold: float = 0.2 + coarse_matching_temperature: float = 0.1 + coarse_matching_border_removal: int = 2 + fine_kernel_size: int = 8 + batch_norm_eps: float = 1e-5 + rope_parameters: dict | None = None + fine_matching_slice_dim: int = 8 + fine_matching_regress_temperature: float = 10.0 + initializer_range: float = 0.02 + + def __post_init__(self, **kwargs): # Stage level of RepVGG - self.stage_num_blocks = stage_num_blocks if stage_num_blocks is not None else [1, 2, 4, 14] - self.stage_stride = stage_stride if stage_stride is not None else [2, 1, 2, 2] - self.out_features = out_features if out_features is not None else [64, 64, 128, 256] + self.stage_num_blocks = self.stage_num_blocks if self.stage_num_blocks is not None else [1, 2, 4, 14] + self.stage_stride = self.stage_stride if self.stage_stride is not None else [2, 1, 2, 2] + self.out_features = self.out_features if self.out_features is not None else [64, 64, 128, 256] self.stage_in_channels = [1] + self.out_features[:-1] # Block level of RepVGG @@ -115,41 +117,18 @@ def __init__( for stage_idx in range(len(self.stage_num_blocks)) ] - # Fine matching level of EfficientLoFTR + self.num_key_value_heads = self.num_attention_heads self.fine_fusion_dims = list(reversed(self.out_features))[:-1] + self.intermediate_size = self.hidden_size * 2 + kwargs.setdefault("partial_rotary_factor", 4.0) # assign default for BC + super().__post_init__(**kwargs) - self.hidden_size = hidden_size + def validate_architecture(self): + """Part of `@strict`-powered validation. 
Validates the architecture of the config.""" if self.hidden_size != self.out_features[-1]: raise ValueError( f"hidden_size should be equal to the last value in out_features. hidden_size = {self.hidden_size}, out_features = {self.out_features[-1]}" ) - self.activation_function = activation_function - self.q_aggregation_kernel_size = q_aggregation_kernel_size - self.kv_aggregation_kernel_size = kv_aggregation_kernel_size - self.q_aggregation_stride = q_aggregation_stride - self.kv_aggregation_stride = kv_aggregation_stride - self.num_attention_layers = num_attention_layers - self.num_attention_heads = num_attention_heads - self.attention_dropout = attention_dropout - self.attention_bias = attention_bias - self.intermediate_size = self.hidden_size * 2 - self.mlp_activation_function = mlp_activation_function - self.coarse_matching_skip_softmax = coarse_matching_skip_softmax - self.coarse_matching_threshold = coarse_matching_threshold - self.coarse_matching_temperature = coarse_matching_temperature - self.coarse_matching_border_removal = coarse_matching_border_removal - self.fine_kernel_size = fine_kernel_size - self.batch_norm_eps = batch_norm_eps - self.fine_matching_slice_dim = fine_matching_slice_dim - self.fine_matching_regress_temperature = fine_matching_regress_temperature - - self.num_key_value_heads = num_attention_heads - self.initializer_range = initializer_range - self.rope_parameters = rope_parameters - kwargs.setdefault("partial_rotary_factor", 4.0) # assign default for BC - - super().__init__(**kwargs) - __all__ = ["EfficientLoFTRConfig"] diff --git a/src/transformers/models/efficientnet/configuration_efficientnet.py b/src/transformers/models/efficientnet/configuration_efficientnet.py index e27f64cb9a90..a24ca80e8631 100644 --- a/src/transformers/models/efficientnet/configuration_efficientnet.py +++ b/src/transformers/models/efficientnet/configuration_efficientnet.py @@ -13,14 +13,14 @@ # limitations under the License. 
"""EfficientNet model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="google/efficientnet-b7") +@strict(accept_kwargs=True) class EfficientNetConfig(PreTrainedConfig): r""" width_coefficient (`float`, *optional*, defaults to 2.0): @@ -65,55 +65,31 @@ class EfficientNetConfig(PreTrainedConfig): model_type = "efficientnet" - def __init__( - self, - num_channels: int = 3, - image_size: int = 600, - width_coefficient: float = 2.0, - depth_coefficient: float = 3.1, - depth_divisor: int = 8, - kernel_sizes: list[int] = [3, 3, 5, 3, 5, 5, 3], - in_channels: list[int] = [32, 16, 24, 40, 80, 112, 192], - out_channels: list[int] = [16, 24, 40, 80, 112, 192, 320], - depthwise_padding: list[int] = [], - strides: list[int] = [1, 2, 2, 2, 1, 2, 1], - num_block_repeats: list[int] = [1, 2, 2, 3, 3, 4, 1], - expand_ratios: list[int] = [1, 6, 6, 6, 6, 6, 6], - squeeze_expansion_ratio: float = 0.25, - hidden_act: str = "swish", - hidden_dim: int = 2560, - pooling_type: str = "mean", - initializer_range: float = 0.02, - batch_norm_eps: float = 0.001, - batch_norm_momentum: float = 0.99, - dropout_rate: float = 0.5, - drop_connect_rate: float = 0.2, - **kwargs, - ): - super().__init__(**kwargs) + num_channels: int = 3 + image_size: int | list[int] | tuple[int, int] = 600 + width_coefficient: float = 2.0 + depth_coefficient: float = 3.1 + depth_divisor: int = 8 + kernel_sizes: list[int] | tuple[int, ...] = (3, 3, 5, 3, 5, 5, 3) + in_channels: list[int] | tuple[int, ...] = (32, 16, 24, 40, 80, 112, 192) + out_channels: list[int] | tuple[int, ...] = (16, 24, 40, 80, 112, 192, 320) + depthwise_padding: list[int] | tuple[int, ...] = () + strides: list[int] | tuple[int, ...] = (1, 2, 2, 2, 1, 2, 1) + num_block_repeats: list[int] | tuple[int, ...] = (1, 2, 2, 3, 3, 4, 1) + expand_ratios: list[int] | tuple[int, ...] 
= (1, 6, 6, 6, 6, 6, 6) + squeeze_expansion_ratio: float = 0.25 + hidden_act: str = "swish" + hidden_dim: int = 2560 + pooling_type: str = "mean" + initializer_range: float = 0.02 + batch_norm_eps: float = 0.001 + batch_norm_momentum: float = 0.99 + dropout_rate: float = 0.5 + drop_connect_rate: float = 0.2 - self.num_channels = num_channels - self.image_size = image_size - self.width_coefficient = width_coefficient - self.depth_coefficient = depth_coefficient - self.depth_divisor = depth_divisor - self.kernel_sizes = kernel_sizes - self.in_channels = in_channels - self.out_channels = out_channels - self.depthwise_padding = depthwise_padding - self.strides = strides - self.num_block_repeats = num_block_repeats - self.expand_ratios = expand_ratios - self.squeeze_expansion_ratio = squeeze_expansion_ratio - self.hidden_act = hidden_act - self.hidden_dim = hidden_dim - self.pooling_type = pooling_type - self.initializer_range = initializer_range - self.batch_norm_eps = batch_norm_eps - self.batch_norm_momentum = batch_norm_momentum - self.dropout_rate = dropout_rate - self.drop_connect_rate = drop_connect_rate - self.num_hidden_layers = sum(num_block_repeats) * 4 + def __post_init__(self, **kwargs): + super().__post_init__(**kwargs) + self.num_hidden_layers = sum(self.num_block_repeats) * 4 __all__ = ["EfficientNetConfig"] diff --git a/src/transformers/models/efficientnet/modeling_efficientnet.py b/src/transformers/models/efficientnet/modeling_efficientnet.py index 5a058ea17883..5a71c95205c5 100644 --- a/src/transformers/models/efficientnet/modeling_efficientnet.py +++ b/src/transformers/models/efficientnet/modeling_efficientnet.py @@ -478,7 +478,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -540,7 +540,7 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.efficientnet(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict) diff --git a/src/transformers/models/electra/configuration_electra.py b/src/transformers/models/electra/configuration_electra.py index 7e998d498f14..45608e03b72c 100644 --- a/src/transformers/models/electra/configuration_electra.py +++ b/src/transformers/models/electra/configuration_electra.py @@ -14,14 +14,14 @@ # limitations under the License. 
"""ELECTRA model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="google/electra-small-discriminator") +@strict(accept_kwargs=True) class ElectraConfig(PreTrainedConfig): r""" summary_type (`str`, *optional*, defaults to `"first"`): @@ -59,63 +59,31 @@ class ElectraConfig(PreTrainedConfig): model_type = "electra" - def __init__( - self, - vocab_size=30522, - embedding_size=128, - hidden_size=256, - num_hidden_layers=12, - num_attention_heads=4, - intermediate_size=1024, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=2, - initializer_range=0.02, - layer_norm_eps=1e-12, - summary_type="first", - summary_use_proj=True, - summary_activation="gelu", - summary_last_dropout=0.1, - pad_token_id=0, - use_cache=True, - classifier_dropout=None, - is_decoder=False, - add_cross_attention=False, - bos_token_id=None, - eos_token_id=None, - tie_word_embeddings=True, - **kwargs, - ): - super().__init__(**kwargs) - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - - self.is_decoder = is_decoder - self.add_cross_attention = add_cross_attention - self.vocab_size = vocab_size - self.embedding_size = embedding_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - - self.summary_type = summary_type - self.summary_use_proj = summary_use_proj - self.summary_activation = summary_activation - self.summary_last_dropout = summary_last_dropout - self.use_cache = use_cache - self.classifier_dropout = classifier_dropout + vocab_size: int = 30522 + embedding_size: int = 128 + hidden_size: int = 256 + num_hidden_layers: int = 12 + num_attention_heads: int = 4 + intermediate_size: int = 1024 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.1 + attention_probs_dropout_prob: float = 0.1 + max_position_embeddings: int = 512 + type_vocab_size: int = 2 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + summary_type: str = "first" + summary_use_proj: bool = True + summary_activation: str = "gelu" + summary_last_dropout: float | int = 0.1 + pad_token_id: int | None = 0 + use_cache: bool = True + classifier_dropout: float | int | None = None + is_decoder: bool = False + add_cross_attention: bool = False + bos_token_id: int | None = None + eos_token_id: int | list[int] | None = None + tie_word_embeddings: bool = True __all__ = ["ElectraConfig"] diff --git a/src/transformers/models/emu3/configuration_emu3.py b/src/transformers/models/emu3/configuration_emu3.py index 2df67095dc23..6c9ca4be6a44 100644 --- a/src/transformers/models/emu3/configuration_emu3.py +++ b/src/transformers/models/emu3/configuration_emu3.py @@ -14,12 +14,15 @@ # limitations under the License. 
+from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring @auto_docstring(checkpoint="Emu3-community/Emu3-Chat-hf") +@strict(accept_kwargs=True) class Emu3VQVAEConfig(PreTrainedConfig): r""" out_channels (`int`, *optional*, defaults to 3): @@ -53,43 +56,24 @@ class Emu3VQVAEConfig(PreTrainedConfig): model_type = "emu3_vqgan" base_config_key = "vq_config" - def __init__( - self, - codebook_size: int = 32768, - embed_dim: int = 4, - latent_channels: int = 4, - double_latent: bool = False, - in_channels: int = 3, - out_channels: int = 3, - temporal_downsample_factor: int = 4, - base_channels: int = 256, - channel_multiplier: list[int] = [1, 2, 2, 4], - num_res_blocks: int = 2, - attn_resolutions: list[int] = [3], - hidden_size: int = 1024, - num_attention_heads: int = 1, - attention_dropout: float = 0.0, - **kwargs, - ): - super().__init__(**kwargs) - - self.codebook_size = codebook_size - self.embed_dim = embed_dim - self.latent_channels = latent_channels - self.double_latent = double_latent - self.in_channels = in_channels - self.out_channels = out_channels - self.temporal_downsample_factor = temporal_downsample_factor - self.base_channels = base_channels - self.channel_multiplier = channel_multiplier - self.num_res_blocks = num_res_blocks - self.attn_resolutions = attn_resolutions - self.hidden_size = hidden_size - self.num_attention_heads = num_attention_heads - self.attention_dropout = attention_dropout + codebook_size: int = 32768 + embed_dim: int = 4 + latent_channels: int = 4 + double_latent: bool = False + in_channels: int = 3 + out_channels: int = 3 + temporal_downsample_factor: int = 4 + base_channels: int = 256 + channel_multiplier: list[int] | tuple[int, ...] = (1, 2, 2, 4) + num_res_blocks: int = 2 + attn_resolutions: list[int] | tuple[int, ...] 
= (3,)
+    hidden_size: int = 1024
+    num_attention_heads: int = 1
+    attention_dropout: float | int = 0.0
 @auto_docstring(checkpoint="Emu3-community/Emu3-Chat-hf")
+@strict(accept_kwargs=True)
 class Emu3TextConfig(PreTrainedConfig):
     r"""
     Example:
@@ -112,53 +96,29 @@ class Emu3TextConfig(PreTrainedConfig):
     keys_to_ignore_at_inference = ["past_key_values"]
     default_theta = 1000000.0
-    def __init__(
-        self,
-        vocab_size: int = 184622,
-        hidden_size: int = 4096,
-        intermediate_size: int = 14336,
-        num_hidden_layers: int = 32,
-        num_attention_heads: int = 32,
-        num_key_value_heads: int | None = 8,
-        hidden_act: str = "silu",
-        max_position_embeddings: int = 9216,
-        rms_norm_eps: float = 1e-5,
-        use_cache: bool = True,
-        pad_token_id: int = 151643,
-        bos_token_id: int = 151849,
-        eos_token_id: int = 151850,
-        rope_parameters: RopeParameters | None = None,
-        mlp_bias=False,
-        attention_bias=False,
-        attention_dropout: float = 0.1,
-        initializer_range: float = 0.02,
-        tie_word_embeddings: bool | None = False,
-        **kwargs,
-    ):
-        self.vocab_size = vocab_size
-        self.max_position_embeddings = max_position_embeddings
-        self.hidden_size = hidden_size
-        self.intermediate_size = intermediate_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.num_key_value_heads = num_key_value_heads
-        self.hidden_act = hidden_act
-        self.rms_norm_eps = rms_norm_eps
-        self.use_cache = use_cache
-        self.mlp_bias = mlp_bias
-        self.attention_bias = attention_bias
-        self.initializer_range = initializer_range
-        self.attention_dropout = attention_dropout
-        self.rope_parameters = rope_parameters
-
-        self.pad_token_id = pad_token_id
-        self.bos_token_id = bos_token_id
-        self.eos_token_id = eos_token_id
-        self.tie_word_embeddings = tie_word_embeddings
-        super().__init__(**kwargs)
+    vocab_size: int = 184622
+    hidden_size: int = 4096
+    intermediate_size: int = 14336
+    num_hidden_layers: int = 32
+    num_attention_heads: int = 32
+    num_key_value_heads: int | None = 8
+    hidden_act: str = "silu"
+    max_position_embeddings: int = 9216
+    rms_norm_eps: float = 1e-5
+    use_cache: bool = True
+    pad_token_id: int = 151643
+    bos_token_id: int = 151849
+    eos_token_id: int | list[int] | None = 151850
+    rope_parameters: RopeParameters | dict | None = None
+    mlp_bias: bool = False
+    attention_bias: bool = False
+    attention_dropout: float | int = 0.1
+    initializer_range: float = 0.02
+    tie_word_embeddings: bool = False
 @auto_docstring(checkpoint="Emu3-community/Emu3-Chat-hf")
+@strict(accept_kwargs=True)
 class Emu3Config(PreTrainedConfig):
     r"""
     vocabulary_map (`dict`, *optional*):
@@ -169,31 +129,24 @@ class Emu3Config(PreTrainedConfig):
     keys_to_ignore_at_inference = ["past_key_values"]
     sub_configs = {"text_config": Emu3TextConfig, "vq_config": Emu3VQVAEConfig}
-    def __init__(
-        self,
-        vq_config: dict | Emu3VQVAEConfig = None,
-        text_config: dict | Emu3TextConfig = None,
-        vocabulary_map: dict[int, int] | None = None,
-        tie_word_embeddings: bool | None = False,
-        **kwargs,
-    ):
-        if vq_config is None:
-            vq_config = Emu3VQVAEConfig()
-        elif isinstance(vq_config, dict):
-            vq_config = Emu3VQVAEConfig(**vq_config)
-
-        if text_config is None:
-            text_config = Emu3TextConfig()
-        elif isinstance(text_config, dict):
-            text_config = Emu3TextConfig(**text_config)
-
-        self.vq_config = vq_config
-        self.text_config = text_config
-        self.vocabulary_map = vocabulary_map
-        self.image_token_id = vocabulary_map.get("") if vocabulary_map is not None else None
-        self.tie_word_embeddings = tie_word_embeddings
-
-        
super().__init__(**kwargs) + vq_config: dict | Emu3VQVAEConfig | None = None + text_config: dict | Emu3TextConfig | None = None + vocabulary_map: dict[str, int] | None = None + tie_word_embeddings: bool = False + + def __post_init__(self, **kwargs): + if self.vq_config is None: + self.vq_config = Emu3VQVAEConfig() + elif isinstance(self.vq_config, dict): + self.vq_config = Emu3VQVAEConfig(**self.vq_config) + + if self.text_config is None: + self.text_config = Emu3TextConfig() + elif isinstance(self.text_config, dict): + self.text_config = Emu3TextConfig(**self.text_config) + + self.image_token_id = self.vocabulary_map.get("") if self.vocabulary_map is not None else None + super().__post_init__(**kwargs) __all__ = ["Emu3Config", "Emu3TextConfig", "Emu3VQVAEConfig"] diff --git a/src/transformers/models/encodec/configuration_encodec.py b/src/transformers/models/encodec/configuration_encodec.py index 9c5c0a10093d..8b551f50a931 100644 --- a/src/transformers/models/encodec/configuration_encodec.py +++ b/src/transformers/models/encodec/configuration_encodec.py @@ -16,15 +16,14 @@ import math import numpy as np +from huggingface_hub.dataclasses import strict from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - - -logger = logging.get_logger(__name__) +from ...utils import auto_docstring @auto_docstring(checkpoint="facebook/encodec_24khz") +@strict(accept_kwargs=True) class EncodecConfig(PreTrainedConfig): r""" chunk_length_s (`float`, *optional*): @@ -85,64 +84,41 @@ class EncodecConfig(PreTrainedConfig): model_type = "encodec" - def __init__( - self, - target_bandwidths=[1.5, 3.0, 6.0, 12.0, 24.0], - sampling_rate=24_000, - audio_channels=1, - normalize=False, - chunk_length_s=None, - overlap=None, - hidden_size=128, - num_filters=32, - num_residual_layers=1, - upsampling_ratios=[8, 5, 4, 2], - norm_type="weight_norm", - kernel_size=7, - last_kernel_size=7, - residual_kernel_size=3, - dilation_growth_rate=2, - use_causal_conv=True, - pad_mode="reflect", - compress=2, - num_lstm_layers=2, - trim_right_ratio=1.0, - codebook_size=1024, - codebook_dim=None, - use_conv_shortcut=True, - **kwargs, - ): - self.target_bandwidths = target_bandwidths - self.sampling_rate = sampling_rate - self.audio_channels = audio_channels - self.normalize = normalize - self.chunk_length_s = chunk_length_s - self.overlap = overlap - self.hidden_size = hidden_size - self.num_filters = num_filters - self.num_residual_layers = num_residual_layers - self.upsampling_ratios = upsampling_ratios - self.norm_type = norm_type - self.kernel_size = kernel_size - self.last_kernel_size = last_kernel_size - self.residual_kernel_size = residual_kernel_size - self.dilation_growth_rate = dilation_growth_rate - self.use_causal_conv = use_causal_conv - self.pad_mode = pad_mode - self.compress = compress - self.num_lstm_layers = num_lstm_layers - self.trim_right_ratio = trim_right_ratio - self.codebook_size = codebook_size - self.codebook_dim = codebook_dim if codebook_dim is not None else hidden_size - self.use_conv_shortcut = use_conv_shortcut - + target_bandwidths: list[float] | tuple[float, ...] = (1.5, 3.0, 6.0, 12.0, 24.0) + sampling_rate: int = 24_000 + audio_channels: int = 1 + normalize: bool = False + chunk_length_s: int | float | None = None + overlap: float | None = None + hidden_size: int = 128 + num_filters: int = 32 + num_residual_layers: int = 1 + upsampling_ratios: list[int] | tuple[int, ...] 
= (8, 5, 4, 2) + norm_type: str = "weight_norm" + kernel_size: int = 7 + last_kernel_size: int = 7 + residual_kernel_size: int = 3 + dilation_growth_rate: int = 2 + use_causal_conv: bool = True + pad_mode: str = "reflect" + compress: int = 2 + num_lstm_layers: int = 2 + trim_right_ratio: float = 1.0 + codebook_size: int = 1024 + codebook_dim: int | None = None + use_conv_shortcut: bool = True + + def __post_init__(self, **kwargs): + self.codebook_dim = self.codebook_dim if self.codebook_dim is not None else self.hidden_size + super().__post_init__(**kwargs) + + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" if self.norm_type not in ["weight_norm", "time_group_norm"]: raise ValueError( f'self.norm_type must be one of `"weight_norm"`, `"time_group_norm"`), got {self.norm_type}' ) - super().__init__(**kwargs) - # This is a property because you might want to change the chunk_length_s on the fly @property def chunk_length(self) -> int | None: diff --git a/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py b/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py index dc4e58bcd631..6eb84471212a 100644 --- a/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py +++ b/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py @@ -14,6 +14,8 @@ # limitations under the License. +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring, logging from ..auto import AutoConfig @@ -23,6 +25,7 @@ @auto_docstring(checkpoint="") +@strict(accept_kwargs=True) class EncoderDecoderConfig(PreTrainedConfig): r""" Examples: @@ -58,18 +61,17 @@ class EncoderDecoderConfig(PreTrainedConfig): sub_configs = {"encoder": AutoConfig, "decoder": AutoConfig} has_no_defaults_at_init = True - def __init__( - self, - pad_token_id=None, - decoder_start_token_id=None, - **kwargs, - ): - super().__init__(**kwargs) + pad_token_id: int | None = None + decoder_start_token_id: int | None = None + is_encoder_decoder: int | None = True + + def __post_init__(self, **kwargs): if "encoder" not in kwargs or "decoder" not in kwargs: raise ValueError( - f"A configuration of type {self.model_type} cannot be instantiated because " - f"both `encoder` and `decoder` sub-configurations were not passed, only {kwargs}" + f"A configuration of type {self.model_type} cannot be instantiated because not both `encoder` and" + f" `decoder` sub-configurations are passed, but only {kwargs}" ) + encoder_config = kwargs.pop("encoder") encoder_model_type = encoder_config.pop("model_type") decoder_config = kwargs.pop("decoder") @@ -77,9 +79,7 @@ def __init__( self.encoder = AutoConfig.for_model(encoder_model_type, **encoder_config) self.decoder = AutoConfig.for_model(decoder_model_type, **decoder_config) - self.is_encoder_decoder = True - self.pad_token_id = pad_token_id - self.decoder_start_token_id = decoder_start_token_id + super().__post_init__(**kwargs) @classmethod def from_encoder_decoder_configs( diff --git a/src/transformers/models/eomt/configuration_eomt.py b/src/transformers/models/eomt/configuration_eomt.py index 2225b4b77bb9..e7adcf5fa2ad 100644 --- a/src/transformers/models/eomt/configuration_eomt.py +++ b/src/transformers/models/eomt/configuration_eomt.py @@ -17,11 +17,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring @auto_docstring(checkpoint="tue-mps/coco_panoptic_eomt_large_640") +@strict(accept_kwargs=True) class EomtConfig(PreTrainedConfig): r""" num_upscale_blocks (`int`, *optional*, defaults to 2): @@ -66,64 +69,32 @@ class EomtConfig(PreTrainedConfig): model_type = "eomt" - def __init__( - self, - hidden_size=1024, - num_hidden_layers=24, - num_attention_heads=16, - mlp_ratio=4, - hidden_act="gelu", - hidden_dropout_prob=0.0, - initializer_range=0.02, - layer_norm_eps=1e-6, - image_size=640, - patch_size=16, - num_channels=3, - layerscale_value=1.0, - drop_path_rate=0.0, - num_upscale_blocks=2, - attention_dropout=0.0, - use_swiglu_ffn=False, - num_blocks=4, - no_object_weight: float = 0.1, - class_weight: float = 2.0, - mask_weight: float = 5.0, - dice_weight: float = 5.0, - train_num_points: int = 12544, - oversample_ratio: float = 3.0, - importance_sample_ratio: float = 0.75, - num_queries=200, - num_register_tokens=4, - **kwargs, - ): - self.mlp_ratio = mlp_ratio - self.attention_dropout = attention_dropout - self.layerscale_value = layerscale_value - self.drop_path_rate = drop_path_rate - self.num_upscale_blocks = num_upscale_blocks - self.use_swiglu_ffn = use_swiglu_ffn - self.num_blocks = num_blocks - self.no_object_weight = no_object_weight - self.class_weight = class_weight - self.mask_weight = mask_weight - self.dice_weight = dice_weight - self.train_num_points = train_num_points - self.oversample_ratio = oversample_ratio - self.importance_sample_ratio = importance_sample_ratio - self.num_queries = num_queries - self.num_register_tokens = num_register_tokens - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - - super().__init__(**kwargs) + hidden_size: int = 1024 + num_hidden_layers: int = 24 + num_attention_heads: int = 16 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.0 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-6 + image_size: int | list[int] | tuple[int, int] = 640 + patch_size: int | list[int] | tuple[int, int] = 16 + num_channels: int = 3 + mlp_ratio: int = 4 + layerscale_value: float = 1.0 + drop_path_rate: float = 0.0 + num_upscale_blocks: int = 2 + attention_dropout: float | int = 0.0 + use_swiglu_ffn: bool = False + num_blocks: int = 4 + no_object_weight: float = 0.1 + class_weight: float = 2.0 + mask_weight: float = 5.0 + dice_weight: float = 5.0 + train_num_points: int = 12544 + oversample_ratio: float = 3.0 + importance_sample_ratio: float = 0.75 + num_queries: int = 200 + num_register_tokens: int = 4 __all__ = ["EomtConfig"] diff --git a/src/transformers/models/eomt/modular_eomt.py b/src/transformers/models/eomt/modular_eomt.py index 730927cb0b4b..5d9714c822be 100644 --- a/src/transformers/models/eomt/modular_eomt.py +++ b/src/transformers/models/eomt/modular_eomt.py @@ -18,6 +18,7 @@ import torch import torch.nn.functional as F +from huggingface_hub.dataclasses import strict from torch import Tensor, nn from ... 
import initialization as init @@ -49,6 +50,7 @@ @auto_docstring(checkpoint="tue-mps/coco_panoptic_eomt_large_640") +@strict(accept_kwargs=True) class EomtConfig(ViTConfig): r""" num_upscale_blocks (`int`, *optional*, defaults to 2): @@ -93,73 +95,42 @@ class EomtConfig(ViTConfig): model_type = "eomt" - def __init__( - self, - hidden_size=1024, - num_hidden_layers=24, - num_attention_heads=16, - mlp_ratio=4, - hidden_act="gelu", - hidden_dropout_prob=0.0, - initializer_range=0.02, - layer_norm_eps=1e-6, - image_size=640, - patch_size=16, - num_channels=3, - layerscale_value=1.0, - drop_path_rate=0.0, - num_upscale_blocks=2, - attention_dropout=0.0, - use_swiglu_ffn=False, - num_blocks=4, - no_object_weight: float = 0.1, - class_weight: float = 2.0, - mask_weight: float = 5.0, - dice_weight: float = 5.0, - train_num_points: int = 12544, - oversample_ratio: float = 3.0, - importance_sample_ratio: float = 0.75, - num_queries=200, - num_register_tokens=4, - **kwargs, - ): - self.mlp_ratio = mlp_ratio - self.attention_dropout = attention_dropout - self.layerscale_value = layerscale_value - self.drop_path_rate = drop_path_rate - self.num_upscale_blocks = num_upscale_blocks - self.use_swiglu_ffn = use_swiglu_ffn - self.num_blocks = num_blocks - self.no_object_weight = no_object_weight - self.class_weight = class_weight - self.mask_weight = mask_weight - self.dice_weight = dice_weight - self.train_num_points = train_num_points - self.oversample_ratio = oversample_ratio - self.importance_sample_ratio = importance_sample_ratio - self.num_queries = num_queries - self.num_register_tokens = num_register_tokens - - super().__init__( - hidden_size=hidden_size, - num_hidden_layers=num_hidden_layers, - num_attention_heads=num_attention_heads, - hidden_dropout_prob=hidden_dropout_prob, - hidden_act=hidden_act, - initializer_range=initializer_range, - layer_norm_eps=layer_norm_eps, - image_size=image_size, - patch_size=patch_size, - num_channels=num_channels, - **kwargs, - ) - - del self.intermediate_size - del self.qkv_bias - del self.pooler_act - del self.pooler_output_size - del self.encoder_stride - del self.attention_probs_dropout_prob + hidden_size: int = 1024 + num_hidden_layers: int = 24 + num_attention_heads: int = 16 + mlp_ratio: int = 4 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.0 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-6 + image_size: int | list[int] | tuple[int, int] = 640 + patch_size: int | list[int] | tuple[int, int] = 16 + num_channels: int = 3 + layerscale_value: float = 1.0 + drop_path_rate: float = 0.0 + num_upscale_blocks: int = 2 + attention_dropout: float | int = 0.0 + use_swiglu_ffn: bool = False + num_blocks: int = 4 + no_object_weight: float = 0.1 + class_weight: float = 2.0 + mask_weight: float = 5.0 + dice_weight: float = 5.0 + train_num_points: int = 12544 + oversample_ratio: float = 3.0 + importance_sample_ratio: float = 0.75 + num_queries: int = 200 + num_register_tokens: int = 4 + + intermediate_size = AttributeError() + qkv_bias = AttributeError() + pooler_act = AttributeError() + pooler_output_size = AttributeError() + encoder_stride = AttributeError() + attention_probs_dropout_prob = AttributeError() + + def __post_init__(self, **kwargs): + raise AttributeError("Not needed for Eomt") @dataclass diff --git a/src/transformers/models/eomt_dinov3/configuration_eomt_dinov3.py b/src/transformers/models/eomt_dinov3/configuration_eomt_dinov3.py index a8b656ff9cfd..9f48577e5d06 100644 --- 
a/src/transformers/models/eomt_dinov3/configuration_eomt_dinov3.py +++ b/src/transformers/models/eomt_dinov3/configuration_eomt_dinov3.py @@ -17,12 +17,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring @auto_docstring(checkpoint="tue-mps/coco_panoptic_eomt_large_640_dinov3") +@strict(accept_kwargs=True) class EomtDinov3Config(PreTrainedConfig): r""" layerscale_value (`float`, *optional*, defaults to 1.0): @@ -62,98 +65,43 @@ class EomtDinov3Config(PreTrainedConfig): """ model_type = "eomt_dinov3" - default_theta = 100.0 - def __init__( - self, - hidden_size=1024, - num_hidden_layers=24, - num_attention_heads=16, - intermediate_size=4096, - hidden_act="gelu", - hidden_dropout_prob=0.0, - initializer_range=0.02, - layer_norm_eps=1e-6, - image_size=640, - patch_size=16, - num_channels=3, - layerscale_value=1.0, - drop_path_rate=0.0, - num_upscale_blocks=2, - attention_dropout=0.0, - num_blocks=4, - no_object_weight: float = 0.1, - class_weight: float = 2.0, - mask_weight: float = 5.0, - dice_weight: float = 5.0, - train_num_points: int = 12544, - oversample_ratio: float = 3.0, - importance_sample_ratio: float = 0.75, - num_queries=200, - num_register_tokens=4, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - query_bias: bool = True, - key_bias: bool = False, - value_bias: bool = True, - proj_bias: bool = True, - mlp_bias: bool = True, - use_gated_mlp: bool = False, - pos_embed_shift: float | None = None, - pos_embed_jitter: float | None = None, - pos_embed_rescale: float | None = 2.0, - **kwargs, - ): - self.intermediate_size = intermediate_size - self.attention_dropout = attention_dropout - self.layerscale_value = layerscale_value - self.drop_path_rate = drop_path_rate - self.num_upscale_blocks = num_upscale_blocks - self.num_blocks = num_blocks - self.no_object_weight = no_object_weight - self.class_weight = class_weight - self.mask_weight = mask_weight - self.dice_weight = dice_weight - self.train_num_points = train_num_points - self.oversample_ratio = oversample_ratio - self.importance_sample_ratio = importance_sample_ratio - self.num_queries = num_queries - self.num_register_tokens = num_register_tokens - self.rope_parameters = rope_parameters - self.query_bias = query_bias - self.key_bias = key_bias - self.value_bias = value_bias - self.proj_bias = proj_bias - self.mlp_bias = mlp_bias - self.use_gated_mlp = use_gated_mlp - self.pos_embed_shift = pos_embed_shift - self.pos_embed_jitter = pos_embed_jitter - self.pos_embed_rescale = pos_embed_rescale - self.attention_dropout = attention_dropout - self.layerscale_value = layerscale_value - self.drop_path_rate = drop_path_rate - self.num_upscale_blocks = num_upscale_blocks - self.num_blocks = num_blocks - self.no_object_weight = no_object_weight - self.class_weight = class_weight - self.mask_weight = mask_weight - self.dice_weight = dice_weight - self.train_num_points = train_num_points - self.oversample_ratio = oversample_ratio - self.importance_sample_ratio = importance_sample_ratio - self.num_queries = num_queries - self.num_register_tokens = num_register_tokens - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - 
self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - - super().__init__(**kwargs) + hidden_size: int = 1024 + num_hidden_layers: int = 24 + num_attention_heads: int = 16 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.0 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-6 + image_size: int | list[int] | tuple[int, int] = 640 + patch_size: int | list[int] | tuple[int, int] = 16 + num_channels: int = 3 + layerscale_value: float = 1.0 + drop_path_rate: float = 0.0 + num_upscale_blocks: int = 2 + attention_dropout: float | int = 0.0 + num_blocks: int = 4 + no_object_weight: float = 0.1 + class_weight: float = 2.0 + mask_weight: float = 5.0 + dice_weight: float = 5.0 + train_num_points: int = 12544 + oversample_ratio: float = 3.0 + importance_sample_ratio: float = 0.75 + num_queries: int = 200 + num_register_tokens: int = 4 + default_theta = 100.0 + intermediate_size: int = 4096 + rope_parameters: RopeParameters | dict | None = None + query_bias: bool = True + key_bias: bool = False + value_bias: bool = True + proj_bias: bool = True + mlp_bias: bool = True + use_gated_mlp: bool = False + pos_embed_shift: float | None = None + pos_embed_jitter: float | None = None + pos_embed_rescale: float | None = 2.0 __all__ = ["EomtDinov3Config"] diff --git a/src/transformers/models/eomt_dinov3/modular_eomt_dinov3.py b/src/transformers/models/eomt_dinov3/modular_eomt_dinov3.py index c091ef69f857..81ee67057b54 100644 --- a/src/transformers/models/eomt_dinov3/modular_eomt_dinov3.py +++ b/src/transformers/models/eomt_dinov3/modular_eomt_dinov3.py @@ -18,6 +18,7 @@ import torch import torch.nn.functional as F +from huggingface_hub.dataclasses import strict from torch import Tensor, nn from ... 
import initialization as init @@ -47,6 +48,7 @@ @auto_docstring(checkpoint="tue-mps/coco_panoptic_eomt_large_640_dinov3") +@strict(accept_kwargs=True) class EomtDinov3Config(EomtConfig): r""" layerscale_value (`float`, *optional*, defaults to 1.0): @@ -88,92 +90,44 @@ class EomtDinov3Config(EomtConfig): model_type = "eomt_dinov3" default_theta = 100.0 - def __init__( - self, - hidden_size=1024, - num_hidden_layers=24, - num_attention_heads=16, - intermediate_size=4096, - hidden_act="gelu", - hidden_dropout_prob=0.0, - initializer_range=0.02, - layer_norm_eps=1e-6, - image_size=640, - patch_size=16, - num_channels=3, - layerscale_value=1.0, - drop_path_rate=0.0, - num_upscale_blocks=2, - attention_dropout=0.0, - num_blocks=4, - no_object_weight: float = 0.1, - class_weight: float = 2.0, - mask_weight: float = 5.0, - dice_weight: float = 5.0, - train_num_points: int = 12544, - oversample_ratio: float = 3.0, - importance_sample_ratio: float = 0.75, - num_queries=200, - num_register_tokens=4, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - query_bias: bool = True, - key_bias: bool = False, - value_bias: bool = True, - proj_bias: bool = True, - mlp_bias: bool = True, - use_gated_mlp: bool = False, - pos_embed_shift: float | None = None, - pos_embed_jitter: float | None = None, - pos_embed_rescale: float | None = 2.0, - **kwargs, - ): - self.intermediate_size = intermediate_size - self.attention_dropout = attention_dropout - self.layerscale_value = layerscale_value - self.drop_path_rate = drop_path_rate - self.num_upscale_blocks = num_upscale_blocks - self.num_blocks = num_blocks - self.no_object_weight = no_object_weight - self.class_weight = class_weight - self.mask_weight = mask_weight - self.dice_weight = dice_weight - self.train_num_points = train_num_points - self.oversample_ratio = oversample_ratio - self.importance_sample_ratio = importance_sample_ratio - self.num_queries = num_queries - self.num_register_tokens = num_register_tokens - self.rope_parameters = rope_parameters - self.query_bias = query_bias - self.key_bias = key_bias - self.value_bias = value_bias - self.proj_bias = proj_bias - self.mlp_bias = mlp_bias - self.use_gated_mlp = use_gated_mlp - self.pos_embed_shift = pos_embed_shift - self.pos_embed_jitter = pos_embed_jitter - self.pos_embed_rescale = pos_embed_rescale - - super().__init__( - hidden_size=hidden_size, - num_hidden_layers=num_hidden_layers, - num_attention_heads=num_attention_heads, - hidden_dropout_prob=hidden_dropout_prob, - hidden_act=hidden_act, - initializer_range=initializer_range, - layer_norm_eps=layer_norm_eps, - image_size=image_size, - patch_size=patch_size, - num_channels=num_channels, - **kwargs, - ) - - del self.qkv_bias - del self.pooler_act - del self.pooler_output_size - del self.encoder_stride - del self.attention_probs_dropout_prob - del self.mlp_ratio - del self.use_swiglu_ffn + hidden_size: int = 1024 + num_hidden_layers: int = 24 + num_attention_heads: int = 16 + intermediate_size: int = 4096 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.0 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-6 + image_size: int | list[int] | tuple[int, int] = 640 + patch_size: int | list[int] | tuple[int, int] = 16 + num_channels: int = 3 + layerscale_value: float = 1.0 + drop_path_rate: float = 0.0 + num_upscale_blocks: int = 2 + attention_dropout: float | int = 0.0 + num_blocks: int = 4 + no_object_weight: float = 0.1 + class_weight: float = 2.0 + mask_weight: float = 5.0 + dice_weight: float = 5.0 + 
train_num_points: int = 12544 + oversample_ratio: float = 3.0 + importance_sample_ratio: float = 0.75 + num_queries: int = 200 + num_register_tokens: int = 4 + rope_parameters: RopeParameters | dict | None = None + query_bias: bool = True + key_bias: bool = False + value_bias: bool = True + proj_bias: bool = True + mlp_bias: bool = True + use_gated_mlp: bool = False + pos_embed_shift: float | None = None + pos_embed_jitter: float | None = None + pos_embed_rescale: float | None = 2.0 + + mlp_ratio = AttributeError() + use_swiglu_ffn = AttributeError() class EomtDinov3Attention(DINOv3ViTAttention): diff --git a/src/transformers/models/ernie/configuration_ernie.py b/src/transformers/models/ernie/configuration_ernie.py index 60ff10b785e0..e3a52aa47d05 100644 --- a/src/transformers/models/ernie/configuration_ernie.py +++ b/src/transformers/models/ernie/configuration_ernie.py @@ -14,14 +14,14 @@ # limitations under the License. """ERNIE model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="nghuyong/ernie-3.0-base-zh") +@strict(accept_kwargs=True) class ErnieConfig(PreTrainedConfig): r""" task_type_vocab_size (`int`, *optional*, defaults to 3): @@ -46,56 +46,28 @@ class ErnieConfig(PreTrainedConfig): model_type = "ernie" - def __init__( - self, - vocab_size=30522, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=2, - task_type_vocab_size=3, - use_task_id=False, - initializer_range=0.02, - layer_norm_eps=1e-12, - pad_token_id=0, - use_cache=True, - classifier_dropout=None, - is_decoder=False, - add_cross_attention=False, - bos_token_id=None, - eos_token_id=None, - tie_word_embeddings=True, - **kwargs, - ): - super().__init__(**kwargs) - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - - self.is_decoder = is_decoder - self.add_cross_attention = add_cross_attention - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.task_type_vocab_size = task_type_vocab_size - self.use_task_id = use_task_id - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.use_cache = use_cache - self.classifier_dropout = classifier_dropout + vocab_size: int = 30522 + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.1 + attention_probs_dropout_prob: float = 0.1 + max_position_embeddings: int = 512 + type_vocab_size: int = 2 + task_type_vocab_size: int = 3 + use_task_id: bool = False + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + pad_token_id: int | None = 0 + use_cache: bool = True + 
classifier_dropout: float | int | None = None + is_decoder: bool = False + add_cross_attention: bool = False + bos_token_id: int | None = None + eos_token_id: int | list[int] | None = None + tie_word_embeddings: bool = True __all__ = ["ErnieConfig"] diff --git a/src/transformers/models/ernie4_5/configuration_ernie4_5.py b/src/transformers/models/ernie4_5/configuration_ernie4_5.py index 95b6fc19f50a..039be793586b 100644 --- a/src/transformers/models/ernie4_5/configuration_ernie4_5.py +++ b/src/transformers/models/ernie4_5/configuration_ernie4_5.py @@ -13,12 +13,15 @@ # limitations under the License. """Ernie 4.5 model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring @auto_docstring(checkpoint="baidu/ERNIE-4.5-0.3B-PT") +@strict(accept_kwargs=True) class Ernie4_5Config(PreTrainedConfig): r""" use_bias (`bool`, *optional*, defaults to `False`): @@ -58,53 +61,31 @@ class Ernie4_5Config(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size: int | None = 103424, - hidden_size: int | None = 1024, - intermediate_size: int | None = 3072, - num_hidden_layers: int | None = 18, - num_attention_heads: int | None = 16, - num_key_value_heads: int | None = 2, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 131072, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-05, - use_cache: int | None = True, - pad_token_id: int | None = 0, - bos_token_id: int | None = 1, - eos_token_id: int | None = 2, - tie_word_embeddings: bool | None = True, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - use_bias: bool | None = False, - head_dim: int | None = 128, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads + vocab_size: int = 103424 + hidden_size: int = 1024 + intermediate_size: int = 3072 + num_hidden_layers: int = 18 + num_attention_heads: int = 16 + num_key_value_heads: int | None = 2 + hidden_act: str = "silu" + max_position_embeddings: int = 131072 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-05 + use_cache: int | None = True + pad_token_id: int | None = 0 + bos_token_id: int | None = 1 + eos_token_id: int | list[int] | None = 2 + tie_word_embeddings: bool = True + rope_parameters: RopeParameters | dict | None = None + use_bias: bool | None = False + head_dim: int | None = 128 - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.use_bias = use_bias - self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads - self.rope_parameters = rope_parameters + def __post_init__(self, **kwargs): + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - super().__init__(**kwargs) + self.head_dim = self.head_dim if self.head_dim 
is not None else self.hidden_size // self.num_attention_heads + super().__post_init__(**kwargs) __all__ = ["Ernie4_5Config"] diff --git a/src/transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py b/src/transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py index 1ae3ba58fca9..f55c61c4ecf0 100644 --- a/src/transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +++ b/src/transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py @@ -13,15 +13,15 @@ # limitations under the License. """Ernie 4.5 MoE model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters -from ...utils import auto_docstring, logging - - -logger = logging.get_logger(__name__) +from ...utils import auto_docstring @auto_docstring(checkpoint="baidu/ERNIE-4.5-21B-A3B-PT") +@strict(accept_kwargs=True) class Ernie4_5_MoeConfig(PreTrainedConfig): r""" use_bias (`bool`, *optional*, defaults to `False`): @@ -83,68 +83,39 @@ class Ernie4_5_MoeConfig(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size: int | None = 103424, - pad_token_id: int | None = 0, - bos_token_id: int | None = 1, - eos_token_id: int | None = 2, - hidden_size: int | None = 2560, - intermediate_size: int | None = 12288, - num_hidden_layers: int | None = 28, - num_attention_heads: int | None = 20, - num_key_value_heads: int | None = 4, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 131072, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-5, - use_cache: bool | None = True, - tie_word_embeddings: bool | None = True, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - use_bias: int | None = False, - moe_intermediate_size: int | None = 1536, - moe_k: int | None = 6, - moe_num_experts: int | None = 64, - moe_num_shared_experts: int | None = 2, - moe_layer_start_index: int | None = 1, - moe_layer_end_index: int | None = -1, - moe_layer_interval: int | None = 1, - moe_norm_min: int | None = 1e-12, - output_router_logits: bool | None = False, - router_aux_loss_coef: float | None = 0.001, - **kwargs, - ): - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.max_position_embeddings = max_position_embeddings - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.use_bias = use_bias - - # MoE arguments - self.moe_intermediate_size = moe_intermediate_size - self.moe_k = moe_k - self.moe_num_experts = moe_num_experts - self.moe_num_shared_experts = moe_num_shared_experts - self.moe_layer_start_index = moe_layer_start_index - self.moe_layer_end_index = self.num_hidden_layers - 1 if moe_layer_end_index == -1 else moe_layer_end_index - self.moe_layer_interval = moe_layer_interval - self.moe_norm_min = moe_norm_min - self.output_router_logits = output_router_logits - self.router_aux_loss_coef = router_aux_loss_coef - self.rope_parameters = rope_parameters - - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - super().__init__(**kwargs) + vocab_size: int = 103424 + pad_token_id: int | None = 0 + bos_token_id: int | 
None = 1 + eos_token_id: int | list[int] | None = 2 + hidden_size: int = 2560 + intermediate_size: int = 12288 + num_hidden_layers: int = 28 + num_attention_heads: int = 20 + num_key_value_heads: int | None = 4 + hidden_act: str = "silu" + max_position_embeddings: int = 131072 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-5 + use_cache: bool = True + tie_word_embeddings: bool = True + rope_parameters: RopeParameters | dict | None = None + use_bias: int | None = False + moe_intermediate_size: int = 1536 + moe_k: int | None = 6 + moe_num_experts: int | None = 64 + moe_num_shared_experts: int | None = 2 + moe_layer_start_index: int | None = 1 + moe_layer_end_index: int | None = -1 + moe_layer_interval: int | None = 1 + moe_norm_min: float | None = 1e-12 + output_router_logits: bool | None = False + router_aux_loss_coef: float | None = 0.001 + + def __post_init__(self, **kwargs): + self.moe_layer_end_index = ( + self.num_hidden_layers - 1 if self.moe_layer_end_index == -1 else self.moe_layer_end_index + ) + super().__post_init__(**kwargs) __all__ = ["Ernie4_5_MoeConfig"] diff --git a/src/transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py b/src/transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py index 4134a059731f..14decd7872e1 100644 --- a/src/transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +++ b/src/transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py @@ -17,7 +17,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from ...configuration_utils import PreTrainedConfig, layer_type_validation +from huggingface_hub.dataclasses import strict + +from ...configuration_utils import PreTrainedConfig +from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring, logging @@ -25,6 +28,7 @@ @auto_docstring(checkpoint="baidu/ERNIE-4.5-VL-28B-A3B-PT") +@strict(accept_kwargs=True) class Ernie4_5_VLMoeVisionConfig(PreTrainedConfig): r""" temporal_merge_size (`int`, *optional*, defaults to 2): @@ -34,45 +38,29 @@ class Ernie4_5_VLMoeVisionConfig(PreTrainedConfig): model_type = "ernie4_5_vl_moe_vision" base_config_key = "vision_config" + depth: int = 32 + + hidden_size: int = 1280 + hidden_act: str = "quick_gelu" + num_heads: int = 16 + in_channels: int = 3 + patch_size: int | list[int] | tuple[int, int] = 14 + spatial_merge_size: int = 2 + initializer_range: float = 0.02 + base_model_tp_plan = { "blocks.*.attn.qkv": "colwise", "blocks.*.attn.proj": "rowwise", "blocks.*.mlp.fc1": "colwise", "blocks.*.mlp.fc2": "rowwise", } - - def __init__( - self, - depth=32, - hidden_size=1280, - hidden_act="quick_gelu", - intermediate_size=4 * 1280, - num_heads=16, - in_channels=3, - patch_size=14, - spatial_merge_size=2, - temporal_merge_size=2, - rms_norm_eps=1e-6, - initializer_range=0.02, - **kwargs, - ): - super().__init__(**kwargs) - - self.depth = depth - self.hidden_size = hidden_size - self.hidden_act = hidden_act - self.num_heads = num_heads - self.in_channels = in_channels - self.patch_size = patch_size - self.spatial_merge_size = spatial_merge_size - self.initializer_range = initializer_range - - self.intermediate_size = intermediate_size - self.temporal_merge_size = temporal_merge_size - self.rms_norm_eps = rms_norm_eps + intermediate_size: int = 4 * 1280 + temporal_merge_size: int = 2 + rms_norm_eps: float = 1e-6 @auto_docstring(checkpoint="baidu/ERNIE-4.5-VL-28B-A3B-PT") 
+@strict(accept_kwargs=True) class Ernie4_5_VLMoeTextConfig(PreTrainedConfig): r""" use_bias (`bool`, *optional*, defaults to `False`): @@ -111,73 +99,48 @@ class Ernie4_5_VLMoeTextConfig(PreTrainedConfig): "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), "norm": (["hidden_states"], ["hidden_states"]), } + + vocab_size: int = 103424 + pad_token_id: int | None = None + bos_token_id: int | None = None + eos_token_id: int | list[int] | None = None + hidden_size: int = 2560 + intermediate_size: int = 12288 + num_hidden_layers: int = 28 + num_attention_heads: int = 20 + num_key_value_heads: int | None = 4 + hidden_act: str = "silu" + max_position_embeddings: int = 131072 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-5 + use_cache: bool = True + tie_word_embeddings: bool = True + rope_parameters: RopeParameters | dict | None = None + use_bias: int | None = False + moe_intermediate_size: list[int] | None = None + moe_k: int | None = 6 + moe_num_experts: int | None = 64 + moe_num_shared_experts: int | None = 2 + moe_norm_min: float | None = 1e-12 + output_router_logits: bool | None = False + router_aux_loss_coef: float | None = 0.001 base_config_key = "text_config" + ignore_keys_at_rope_validation = {"mrope_section"} + + mlp_layer_types: list[str] | None = None - def __init__( - self, - vocab_size=103424, - hidden_size=2560, - intermediate_size=12288, - num_hidden_layers=28, - num_attention_heads=20, - num_key_value_heads=4, - hidden_act="silu", - max_position_embeddings=131072, - initializer_range=0.02, - rms_norm_eps=1e-5, - use_cache=True, - use_bias=False, - rope_parameters=None, - mlp_layer_types=None, - moe_intermediate_size=None, - moe_k=6, - moe_num_experts=64, - moe_num_shared_experts=2, - moe_norm_min=1e-12, - output_router_logits=False, - router_aux_loss_coef=0.001, - pad_token_id=None, - eos_token_id=None, - bos_token_id=None, - **kwargs, - ): - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.max_position_embeddings = max_position_embeddings - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.use_bias = use_bias - self.rope_parameters = rope_parameters - - # Default to MoE from the second layer and on - self.mlp_layer_types = mlp_layer_types + def __post_init__(self, **kwargs): if self.mlp_layer_types is None: self.mlp_layer_types = ["dense"] + ["sparse"] * (self.num_hidden_layers - 1) - layer_type_validation(self.mlp_layer_types, self.num_hidden_layers, attention=False) - self.moe_intermediate_size = moe_intermediate_size if self.moe_intermediate_size is None: self.moe_intermediate_size = [1536, 512] - self.moe_k = moe_k - self.moe_num_experts = moe_num_experts - self.moe_num_shared_experts = moe_num_shared_experts - self.moe_norm_min = moe_norm_min - self.output_router_logits = output_router_logits - self.router_aux_loss_coef = router_aux_loss_coef - self.pad_token_id = pad_token_id - self.eos_token_id = eos_token_id - self.bos_token_id = bos_token_id - super().__init__(ignore_keys_at_rope_validation={"mrope_section"}, **kwargs) + super().__post_init__(**kwargs) @auto_docstring(checkpoint="baidu/ERNIE-4.5-VL-28B-A3B-PT") +@strict(accept_kwargs=True) class Ernie4_5_VLMoeConfig(PreTrainedConfig): r""" image_start_token_id (`int`, *optional*, defaults to 
101304): @@ -212,42 +175,28 @@ class Ernie4_5_VLMoeConfig(PreTrainedConfig): sub_configs = {"vision_config": Ernie4_5_VLMoeVisionConfig, "text_config": Ernie4_5_VLMoeTextConfig} keys_to_ignore_at_inference = ["past_key_values"] - def __init__( - self, - text_config=None, - vision_config=None, - image_start_token_id=101304, - image_end_token_id=101305, - image_token_id=100295, - video_start_token_id=101306, - video_end_token_id=101307, - video_token_id=103367, - tie_word_embeddings=True, - **kwargs, - ): - if isinstance(vision_config, dict): - self.vision_config = self.sub_configs["vision_config"](**vision_config) - elif isinstance(vision_config, Ernie4_5_VLMoeVisionConfig): - self.vision_config = vision_config - elif vision_config is None: + text_config: dict | PreTrainedConfig | None = None + vision_config: dict | PreTrainedConfig | None = None + image_start_token_id: int = 101304 + image_end_token_id: int = 101305 + image_token_id: int = 100295 + video_start_token_id: int = 101306 + video_end_token_id: int = 101307 + video_token_id: int = 103367 + tie_word_embeddings: bool = True + + def __post_init__(self, **kwargs): + if isinstance(self.vision_config, dict): + self.vision_config = self.sub_configs["vision_config"](**self.vision_config) + elif self.vision_config is None: self.vision_config = self.sub_configs["vision_config"]() - if isinstance(text_config, dict): - self.text_config = self.sub_configs["text_config"](**text_config) - elif isinstance(text_config, Ernie4_5_VLMoeTextConfig): - self.text_config = text_config - elif text_config is None: + if isinstance(self.text_config, dict): + self.text_config = self.sub_configs["text_config"](**self.text_config) + elif self.text_config is None: self.text_config = self.sub_configs["text_config"](**kwargs) - self.image_start_token_id = image_start_token_id - self.image_end_token_id = image_end_token_id - self.image_token_id = image_token_id - self.video_start_token_id = video_start_token_id - self.video_end_token_id = video_end_token_id - self.video_token_id = video_token_id - self.tie_word_embeddings = tie_word_embeddings - - super().__init__(**kwargs) + super().__post_init__(**kwargs) class Ernie4_5_VL_MoeConfig(Ernie4_5_VLMoeConfig): diff --git a/src/transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py b/src/transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py index 177db68bce68..17f40cf5ae30 100644 --- a/src/transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +++ b/src/transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py @@ -830,7 +830,7 @@ def forward(self, x) -> torch.Tensor: class Ernie4_5_VLMoePatchEmbed(nn.Module): def __init__( self, - patch_size: int = 14, + patch_size: int | list[int] | tuple[int, int] = 14, in_channels: int = 3, embed_dim: int = 1152, ) -> None: diff --git a/src/transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py b/src/transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py index 88c2c93fca11..935f9f968004 100644 --- a/src/transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +++ b/src/transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py @@ -21,10 +21,11 @@ import torch import torch.nn as nn import torch.nn.functional as F +from huggingface_hub.dataclasses import strict from ... 
import initialization as init from ...cache_utils import Cache, DynamicCache -from ...configuration_utils import PreTrainedConfig, layer_type_validation +from ...configuration_utils import PreTrainedConfig from ...generation import GenerationMixin from ...image_processing_utils import BaseImageProcessor, BatchFeature from ...image_processing_utils_fast import ( @@ -91,6 +92,7 @@ @auto_docstring(checkpoint="baidu/ERNIE-4.5-VL-28B-A3B-PT") +@strict(accept_kwargs=True) class Ernie4_5_VLMoeVisionConfig(Qwen2VLVisionConfig): r""" temporal_merge_size (`int`, *optional*, defaults to 2): @@ -106,47 +108,19 @@ class Ernie4_5_VLMoeVisionConfig(Qwen2VLVisionConfig): "blocks.*.mlp.fc2": "rowwise", } - def __init__( - self, - depth=32, - hidden_size=1280, - hidden_act="quick_gelu", - intermediate_size=4 * 1280, - num_heads=16, - in_channels=3, - patch_size=14, - spatial_merge_size=2, - temporal_merge_size=2, - rms_norm_eps=1e-6, - initializer_range=0.02, - **kwargs, - ): - super().__init__( - depth=depth, - hidden_size=hidden_size, - hidden_act=hidden_act, - intermediate_size=intermediate_size, - num_heads=num_heads, - in_channels=in_channels, - patch_size=patch_size, - spatial_merge_size=spatial_merge_size, - temporal_merge_size=temporal_merge_size, - rms_norm_eps=rms_norm_eps, - initializer_range=initializer_range, - **kwargs, - ) - - del self.embed_dim # noqa: F821 - del self.mlp_ratio # noqa: F821 - del self.temporal_patch_size # noqa: F821 + hidden_size: int = 1280 + intermediate_size: int = 4 * 1280 + temporal_merge_size: int = 2 + rms_norm_eps: float = 1e-6 - self.intermediate_size = intermediate_size - self.temporal_merge_size = temporal_merge_size - self.rms_norm_eps = rms_norm_eps + embed_dim = AttributeError() + mlp_ratio = AttributeError() + temporal_patch_size = AttributeError() @auto_docstring(checkpoint="baidu/ERNIE-4.5-VL-28B-A3B-PT") -class Ernie4_5_VLMoeTextConfig(Ernie4_5_MoeConfig, PreTrainedConfig): +@strict(accept_kwargs=True) +class Ernie4_5_VLMoeTextConfig(Ernie4_5_MoeConfig): r""" use_bias (`bool`, *optional*, defaults to `False`): Whether to use a bias in any of the projections including mlp and attention for example @@ -177,72 +151,29 @@ class Ernie4_5_VLMoeTextConfig(Ernie4_5_MoeConfig, PreTrainedConfig): "layers.*.mlp.up_proj": "colwise", "layers.*.mlp.down_proj": "rowwise", } - - def __init__( - self, - vocab_size=103424, - hidden_size=2560, - intermediate_size=12288, - num_hidden_layers=28, - num_attention_heads=20, - num_key_value_heads=4, - hidden_act="silu", - max_position_embeddings=131072, - initializer_range=0.02, - rms_norm_eps=1e-5, - use_cache=True, - use_bias=False, - rope_parameters=None, - mlp_layer_types=None, - moe_intermediate_size=None, - moe_k=6, - moe_num_experts=64, - moe_num_shared_experts=2, - moe_norm_min=1e-12, - output_router_logits=False, - router_aux_loss_coef=0.001, - pad_token_id=None, - eos_token_id=None, - bos_token_id=None, - **kwargs, - ): - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.max_position_embeddings = max_position_embeddings - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.use_bias = use_bias - self.rope_parameters = rope_parameters - - # Default to MoE from the second layer and on - self.mlp_layer_types = mlp_layer_types + 
ignore_keys_at_rope_validation = {"mrope_section"} + + mlp_layer_types: list[str] | None = None + moe_intermediate_size: list[int] | None = None + pad_token_id: int | None = None + eos_token_id: int | list[int] | None = None + bos_token_id: int | None = None + moe_layer_end_index = AttributeError() + moe_layer_interval = AttributeError() + moe_layer_start_index = AttributeError() + + def __post_init__(self, **kwargs): if self.mlp_layer_types is None: self.mlp_layer_types = ["dense"] + ["sparse"] * (self.num_hidden_layers - 1) - layer_type_validation(self.mlp_layer_types, self.num_hidden_layers, attention=False) - self.moe_intermediate_size = moe_intermediate_size if self.moe_intermediate_size is None: self.moe_intermediate_size = [1536, 512] - self.moe_k = moe_k - self.moe_num_experts = moe_num_experts - self.moe_num_shared_experts = moe_num_shared_experts - self.moe_norm_min = moe_norm_min - self.output_router_logits = output_router_logits - self.router_aux_loss_coef = router_aux_loss_coef - self.pad_token_id = pad_token_id - self.eos_token_id = eos_token_id - self.bos_token_id = bos_token_id - PreTrainedConfig.__init__(ignore_keys_at_rope_validation={"mrope_section"}, **kwargs) + PreTrainedConfig.__post_init__(**kwargs) @auto_docstring(checkpoint="baidu/ERNIE-4.5-VL-28B-A3B-PT") +@strict(accept_kwargs=True) class Ernie4_5_VLMoeConfig(PreTrainedConfig): r""" image_start_token_id (`int`, *optional*, defaults to 101304): @@ -277,42 +208,28 @@ class Ernie4_5_VLMoeConfig(PreTrainedConfig): sub_configs = {"vision_config": Ernie4_5_VLMoeVisionConfig, "text_config": Ernie4_5_VLMoeTextConfig} keys_to_ignore_at_inference = ["past_key_values"] - def __init__( - self, - text_config=None, - vision_config=None, - image_start_token_id=101304, - image_end_token_id=101305, - image_token_id=100295, - video_start_token_id=101306, - video_end_token_id=101307, - video_token_id=103367, - tie_word_embeddings=True, - **kwargs, - ): - if isinstance(vision_config, dict): - self.vision_config = self.sub_configs["vision_config"](**vision_config) - elif isinstance(vision_config, Ernie4_5_VLMoeVisionConfig): - self.vision_config = vision_config - elif vision_config is None: + text_config: dict | PreTrainedConfig | None = None + vision_config: dict | PreTrainedConfig | None = None + image_start_token_id: int = 101304 + image_end_token_id: int = 101305 + image_token_id: int = 100295 + video_start_token_id: int = 101306 + video_end_token_id: int = 101307 + video_token_id: int = 103367 + tie_word_embeddings: bool = True + + def __post_init__(self, **kwargs): + if isinstance(self.vision_config, dict): + self.vision_config = self.sub_configs["vision_config"](**self.vision_config) + elif self.vision_config is None: self.vision_config = self.sub_configs["vision_config"]() - if isinstance(text_config, dict): - self.text_config = self.sub_configs["text_config"](**text_config) - elif isinstance(text_config, Ernie4_5_VLMoeTextConfig): - self.text_config = text_config - elif text_config is None: + if isinstance(self.text_config, dict): + self.text_config = self.sub_configs["text_config"](**self.text_config) + elif self.text_config is None: self.text_config = self.sub_configs["text_config"](**kwargs) - self.image_start_token_id = image_start_token_id - self.image_end_token_id = image_end_token_id - self.image_token_id = image_token_id - self.video_start_token_id = video_start_token_id - self.video_end_token_id = video_end_token_id - self.video_token_id = video_token_id - self.tie_word_embeddings = tie_word_embeddings - - 
super().__init__(**kwargs) + super().__post_init__(**kwargs) class Ernie4_5_VLMoeTextRotaryEmbedding(nn.Module): @@ -740,7 +657,7 @@ class Ernie4_5VLVisionMLP(VisionMlp): class Ernie4_5_VLMoePatchEmbed(Qwen2_5_VisionPatchEmbed): def __init__( self, - patch_size: int = 14, + patch_size: int | list[int] | tuple[int, int] = 14, in_channels: int = 3, embed_dim: int = 1152, ) -> None: diff --git a/src/transformers/models/esm/configuration_esm.py b/src/transformers/models/esm/configuration_esm.py index e310a187ff5e..b3b1c76b9f9e 100644 --- a/src/transformers/models/esm/configuration_esm.py +++ b/src/transformers/models/esm/configuration_esm.py @@ -13,17 +13,20 @@ # limitations under the License. """ESM model configuration""" -from dataclasses import asdict, dataclass +from typing import Union + +from huggingface_hub.dataclasses import strict from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring, logging +from ...utils.type_validators import interval, is_divisible_by logger = logging.get_logger(__name__) -@dataclass -class StructureModuleConfig: +@strict(accept_kwargs=True) +class StructureModuleConfig(PreTrainedConfig): """ Args: sequence_dim: @@ -58,49 +61,58 @@ class StructureModuleConfig: Large number used for attention masking """ - sequence_dim: int = 384 - pairwise_dim: int = 128 - ipa_dim: int = 16 - resnet_dim: int = 128 - num_heads_ipa: int = 12 - num_qk_points: int = 4 - num_v_points: int = 8 - dropout_rate: float = 0.1 - num_blocks: int = 8 - num_transition_layers: int = 1 - num_resnet_blocks: int = 2 - num_angles: int = 7 - trans_scale_factor: int = 10 - epsilon: float = 1e-8 - inf: float = 1e5 - - def to_dict(self): - return asdict(self) - - -@dataclass -class TrunkConfig: - num_blocks: int = 48 - sequence_state_dim: int = 1024 - pairwise_state_dim: int = 128 - sequence_head_width: int = 32 - pairwise_head_width: int = 32 - position_bins: int = 32 - dropout: float = 0 - layer_drop: float = 0 - cpu_grad_checkpoint: bool = False - max_recycles: int = 4 + sequence_dim: int | None = 384 + pairwise_dim: int | None = 128 + ipa_dim: int | None = 16 + resnet_dim: int | None = 128 + num_heads_ipa: int | None = 12 + num_qk_points: int | None = 4 + num_v_points: int | None = 8 + dropout_rate: float | None = 0.1 + num_blocks: int | None = 8 + num_transition_layers: int | None = 1 + num_resnet_blocks: int | None = 2 + num_angles: int | None = 7 + trans_scale_factor: int | None = 10 + epsilon: float | None = 1e-8 + inf: float | None = 1e5 + + +@strict(accept_kwargs=True) +class TrunkConfig(PreTrainedConfig): + sub_configs = {"structure_module": StructureModuleConfig} + + num_blocks: int | None = 48 + sequence_state_dim: int | None = 1024 + pairwise_state_dim: int | None = is_divisible_by(divisor=2)(default=128) + sequence_head_width: int | None = 32 + pairwise_head_width: int | None = 32 + position_bins: int | None = 32 + dropout: float | int | None = interval(max=0.4)(default=0.0) + layer_drop: float | int | None = 0.0 + cpu_grad_checkpoint: bool | None = False + max_recycles: int | None = interval(min=0)(default=4) chunk_size: int | None = 128 - structure_module: "StructureModuleConfig" = None + structure_module: Union[dict, "StructureModuleConfig"] | None = None - def __post_init__(self): + def __post_init__(self, **kwargs): if self.structure_module is None: self.structure_module = StructureModuleConfig() elif isinstance(self.structure_module, dict): self.structure_module = StructureModuleConfig(**self.structure_module) + super().__post_init__(**kwargs) - if 
self.max_recycles <= 0: - raise ValueError(f"`max_recycles` should be positive, got {self.max_recycles}.") + def validate_architecture(self): + if self.sequence_state_dim % self.sequence_head_width != 0: + raise ValueError( + "`sequence_state_dim` should be a round multiple of `sequence_head_width`, got" + f" {self.sequence_state_dim} and {self.sequence_head_width}." + ) + if self.pairwise_state_dim % self.pairwise_head_width != 0: + raise ValueError( + "`pairwise_state_dim` should be a round multiple of `pairwise_head_width`, got" + f" {self.pairwise_state_dim} and {self.pairwise_head_width}." + ) sequence_num_heads = self.sequence_state_dim // self.sequence_head_width pairwise_num_heads = self.pairwise_state_dim // self.pairwise_head_width @@ -115,58 +127,33 @@ def __post_init__(self): "`pairwise_state_dim` should be equal to `pairwise_num_heads * pairwise_head_width, got" f" {self.pairwise_state_dim} != {pairwise_num_heads} * {self.pairwise_head_width}." ) - if self.pairwise_state_dim % 2 != 0: - raise ValueError(f"`pairwise_state_dim` should be even, got {self.pairwise_state_dim}.") - - if self.dropout >= 0.4: - raise ValueError(f"`dropout` should not be greater than 0.4, got {self.dropout}.") - - def to_dict(self): - """ - Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. - Returns: - `dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, - """ - output = asdict(self) - output["structure_module"] = self.structure_module.to_dict() - return output +@strict(accept_kwargs=True) +class EsmFoldConfig(PreTrainedConfig): + sub_configs = {"trunk": TrunkConfig} -@dataclass -class EsmFoldConfig: esm_type: str | None = None - fp16_esm: bool = True - use_esm_attn_map: bool = False - esm_ablate_pairwise: bool = False - esm_ablate_sequence: bool = False - esm_input_dropout: float = 0 - - embed_aa: bool = True - bypass_lm: bool = False - - lddt_head_hid_dim: int = 128 - trunk: "TrunkConfig" = None - - def __post_init__(self): + fp16_esm: bool | None = True + use_esm_attn_map: bool | None = False + esm_ablate_pairwise: bool | None = False + esm_ablate_sequence: bool | None = False + esm_input_dropout: float | int | None = 0.0 + embed_aa: bool | None = True + bypass_lm: bool | None = False + lddt_head_hid_dim: int | None = 128 + trunk: Union[dict, "TrunkConfig"] | None = None + + def __post_init__(self, **kwargs): if self.trunk is None: self.trunk = TrunkConfig() elif isinstance(self.trunk, dict): self.trunk = TrunkConfig(**self.trunk) - - def to_dict(self): - """ - Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
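# A minimal sketch (not from this PR) of the dict-to-sub-config coercion pattern that the
# EsmFoldConfig/TrunkConfig __post_init__ hooks above implement. Stdlib dataclasses and
# hypothetical class names only, so it runs without transformers or huggingface_hub.
from dataclasses import dataclass


@dataclass
class InnerConfig:
    hidden_size: int = 128


@dataclass
class OuterConfig:
    # Accept a ready-made InnerConfig, a plain dict, or None.
    inner: dict | InnerConfig | None = None

    def __post_init__(self):
        # Mirror the `is None` / `isinstance(..., dict)` branches above:
        # fall back to defaults, or promote a dict to the typed sub-config.
        if self.inner is None:
            self.inner = InnerConfig()
        elif isinstance(self.inner, dict):
            self.inner = InnerConfig(**self.inner)


assert OuterConfig(inner={"hidden_size": 64}).inner.hidden_size == 64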
- - Returns: - `dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, - """ - output = asdict(self) - output["trunk"] = self.trunk.to_dict() - return output + super().__post_init__(**kwargs) @auto_docstring(checkpoint="facebook/esm-1b") +@strict(accept_kwargs=True) class EsmConfig(PreTrainedConfig): r""" is_folding_model (`bool`, defaults to `False`): @@ -203,84 +190,48 @@ class EsmConfig(PreTrainedConfig): model_type = "esm" sub_configs = {"esmfold_config": EsmFoldConfig} - def __init__( - self, - vocab_size=None, - mask_token_id=None, - pad_token_id=None, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=1026, - initializer_range=0.02, - layer_norm_eps=1e-12, - position_embedding_type="absolute", - use_cache=True, - emb_layer_norm_before=None, - token_dropout=False, - is_folding_model=False, - esmfold_config=None, - vocab_list=None, - is_decoder=False, - add_cross_attention=False, - tie_word_embeddings=True, - **kwargs, - ): - super().__init__(**kwargs) - - self.is_decoder = is_decoder - self.add_cross_attention = add_cross_attention - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.mask_token_id = mask_token_id - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.position_embedding_type = position_embedding_type - self.use_cache = use_cache - self.emb_layer_norm_before = emb_layer_norm_before - self.token_dropout = token_dropout - self.is_folding_model = is_folding_model - if is_folding_model: - if esmfold_config is None: + vocab_size: int | None = None + mask_token_id: int | None = None + pad_token_id: int | None = None + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_dropout_prob: float | None = 0.1 + attention_probs_dropout_prob: float | None = 0.1 + max_position_embeddings: int = 1026 + initializer_range: float = 0.02 + layer_norm_eps: float | None = 1e-12 + position_embedding_type: str | None = "absolute" + use_cache: bool = True + emb_layer_norm_before: bool | None = None + token_dropout: bool | None = False + is_folding_model: bool | None = False + esmfold_config: dict | EsmFoldConfig | None = None + vocab_list: list[str] | tuple[str, ...] 
| None = None + is_decoder: bool | None = False + add_cross_attention: bool | None = False + tie_word_embeddings: bool = True + + def __post_init__(self, **kwargs): + if self.is_folding_model: + if self.esmfold_config is None: logger.info("No esmfold_config supplied for folding model, using default values.") - esmfold_config = EsmFoldConfig() - elif isinstance(esmfold_config, dict): - esmfold_config = EsmFoldConfig(**esmfold_config) - self.esmfold_config = esmfold_config - if vocab_list is None: + self.esmfold_config = EsmFoldConfig() + elif isinstance(self.esmfold_config, dict): + self.esmfold_config = EsmFoldConfig(**self.esmfold_config) + + if self.vocab_list is None: logger.warning("No vocab_list supplied for folding model, assuming the ESM-2 vocabulary!") self.vocab_list = get_default_vocab_list() - else: - self.vocab_list = vocab_list else: self.esmfold_config = None self.vocab_list = None + if self.esmfold_config is not None and getattr(self.esmfold_config, "use_esm_attn_map", False): raise ValueError("The HuggingFace port of ESMFold does not support use_esm_attn_map at this time!") - # TODO: update ESM to inherit from PreTrainedConfig - def to_dict(self): - """ - Serializes this instance to a Python dictionary. Override the default [`~PreTrainedConfig.to_dict`]. - - Returns: - `dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, - """ - output = super().to_dict() - if isinstance(self.esmfold_config, EsmFoldConfig): - output["esmfold_config"] = self.esmfold_config.to_dict() - return output + super().__post_init__(**kwargs) def get_default_vocab_list(): diff --git a/src/transformers/models/eurobert/configuration_eurobert.py b/src/transformers/models/eurobert/configuration_eurobert.py index 249675a62dbe..cc03bc884fcd 100644 --- a/src/transformers/models/eurobert/configuration_eurobert.py +++ b/src/transformers/models/eurobert/configuration_eurobert.py @@ -47,63 +47,34 @@ class EuroBertConfig(LlamaConfig): model_type = "eurobert" - def __init__( - self, - vocab_size=128256, - hidden_size=768, - intermediate_size=3072, - num_hidden_layers=12, - num_attention_heads=12, - num_key_value_heads=None, - hidden_act="silu", - max_position_embeddings=8192, - initializer_range=0.02, - rms_norm_eps=1e-05, - bos_token_id=128000, - eos_token_id=128001, - pad_token_id=128001, - mask_token_id=128002, - pretraining_tp=1, - tie_word_embeddings=False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias=False, - attention_dropout=0.0, - mlp_bias=False, - head_dim=None, - classifier_pooling="late", - **kwargs, - ): - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - kwargs.pop("use_cache", None) # use_cache=True is not supported for EuroBert + vocab_size: int = 128256 + hidden_size: int = 768 + intermediate_size: int = 3072 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + num_key_value_heads: int | None = None + hidden_act: str = "silu" + max_position_embeddings: int = 8192 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-05 + bos_token_id: int | None = 128000 + eos_token_id: int | None = 128001 + pad_token_id: int | None = 128001 + mask_token_id: int = 128002 + pretraining_tp: int = 1 + tie_word_embeddings: int = False + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + attention_dropout: int | float = 0.0 + mlp_bias: bool = False + head_dim: int | None = None + classifier_pooling: str = "late" + is_causal: bool = False - 
super().__init__( - vocab_size=vocab_size, - hidden_size=hidden_size, - intermediate_size=intermediate_size, - num_hidden_layers=num_hidden_layers, - num_attention_heads=num_attention_heads, - num_key_value_heads=num_key_value_heads, - hidden_act=hidden_act, - max_position_embeddings=max_position_embeddings, - initializer_range=initializer_range, - rms_norm_eps=rms_norm_eps, - use_cache=False, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - pad_token_id=pad_token_id, - pretraining_tp=pretraining_tp, - tie_word_embeddings=tie_word_embeddings, - rope_parameters=rope_parameters, - attention_bias=attention_bias, - attention_dropout=attention_dropout, - mlp_bias=mlp_bias, - head_dim=head_dim, - **kwargs, - ) - self.mask_token_id = mask_token_id - self.classifier_pooling = classifier_pooling - self.is_causal = False + def __post_init__(self, **kwargs): + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads + super().__post_init__(**kwargs) __all__ = ["EuroBertConfig"] diff --git a/src/transformers/models/eurobert/modular_eurobert.py b/src/transformers/models/eurobert/modular_eurobert.py index 7b212043f1c8..25a409ed7468 100644 --- a/src/transformers/models/eurobert/modular_eurobert.py +++ b/src/transformers/models/eurobert/modular_eurobert.py @@ -50,63 +50,34 @@ class EuroBertConfig(LlamaConfig): model_type = "eurobert" - def __init__( - self, - vocab_size=128256, - hidden_size=768, - intermediate_size=3072, - num_hidden_layers=12, - num_attention_heads=12, - num_key_value_heads=None, - hidden_act="silu", - max_position_embeddings=8192, - initializer_range=0.02, - rms_norm_eps=1e-05, - bos_token_id=128000, - eos_token_id=128001, - pad_token_id=128001, - mask_token_id=128002, - pretraining_tp=1, - tie_word_embeddings=False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias=False, - attention_dropout=0.0, - mlp_bias=False, - head_dim=None, - classifier_pooling="late", - **kwargs, - ): - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - kwargs.pop("use_cache", None) # use_cache=True is not supported for EuroBert - - super().__init__( - vocab_size=vocab_size, - hidden_size=hidden_size, - intermediate_size=intermediate_size, - num_hidden_layers=num_hidden_layers, - num_attention_heads=num_attention_heads, - num_key_value_heads=num_key_value_heads, - hidden_act=hidden_act, - max_position_embeddings=max_position_embeddings, - initializer_range=initializer_range, - rms_norm_eps=rms_norm_eps, - use_cache=False, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - pad_token_id=pad_token_id, - pretraining_tp=pretraining_tp, - tie_word_embeddings=tie_word_embeddings, - rope_parameters=rope_parameters, - attention_bias=attention_bias, - attention_dropout=attention_dropout, - mlp_bias=mlp_bias, - head_dim=head_dim, - **kwargs, - ) - self.mask_token_id = mask_token_id - self.classifier_pooling = classifier_pooling - self.is_causal = False + vocab_size: int = 128256 + hidden_size: int = 768 + intermediate_size: int = 3072 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + num_key_value_heads: int | None = None + hidden_act: str = "silu" + max_position_embeddings: int = 8192 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-05 + bos_token_id: int | None = 128000 + eos_token_id: int | None = 128001 + pad_token_id: int | None = 128001 + mask_token_id: int = 128002 + pretraining_tp: int = 1 + tie_word_embeddings: int = False + rope_parameters: RopeParameters | dict 
| None = None + attention_bias: bool = False + attention_dropout: int | float = 0.0 + mlp_bias: bool = False + head_dim: int | None = None + classifier_pooling: str = "late" + is_causal: bool = False + + def __post_init__(self, **kwargs): + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads + super().__post_init__(**kwargs) class EuroBertRMSNorm(LlamaRMSNorm): diff --git a/src/transformers/models/evolla/configuration_evolla.py b/src/transformers/models/evolla/configuration_evolla.py index 227c4407f03d..7b14c61964d6 100644 --- a/src/transformers/models/evolla/configuration_evolla.py +++ b/src/transformers/models/evolla/configuration_evolla.py @@ -13,6 +13,8 @@ # limitations under the License. """Evolla model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring, logging @@ -22,6 +24,7 @@ @auto_docstring(checkpoint="westlake-repl/Evolla-10B-hf") +@strict(accept_kwargs=True) class SaProtConfig(PreTrainedConfig): r""" mask_token_id (`int`, *optional*, defaults to 4): @@ -33,49 +36,27 @@ class SaProtConfig(PreTrainedConfig): token_dropout (`bool`, *optional*, defaults to `True`): Whether to apply dropout to the tokens in the protein sequence model.""" - def __init__( - self, - vocab_size=446, - mask_token_id=4, - pad_token_id=1, - hidden_size=1280, - num_hidden_layers=33, - num_attention_heads=20, - intermediate_size=5120, - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=1026, - initializer_range=0.02, - layer_norm_eps=1e-05, - position_embedding_type="rotary", - emb_layer_norm_before=False, - token_dropout=True, - is_decoder=False, - add_cross_attention=False, - **kwargs, - ): - super().__init__(**kwargs) - - self.pad_token_id = pad_token_id - self.mask_token_id = mask_token_id - self.is_decoder = is_decoder - self.add_cross_attention = add_cross_attention - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.position_embedding_type = position_embedding_type - self.emb_layer_norm_before = emb_layer_norm_before - self.token_dropout = token_dropout + vocab_size: int = 446 + mask_token_id: int = 4 + pad_token_id: int = 1 + hidden_size: int = 1280 + num_hidden_layers: int = 33 + num_attention_heads: int = 20 + intermediate_size: int = 5120 + hidden_dropout_prob: float = 0.1 + attention_probs_dropout_prob: float = 0.1 + max_position_embeddings: int = 1026 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-05 + position_embedding_type: str = "rotary" + emb_layer_norm_before: bool = False + token_dropout: bool = True + is_decoder: bool = False + add_cross_attention: bool = False @auto_docstring(checkpoint="westlake-repl/Evolla-10B-hf") +@strict(accept_kwargs=True) class EvollaConfig(PreTrainedConfig): r""" protein_encoder_config (`dict`, *optional*): @@ -118,81 +99,45 @@ class EvollaConfig(PreTrainedConfig): sub_configs = {"protein_encoder_config": SaProtConfig} default_theta = 500000.0 - def __init__( - self, - protein_encoder_config: dict | None = None, - 
vocab_size: int | None = 128256, # llama vocab size - hidden_size: int | None = 4096, # llama hidden size - intermediate_size: int | None = 14336, # llama intermediate size - num_hidden_layers: int | None = 32, # llama num layers - num_attention_heads: int | None = 32, # llama num heads - num_key_value_heads: int | None = 8, # llama num key-value heads - hidden_act: str | None = "silu", # llama activation function - max_position_embeddings: int | None = 8192, # llama rope max length - rms_norm_eps: int | None = 1e-05, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - mlp_bias: bool | None = False, - aligner_ffn_mult: int | None = 4, - aligner_enable_bias: bool | None = True, - aligner_attention_probs_dropout_prob: float | None = 0.1, - aligner_num_add_layers: int | None = 8, - resampler_depth: int | None = 6, - resampler_dim_head: int | None = 64, - resampler_heads: int | None = 8, - resampler_num_latents: int | None = 64, - resampler_ff_mult: int | None = 4, - initializer_range: float | None = 0.02, - pad_token_id: int | None = None, - bos_token_id: int | None = 128000, - eos_token_id: int | None = 128009, - use_cache: bool | None = False, - tie_word_embeddings: bool | None = False, - is_decoder: bool | None = False, - add_cross_attention: bool | None = False, - **kwargs, - ): - self.is_decoder = is_decoder - self.add_cross_attention = add_cross_attention - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.max_position_embeddings = max_position_embeddings - self.rms_norm_eps = rms_norm_eps - self.tie_word_embeddings = tie_word_embeddings - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.mlp_bias = mlp_bias - self.aligner_ffn_mult = aligner_ffn_mult - self.aligner_enable_bias = aligner_enable_bias - self.aligner_attention_probs_dropout_prob = aligner_attention_probs_dropout_prob - self.aligner_num_add_layers = aligner_num_add_layers - self.use_cache = use_cache - self.initializer_range = initializer_range - - self.resampler_depth = resampler_depth - self.resampler_dim_head = resampler_dim_head - self.resampler_heads = resampler_heads - self.resampler_num_latents = resampler_num_latents - self.resampler_ff_mult = resampler_ff_mult - self.rope_parameters = rope_parameters - - # Subconfig - if protein_encoder_config is None: - protein_encoder_config = {} + protein_encoder_config: dict | PreTrainedConfig | None = None + vocab_size: int = 128256 # llama vocab size + hidden_size: int = 4096 # llama hidden size + intermediate_size: int = 14336 # llama intermediate size + num_hidden_layers: int = 32 # llama num layers + num_attention_heads: int = 32 # llama num heads + num_key_value_heads: int | None = 8 # llama num key-value heads + hidden_act: str = "silu" # llama activation function + max_position_embeddings: int = 8192 # llama rope max length + rms_norm_eps: float = 1e-05 + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + attention_dropout: float | int | None = 0.0 + mlp_bias: bool = False + aligner_ffn_mult: int | None = 4 + aligner_enable_bias: bool | None = True + aligner_attention_probs_dropout_prob: float | None = 0.1 + aligner_num_add_layers: int | None = 8 + 
resampler_depth: int | None = 6 + resampler_dim_head: int | None = 64 + resampler_heads: int | None = 8 + resampler_num_latents: int | None = 64 + resampler_ff_mult: int | None = 4 + initializer_range: float = 0.02 + pad_token_id: int | None = None + bos_token_id: int | None = 128000 + eos_token_id: int | list[int] | None = 128009 + use_cache: bool = False + tie_word_embeddings: bool = False + is_decoder: bool | None = False + add_cross_attention: bool | None = False + + def __post_init__(self, **kwargs): + if self.protein_encoder_config is None: + self.protein_encoder_config = SaProtConfig() logger.info("`protein_encoder_config` is `None`. Initializing the `SaProtConfig` with default values.") - self.protein_encoder_config = SaProtConfig(**protein_encoder_config) - - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - super().__init__(**kwargs) + elif isinstance(self.protein_encoder_config, dict): + self.protein_encoder_config = SaProtConfig(**self.protein_encoder_config) + super().__post_init__(**kwargs) __all__ = ["EvollaConfig"] diff --git a/src/transformers/models/exaone4/configuration_exaone4.py b/src/transformers/models/exaone4/configuration_exaone4.py index 1076806a6740..8261103ab15d 100644 --- a/src/transformers/models/exaone4/configuration_exaone4.py +++ b/src/transformers/models/exaone4/configuration_exaone4.py @@ -18,12 +18,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from ...configuration_utils import PreTrainedConfig, layer_type_validation +from huggingface_hub.dataclasses import strict + +from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring @auto_docstring(checkpoint="LGAI-EXAONE/EXAONE-4.0-32B") +@strict(accept_kwargs=True) class Exaone4Config(PreTrainedConfig): r""" sliding_window_pattern (`str`, *optional*): @@ -73,64 +76,39 @@ class Exaone4Config(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size: int | None = 102400, - hidden_size: int | None = 4096, - intermediate_size: int | None = 16384, - num_hidden_layers: int | None = 32, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = 32, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 2048, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-5, - use_cache: bool | None = True, - bos_token_id: int | None = 0, - eos_token_id: int | None = 2, - pad_token_id: int | None = None, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_dropout: float | None = 0.0, - sliding_window: int | None = 4096, - sliding_window_pattern: int | None = 4, - layer_types: list[str] | None = None, - **kwargs, - ): - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.max_position_embeddings = max_position_embeddings - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_dropout = attention_dropout - self.sliding_window = 
sliding_window - self.sliding_window_pattern = sliding_window_pattern - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.tie_word_embeddings = tie_word_embeddings + vocab_size: int = 102400 + hidden_size: int = 4096 + intermediate_size: int = 16384 + num_hidden_layers: int = 32 + num_attention_heads: int = 32 + num_key_value_heads: int = 32 + hidden_act: str = "silu" + max_position_embeddings: int = 2048 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-5 + use_cache: bool = True + bos_token_id: int | None = 0 + eos_token_id: int | list[int] | None = 2 + pad_token_id: int | None = None + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + attention_dropout: float | int = 0.0 + sliding_window: int | None = 4096 + sliding_window_pattern: str | int | None = 4 + layer_types: list[str] | None = None - self.layer_types = layer_types + def __post_init__(self, **kwargs): if self.sliding_window is None: - sliding_window_pattern = 0 + self.sliding_window_pattern = 0 if self.layer_types is None: self.layer_types = [ "sliding_attention" - if ((i + 1) % (sliding_window_pattern) != 0 and i < self.num_hidden_layers) + if ((i + 1) % (self.sliding_window_pattern) != 0 and i < self.num_hidden_layers) else "full_attention" for i in range(self.num_hidden_layers) ] - layer_type_validation(self.layer_types, self.num_hidden_layers) - - self.rope_parameters = rope_parameters - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["Exaone4Config"] diff --git a/src/transformers/models/exaone4/modular_exaone4.py b/src/transformers/models/exaone4/modular_exaone4.py index a175a04c9ab8..3907be48aa6f 100644 --- a/src/transformers/models/exaone4/modular_exaone4.py +++ b/src/transformers/models/exaone4/modular_exaone4.py @@ -17,10 +17,11 @@ from collections.abc import Callable import torch +from huggingface_hub.dataclasses import strict from torch import nn from ...cache_utils import Cache, DynamicCache -from ...configuration_utils import PreTrainedConfig, layer_type_validation +from ...configuration_utils import PreTrainedConfig from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask from ...modeling_outputs import ( BaseModelOutputWithPast, @@ -54,6 +55,7 @@ @auto_docstring(checkpoint="LGAI-EXAONE/EXAONE-4.0-32B") +@strict(accept_kwargs=True) class Exaone4Config(PreTrainedConfig): r""" sliding_window_pattern (`str`, *optional*): @@ -103,64 +105,39 @@ class Exaone4Config(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size: int | None = 102400, - hidden_size: int | None = 4096, - intermediate_size: int | None = 16384, - num_hidden_layers: int | None = 32, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = 32, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 2048, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-5, - use_cache: bool | None = True, - bos_token_id: int | None = 0, - eos_token_id: int | None = 2, - pad_token_id: int | None = None, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_dropout: float | None = 0.0, - sliding_window: int | None = 4096, - sliding_window_pattern: int | None = 4, - layer_types: list[str] | None = None, - **kwargs, - ): - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = 
num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.max_position_embeddings = max_position_embeddings - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_dropout = attention_dropout - self.sliding_window = sliding_window - self.sliding_window_pattern = sliding_window_pattern - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.tie_word_embeddings = tie_word_embeddings - - self.layer_types = layer_types + vocab_size: int = 102400 + hidden_size: int = 4096 + intermediate_size: int = 16384 + num_hidden_layers: int = 32 + num_attention_heads: int = 32 + num_key_value_heads: int = 32 + hidden_act: str = "silu" + max_position_embeddings: int = 2048 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-5 + use_cache: bool = True + bos_token_id: int | None = 0 + eos_token_id: int | list[int] | None = 2 + pad_token_id: int | None = None + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + attention_dropout: float | int = 0.0 + sliding_window: int | None = 4096 + sliding_window_pattern: str | int | None = 4 + layer_types: list[str] | None = None + + def __post_init__(self, **kwargs): if self.sliding_window is None: - sliding_window_pattern = 0 + self.sliding_window_pattern = 0 if self.layer_types is None: self.layer_types = [ "sliding_attention" - if ((i + 1) % (sliding_window_pattern) != 0 and i < self.num_hidden_layers) + if ((i + 1) % (self.sliding_window_pattern) != 0 and i < self.num_hidden_layers) else "full_attention" for i in range(self.num_hidden_layers) ] - layer_type_validation(self.layer_types, self.num_hidden_layers) - - self.rope_parameters = rope_parameters - super().__init__(**kwargs) + super().__post_init__(**kwargs) class Exaone4RMSNorm(LlamaRMSNorm): diff --git a/src/transformers/models/exaone_moe/configuration_exaone_moe.py b/src/transformers/models/exaone_moe/configuration_exaone_moe.py index 05d998e8595c..788c58044652 100644 --- a/src/transformers/models/exaone_moe/configuration_exaone_moe.py +++ b/src/transformers/models/exaone_moe/configuration_exaone_moe.py @@ -18,11 +18,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from ...configuration_utils import PreTrainedConfig, layer_type_validation +from huggingface_hub.dataclasses import strict + +from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring @auto_docstring(checkpoint="LGAI-EXAONE/K-EXAONE-236B-A23B") +@strict(accept_kwargs=True) class ExaoneMoeConfig(PreTrainedConfig): r""" n_group (`int`, *optional*, defaults to 1): @@ -79,90 +82,53 @@ class ExaoneMoeConfig(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size=102400, - hidden_size=4096, - intermediate_size=16384, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=32, - hidden_act="silu", - max_position_embeddings=2048, - initializer_range=0.02, - rms_norm_eps=1e-5, - use_cache=True, - bos_token_id=1, - eos_token_id=53, - pad_token_id=0, - tie_word_embeddings=False, - rope_parameters=None, - attention_dropout=0.0, - sliding_window=4096, - sliding_window_pattern=4, - layer_types=None, - mlp_layer_types=None, - first_k_dense_replace=1, - moe_intermediate_size=1024, - num_experts=64, - num_experts_per_tok=8, - num_shared_experts=1, - norm_topk_prob=True, - routed_scaling_factor=2.5, - n_group=1, - topk_group=1, - **kwargs, - ): - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.max_position_embeddings = max_position_embeddings - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_dropout = attention_dropout - self.sliding_window = sliding_window - self.sliding_window_pattern = sliding_window_pattern - self.first_k_dense_replace = first_k_dense_replace - self.moe_intermediate_size = moe_intermediate_size - self.num_experts = num_experts - self.num_experts_per_tok = num_experts_per_tok - self.num_shared_experts = num_shared_experts - self.norm_topk_prob = norm_topk_prob - self.routed_scaling_factor = routed_scaling_factor - self.n_group = n_group - self.topk_group = topk_group - self.rope_parameters = rope_parameters + vocab_size: int = 102400 + hidden_size: int = 4096 + intermediate_size: int = 16384 + num_hidden_layers: int = 32 + num_attention_heads: int = 32 + num_key_value_heads: int = 32 + hidden_act: str = "silu" + max_position_embeddings: int = 2048 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-5 + use_cache: bool = True + bos_token_id: int | None = 1 + eos_token_id: int | None = 53 + pad_token_id: int | None = 0 + tie_word_embeddings: bool = False + rope_parameters: dict | None = None + attention_dropout: float | int = 0.0 + sliding_window: int = 4096 + sliding_window_pattern: str | int | None = 4 + layer_types: list[str] | None = None + mlp_layer_types: list[str] | None = None + first_k_dense_replace: int = 1 + moe_intermediate_size: int = 1024 + num_experts: int = 64 + num_experts_per_tok: int = 8 + num_shared_experts: int = 1 + norm_topk_prob: bool = True + routed_scaling_factor: float = 2.5 + n_group: int = 1 + topk_group: int = 1 - self.layer_types = layer_types + def __post_init__(self, **kwargs): + if self.mlp_layer_types is None: + self.mlp_layer_types = [ + "dense" if i < self.first_k_dense_replace else "sparse" for i in range(self.num_hidden_layers) + ] if self.sliding_window is None: - sliding_window_pattern = 0 + self.sliding_window_pattern = 0 if 
self.layer_types is None: self.layer_types = [ "sliding_attention" - if ((i + 1) % (sliding_window_pattern) != 0 and i < self.num_hidden_layers) + if ((i + 1) % (self.sliding_window_pattern) != 0 and i < self.num_hidden_layers) else "full_attention" for i in range(self.num_hidden_layers) ] - layer_type_validation(self.layer_types) - - self.mlp_layer_types = mlp_layer_types - if self.mlp_layer_types is None: - self.mlp_layer_types = [ - "dense" if i < self.first_k_dense_replace else "sparse" for i in range(self.num_hidden_layers) - ] - layer_type_validation(self.mlp_layer_types, self.num_hidden_layers, attention=False) - - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.tie_word_embeddings = tie_word_embeddings - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["ExaoneMoeConfig"] diff --git a/src/transformers/models/exaone_moe/modular_exaone_moe.py b/src/transformers/models/exaone_moe/modular_exaone_moe.py index 2de2cfc5d26d..3b70ccca4f70 100644 --- a/src/transformers/models/exaone_moe/modular_exaone_moe.py +++ b/src/transformers/models/exaone_moe/modular_exaone_moe.py @@ -16,10 +16,10 @@ import torch import torch.nn as nn +from huggingface_hub.dataclasses import strict from ... import initialization as init from ...cache_utils import Cache -from ...configuration_utils import PreTrainedConfig, layer_type_validation from ...modeling_outputs import CausalLMOutputWithPast from ...modeling_utils import PreTrainedModel from ...processing_utils import Unpack @@ -43,6 +43,7 @@ @auto_docstring(checkpoint="LGAI-EXAONE/K-EXAONE-236B-A23B") +@strict(accept_kwargs=True) class ExaoneMoeConfig(Exaone4Config): r""" n_group (`int`, *optional*, defaults to 1): @@ -79,92 +80,44 @@ class ExaoneMoeConfig(Exaone4Config): >>> configuration = model.config ```""" - model_type = "exaone_moe" - - def __init__( - self, - vocab_size=102400, - hidden_size=4096, - intermediate_size=16384, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=32, - hidden_act="silu", - max_position_embeddings=2048, - initializer_range=0.02, - rms_norm_eps=1e-5, - use_cache=True, - bos_token_id=1, - eos_token_id=53, - pad_token_id=0, - tie_word_embeddings=False, - rope_parameters=None, - attention_dropout=0.0, - sliding_window=4096, - sliding_window_pattern=4, - layer_types=None, - mlp_layer_types=None, - first_k_dense_replace=1, - moe_intermediate_size=1024, - num_experts=64, - num_experts_per_tok=8, - num_shared_experts=1, - norm_topk_prob=True, - routed_scaling_factor=2.5, - n_group=1, - topk_group=1, - **kwargs, - ): - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.max_position_embeddings = max_position_embeddings - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_dropout = attention_dropout - self.sliding_window = sliding_window - self.sliding_window_pattern = sliding_window_pattern - self.first_k_dense_replace = first_k_dense_replace - self.moe_intermediate_size = moe_intermediate_size - self.num_experts = num_experts - self.num_experts_per_tok = num_experts_per_tok - self.num_shared_experts = num_shared_experts - self.norm_topk_prob = norm_topk_prob - self.routed_scaling_factor = routed_scaling_factor - self.n_group = 
n_group - self.topk_group = topk_group - self.rope_parameters = rope_parameters - - self.layer_types = layer_types - if self.sliding_window is None: - sliding_window_pattern = 0 - if self.layer_types is None: - self.layer_types = [ - "sliding_attention" - if ((i + 1) % (sliding_window_pattern) != 0 and i < self.num_hidden_layers) - else "full_attention" - for i in range(self.num_hidden_layers) - ] - layer_type_validation(self.layer_types) - - self.mlp_layer_types = mlp_layer_types + vocab_size: int = 102400 + hidden_size: int = 4096 + intermediate_size: int = 16384 + num_hidden_layers: int = 32 + num_attention_heads: int = 32 + num_key_value_heads: int = 32 + hidden_act: str = "silu" + max_position_embeddings: int = 2048 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-5 + use_cache: bool = True + bos_token_id: int | None = 1 + eos_token_id: int | None = 53 + pad_token_id: int | None = 0 + tie_word_embeddings: bool = False + rope_parameters: dict | None = None + attention_dropout: float | int = 0.0 + sliding_window: int = 4096 + sliding_window_pattern: str | int | None = 4 + layer_types: list[str] | None = None + mlp_layer_types: list[str] | None = None + first_k_dense_replace: int = 1 + moe_intermediate_size: int = 1024 + num_experts: int = 64 + num_experts_per_tok: int = 8 + num_shared_experts: int = 1 + norm_topk_prob: bool = True + routed_scaling_factor: float = 2.5 + n_group: int = 1 + topk_group: int = 1 + + def __post_init__(self, **kwargs): if self.mlp_layer_types is None: self.mlp_layer_types = [ "dense" if i < self.first_k_dense_replace else "sparse" for i in range(self.num_hidden_layers) ] - layer_type_validation(self.mlp_layer_types, self.num_hidden_layers, attention=False) - - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.tie_word_embeddings = tie_word_embeddings - PreTrainedConfig.__init__(**kwargs) + super().__post_init__(**kwargs) class ExaoneMoeAttention(Exaone4Attention): diff --git a/src/transformers/models/falcon/configuration_falcon.py b/src/transformers/models/falcon/configuration_falcon.py index 0838d2f7e217..8a8e18578816 100644 --- a/src/transformers/models/falcon/configuration_falcon.py +++ b/src/transformers/models/falcon/configuration_falcon.py @@ -13,15 +13,15 @@ # limitations under the License. 
"""Falcon configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters -from ...utils import auto_docstring, logging - - -logger = logging.get_logger(__name__) +from ...utils import auto_docstring @auto_docstring(checkpoint="tiiuae/falcon-7b") +@strict(accept_kwargs=True) class FalconConfig(PreTrainedConfig): r""" num_ln_in_parallel_attn (`int`, *optional*): @@ -63,66 +63,40 @@ class FalconConfig(PreTrainedConfig): model_type = "falcon" keys_to_ignore_at_inference = ["past_key_values"] - def __init__( - self, - vocab_size: int | None = 65024, - hidden_size: int | None = 4544, - num_hidden_layers: int | None = 32, - num_attention_heads: int | None = 71, - num_ln_in_parallel_attn: int | None = None, - layer_norm_epsilon: int | None = 1e-5, - initializer_range: float | None = 0.02, - use_cache: bool | None = True, - hidden_dropout: float | None = 0.0, - attention_dropout: float | None = 0.0, - num_kv_heads: int | None = None, - alibi: bool | None = False, - new_decoder_architecture: bool | None = False, - multi_query: bool | None = True, - parallel_attn: bool | None = True, - bias: bool | None = False, - max_position_embeddings: int | None = 2048, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - bos_token_id: int | None = 11, - eos_token_id: int | None = 11, - pad_token_id: int | None = None, - ffn_hidden_size: int | None = None, - activation: str | None = "gelu", - tie_word_embeddings: bool | None = True, - **kwargs, - ): - self.vocab_size = vocab_size + vocab_size: int = 65024 + hidden_size: int = 4544 + num_hidden_layers: int = 32 + num_attention_heads: int = 71 + num_ln_in_parallel_attn: int | None = None + layer_norm_epsilon: float | None = 1e-5 + initializer_range: float = 0.02 + use_cache: bool = True + hidden_dropout: float | int | None = 0.0 + attention_dropout: float | int | None = 0.0 + num_kv_heads: int | None = None + alibi: bool | None = False + new_decoder_architecture: bool | None = False + multi_query: bool | None = True + parallel_attn: bool | None = True + bias: bool | None = False + max_position_embeddings: int = 2048 + rope_parameters: RopeParameters | dict | None = None + bos_token_id: int | None = 11 + eos_token_id: int | list[int] | None = 11 + pad_token_id: int | None = None + ffn_hidden_size: int | None = None + activation: str | None = "gelu" + tie_word_embeddings: bool = True + + def __post_init__(self, **kwargs): # Backward compatibility with n_embed kwarg n_embed = kwargs.pop("n_embed", None) - self.hidden_size = hidden_size if n_embed is None else n_embed - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.layer_norm_epsilon = layer_norm_epsilon - self.initializer_range = initializer_range - self.use_cache = use_cache - self.hidden_dropout = hidden_dropout - self.attention_dropout = attention_dropout - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.num_kv_heads = num_attention_heads if num_kv_heads is None else num_kv_heads - self.alibi = alibi - self.new_decoder_architecture = new_decoder_architecture - self.multi_query = multi_query # Ignored when new_decoder_architecture is True - self.parallel_attn = parallel_attn - self.bias = bias - self.num_ln_in_parallel_attn = num_ln_in_parallel_attn - self.max_position_embeddings = max_position_embeddings - self.activation = activation - self.tie_word_embeddings = 
tie_word_embeddings - if ffn_hidden_size is None: - self.ffn_hidden_size = hidden_size * 4 - else: - self.ffn_hidden_size = ffn_hidden_size - - self.rope_parameters = rope_parameters - - super().__init__(**kwargs) + self.hidden_size = self.hidden_size if n_embed is None else n_embed + self.num_kv_heads = self.num_attention_heads if self.num_kv_heads is None else self.num_kv_heads + if self.ffn_hidden_size is None: + self.ffn_hidden_size = self.hidden_size * 4 + + super().__post_init__(**kwargs) @property def head_dim(self): diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index cd7e2b569026..43a48f21fe99 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -737,7 +737,7 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -902,7 +902,7 @@ def forward( are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict transformer_outputs = self.transformer( input_ids, @@ -999,7 +999,7 @@ def forward( `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict transformer_outputs = self.transformer( input_ids, @@ -1124,7 +1124,7 @@ def forward( `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict transformer_outputs = self.transformer( input_ids, @@ -1197,7 +1197,7 @@ def forward( [What are input IDs?](../glossary#input-ids) """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.transformer( input_ids, diff --git a/src/transformers/models/falcon_h1/configuration_falcon_h1.py b/src/transformers/models/falcon_h1/configuration_falcon_h1.py index 2dc8399d3a39..caa0e6288528 100644 --- a/src/transformers/models/falcon_h1/configuration_falcon_h1.py +++ b/src/transformers/models/falcon_h1/configuration_falcon_h1.py @@ -13,15 +13,15 @@ # limitations under the License. 
"""FalconH1 model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters -from ...utils import auto_docstring, logging - - -logger = logging.get_logger(__name__) +from ...utils import auto_docstring @auto_docstring(checkpoint="tiiuae/Falcon-H1-1.5B-Deep-Instruct") +@strict(accept_kwargs=True) class FalconH1Config(PreTrainedConfig): r""" num_logits_to_keep (`int` or `None`, *optional*, defaults to 1): @@ -56,148 +56,79 @@ class FalconH1Config(PreTrainedConfig): model_type = "falcon_h1" keys_to_ignore_at_inference = ["past_key_values"] - def __init__( - self, - vocab_size: int | None = 128000, - tie_word_embeddings: bool | None = False, - hidden_size: int | None = 4096, - intermediate_size: int | None = 14336, - num_hidden_layers: int | None = 32, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = 8, - hidden_act: str | None = "silu", - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-5, - use_cache: int | None = True, - num_logits_to_keep: int | None = 1, - pad_token_id: int | None = 0, - bos_token_id: int | None = 1, - eos_token_id: int | None = 2, - max_position_embeddings: int | None = 8192, - attention_dropout: float | None = 0.0, - mamba_d_ssm: int | None = 1024, - mamba_n_heads: int | None = 128, - mamba_d_head: str | None = "auto", - mamba_n_groups: int | None = 1, - mamba_d_state: int | None = 256, - mamba_d_conv: int | None = 4, - mamba_expand: int | None = 2, - mamba_chunk_size: int | None = 256, - mamba_conv_bias: bool | None = True, - mamba_proj_bias: bool | None = False, - mamba_norm_before_gate: bool | None = True, - mamba_rms_norm: bool | None = False, - time_step_min: float | None = 0.001, - time_step_max: float | None = 0.1, - time_step_limit: tuple[float, float] | None = (0.0, float("inf")), - projectors_bias: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - lm_head_multiplier: float | None = 1.0, - embedding_multiplier: float | None = 1.0, - mlp_multipliers: int | None = None, - key_multiplier: int | None = None, - attention_out_multiplier: int | None = None, - attention_in_multiplier: int | None = None, - ssm_multipliers: int | None = None, - ssm_in_multiplier: int | None = None, - ssm_out_multiplier: int | None = None, - **kwargs, - ): - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.max_position_embeddings = max_position_embeddings - self.attention_dropout = attention_dropout - self.attention_bias = False - self.mlp_bias = False - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - - self.use_cache = use_cache - self.num_logits_to_keep = num_logits_to_keep - self.projectors_bias = projectors_bias - mamba_intermediate = mamba_expand * hidden_size if mamba_d_ssm is None else mamba_d_ssm - - if mamba_intermediate % mamba_n_heads != 0: - raise ValueError("mamba_n_heads must divide mamba_expand * hidden_size") + vocab_size: int = 128000 + tie_word_embeddings: bool = False + hidden_size: int = 4096 + intermediate_size: int = 14336 + num_hidden_layers: int = 32 + num_attention_heads: int = 32 
+ num_key_value_heads: int | None = 8 + hidden_act: str = "silu" + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-5 + use_cache: int | None = True + num_logits_to_keep: int | None = 1 + pad_token_id: int | None = 0 + bos_token_id: int | None = 1 + eos_token_id: int | list[int] | None = 2 + max_position_embeddings: int = 8192 + attention_dropout: float | int | None = 0.0 + mamba_d_ssm: int | None = 1024 + mamba_n_heads: int | None = 128 + mamba_d_head: str | int | None = "auto" + mamba_n_groups: int | None = 1 + mamba_d_state: int | None = 256 + mamba_d_conv: int | None = 4 + mamba_expand: int | None = 2 + mamba_chunk_size: int | None = 256 + mamba_conv_bias: bool | None = True + mamba_proj_bias: bool | None = False + mamba_norm_before_gate: bool | None = True + mamba_rms_norm: bool | None = False + time_step_min: float | None = 0.001 + time_step_max: float | None = 0.1 + time_step_limit: list[float, float] | tuple[float, float] | None = (0.0, float("inf")) + projectors_bias: bool | None = False + rope_parameters: RopeParameters | dict | None = None + lm_head_multiplier: float | None = 1.0 + embedding_multiplier: float | None = 1.0 + mlp_multipliers: list[float] | None = None + key_multiplier: float | None = 1.0 + attention_out_multiplier: float | None = 1.0 + attention_in_multiplier: float | None = 1.0 + ssm_multipliers: list[float] | None = None + ssm_in_multiplier: float | None = 1.0 + ssm_out_multiplier: float | None = 1.0 + attention_bias: bool = False + mlp_bias: bool = False + + def __post_init__(self, **kwargs): + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads # for the mamba_v2, must satisfy the following - if mamba_d_head == "auto": - mamba_d_head = mamba_intermediate // mamba_n_heads - - if mamba_d_head * mamba_n_heads != mamba_intermediate: - raise ValueError("The dimensions for the Mamba head state do not match the model intermediate_size") + mamba_intermediate = self.mamba_expand * self.hidden_size if self.mamba_d_ssm is None else self.mamba_d_ssm + if self.mamba_d_head == "auto": + self.mamba_d_head = mamba_intermediate // self.mamba_n_heads - self.mamba_d_ssm = mamba_d_ssm - self.mamba_n_heads = mamba_n_heads - self.mamba_d_head = mamba_d_head - self.mamba_n_groups = mamba_n_groups - self.mamba_d_state = mamba_d_state - self.mamba_d_conv = mamba_d_conv - self.mamba_expand = mamba_expand - self.mamba_chunk_size = mamba_chunk_size - self.mamba_conv_bias = mamba_conv_bias - self.mamba_proj_bias = mamba_proj_bias - - self.mamba_norm_before_gate = mamba_norm_before_gate - self.mamba_rms_norm = mamba_rms_norm - self.time_step_min = time_step_min - self.time_step_max = time_step_max - self.time_step_limit = tuple(time_step_limit) if time_step_limit is not None else None - - self.lm_head_multiplier = lm_head_multiplier - self.embedding_multiplier = embedding_multiplier - - if mlp_multipliers is not None: - self.mlp_multipliers = mlp_multipliers - else: + self.time_step_limit = tuple(self.time_step_limit) if self.time_step_limit is not None else None + if self.mlp_multipliers is None: self.mlp_multipliers = [1.0, 1.0] - if attention_out_multiplier is not None: - self.attention_out_multiplier = attention_out_multiplier - else: - self.attention_out_multiplier = 1.0 + if self.ssm_multipliers is None: + self.ssm_multipliers = [1.0, 1.0, 1.0, 1.0, 1.0] - if attention_in_multiplier is not None: - self.attention_in_multiplier = attention_in_multiplier - else: - self.attention_in_multiplier = 1.0 + super().__post_init__(**kwargs) - if 
key_multiplier is not None: - self.key_multiplier = key_multiplier - else: - self.key_multiplier = 1.0 + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + mamba_intermediate = self.mamba_expand * self.hidden_size if self.mamba_d_ssm is None else self.mamba_d_ssm - if ssm_multipliers is not None: - self.ssm_multipliers = ssm_multipliers - else: - self.ssm_multipliers = [1.0, 1.0, 1.0, 1.0, 1.0] + if mamba_intermediate % self.mamba_n_heads != 0: + raise ValueError("mamba_n_heads must divide mamba_expand * hidden_size") - if ssm_in_multiplier is not None: - self.ssm_in_multiplier = ssm_in_multiplier - else: - self.ssm_in_multiplier = 1.0 - - if ssm_out_multiplier is not None: - self.ssm_out_multiplier = ssm_out_multiplier - else: - self.ssm_out_multiplier = 1.0 - - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.rope_parameters = rope_parameters - super().__init__(**kwargs) + if self.mamba_d_head * self.mamba_n_heads != mamba_intermediate: + raise ValueError("The dimensions for the Mamba head state do not match the model intermediate_size") @property def layers_block_type(self): diff --git a/src/transformers/models/falcon_mamba/configuration_falcon_mamba.py b/src/transformers/models/falcon_mamba/configuration_falcon_mamba.py index cc7583fcbbd8..0da599e463ad 100644 --- a/src/transformers/models/falcon_mamba/configuration_falcon_mamba.py +++ b/src/transformers/models/falcon_mamba/configuration_falcon_mamba.py @@ -19,11 +19,14 @@ # limitations under the License. import math +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring @auto_docstring(checkpoint="tiiuae/falcon-mamba-7b") +@strict(accept_kwargs=True) class FalconMambaConfig(PreTrainedConfig): r""" expand (`int`, *optional*, defaults to 2): @@ -66,72 +69,41 @@ class FalconMambaConfig(PreTrainedConfig): model_type = "falcon_mamba" - def __init__( - self, - vocab_size=50280, - hidden_size=768, - state_size=16, - num_hidden_layers=32, - layer_norm_epsilon=1e-5, - pad_token_id=0, - bos_token_id=0, - eos_token_id=0, - expand=2, - conv_kernel=4, - use_bias=False, - use_conv_bias=True, - hidden_act="silu", - initializer_range=0.1, - residual_in_fp32=True, - time_step_rank="auto", - time_step_scale=1.0, - time_step_min=0.001, - time_step_max=0.1, - time_step_init_scheme="random", - time_step_floor=1e-4, - rescale_prenorm_residual=False, - use_cache=True, - use_falcon_mambapy=False, - use_associative_scan=True, - mixer_rms_eps=1e-6, - tie_word_embeddings=True, - **kwargs, - ): - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.state_size = state_size - self.num_hidden_layers = num_hidden_layers - self.layer_norm_epsilon = layer_norm_epsilon - self.conv_kernel = conv_kernel - self.expand = expand - # This is needed since mamba overrides the intermediate_size attribute - self.intermediate_size = ( - int(expand * self.hidden_size) - if kwargs.get("intermediate_size") is None - else kwargs.get("intermediate_size") - ) - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.use_bias = use_bias - self.use_conv_bias = use_conv_bias - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.time_step_rank = math.ceil(self.hidden_size / 16) if time_step_rank == "auto" else time_step_rank - 
self.time_step_scale = time_step_scale - self.time_step_min = time_step_min - self.time_step_max = time_step_max - self.time_step_init_scheme = time_step_init_scheme - self.time_step_floor = time_step_floor - self.rescale_prenorm_residual = rescale_prenorm_residual - self.residual_in_fp32 = residual_in_fp32 - self.use_cache = use_cache - self.use_falcon_mambapy = use_falcon_mambapy - self.use_associative_scan = use_associative_scan - self.tie_word_embeddings = tie_word_embeddings + vocab_size: int = 50280 + hidden_size: int = 768 + state_size: int = 16 + num_hidden_layers: int = 32 + layer_norm_epsilon: float = 1e-5 + pad_token_id: int | None = 0 + bos_token_id: int | None = 0 + eos_token_id: int | list[int] | None = 0 + expand: int = 2 + conv_kernel: int = 4 + use_bias: bool = False + use_conv_bias: bool = True + hidden_act: str = "silu" + initializer_range: float = 0.1 + residual_in_fp32: bool = True + time_step_rank: str | int = "auto" + time_step_scale: float = 1.0 + time_step_min: float = 0.001 + time_step_max: float = 0.1 + time_step_init_scheme: str = "random" + time_step_floor: float = 1e-4 + rescale_prenorm_residual: bool = False + use_cache: bool = True - super().__init__(**kwargs) - self.mixer_rms_eps = mixer_rms_eps + use_falcon_mambapy: bool = False + use_associative_scan: bool = True + tie_word_embeddings: bool = True + mixer_rms_eps: float = 1e-6 + + def __post_init__(self, **kwargs): + self.intermediate_size = int(self.expand * self.hidden_size) + self.time_step_rank = ( + math.ceil(self.hidden_size / 16) if self.time_step_rank == "auto" else self.time_step_rank + ) + super().__post_init__(**kwargs) __all__ = ["FalconMambaConfig"] diff --git a/src/transformers/models/falcon_mamba/modeling_falcon_mamba.py b/src/transformers/models/falcon_mamba/modeling_falcon_mamba.py index 61f6040f0c4f..e500622bac24 100644 --- a/src/transformers/models/falcon_mamba/modeling_falcon_mamba.py +++ b/src/transformers/models/falcon_mamba/modeling_falcon_mamba.py @@ -738,7 +738,7 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) use_cache = use_cache if use_cache is not None else (self.config.use_cache if not self.training else False) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if (input_ids is None) ^ (inputs_embeds is not None): # ^ is python for xor raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -903,7 +903,7 @@ def forward( use_cache (`bool`, *optional*): If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict falcon_mamba_outputs = self.backbone( input_ids, diff --git a/src/transformers/models/falcon_mamba/modular_falcon_mamba.py b/src/transformers/models/falcon_mamba/modular_falcon_mamba.py index 5133a54d3044..4e547101ae4a 100644 --- a/src/transformers/models/falcon_mamba/modular_falcon_mamba.py +++ b/src/transformers/models/falcon_mamba/modular_falcon_mamba.py @@ -14,6 +14,7 @@ """PyTorch FALCONMAMBA model.""" import torch +from huggingface_hub.dataclasses import strict from torch import nn from ... 
import initialization as init @@ -55,6 +56,7 @@ @auto_docstring(checkpoint="tiiuae/falcon-mamba-7b") +@strict(accept_kwargs=True) class FalconMambaConfig(MambaConfig): r""" expand (`int`, *optional*, defaults to 2): @@ -95,73 +97,9 @@ class FalconMambaConfig(MambaConfig): >>> configuration = model.config ```""" - def __init__( - self, - vocab_size=50280, - hidden_size=768, - state_size=16, - num_hidden_layers=32, - layer_norm_epsilon=1e-5, - pad_token_id=0, - bos_token_id=0, - eos_token_id=0, - expand=2, - conv_kernel=4, - use_bias=False, - use_conv_bias=True, - hidden_act="silu", - initializer_range=0.1, - residual_in_fp32=True, - time_step_rank="auto", - time_step_scale=1.0, - time_step_min=0.001, - time_step_max=0.1, - time_step_init_scheme="random", - time_step_floor=1e-4, - rescale_prenorm_residual=False, - use_cache=True, - use_falcon_mambapy=False, - use_associative_scan=True, - mixer_rms_eps=1e-6, - tie_word_embeddings=True, - **kwargs, - ): - super().__init__( - vocab_size=vocab_size, - hidden_size=hidden_size, - state_size=state_size, - num_hidden_layers=num_hidden_layers, - layer_norm_epsilon=layer_norm_epsilon, - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - expand=expand, - conv_kernel=conv_kernel, - use_bias=use_bias, - use_conv_bias=use_conv_bias, - hidden_act=hidden_act, - initializer_range=initializer_range, - residual_in_fp32=residual_in_fp32, - time_step_rank=time_step_rank, - time_step_scale=time_step_scale, - time_step_min=time_step_min, - time_step_max=time_step_max, - time_step_init_scheme=time_step_init_scheme, - time_step_floor=time_step_floor, - rescale_prenorm_residual=rescale_prenorm_residual, - use_cache=use_cache, - use_falcon_mambapy=use_falcon_mambapy, - use_associative_scan=use_associative_scan, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) - self.mixer_rms_eps = mixer_rms_eps - # This is needed since mamba overrides the intermediate_size attribute - self.intermediate_size = ( - int(expand * self.hidden_size) - if kwargs.get("intermediate_size") is None - else kwargs.get("intermediate_size") - ) + use_falcon_mambapy: bool = False + use_associative_scan: bool = True + mixer_rms_eps: float = 1e-6 class FalconMambaCache(MambaCache): diff --git a/src/transformers/models/fast_vlm/configuration_fast_vlm.py b/src/transformers/models/fast_vlm/configuration_fast_vlm.py index e605e981e85f..e45baa298f86 100644 --- a/src/transformers/models/fast_vlm/configuration_fast_vlm.py +++ b/src/transformers/models/fast_vlm/configuration_fast_vlm.py @@ -18,12 +18,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+ +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring from ..auto import CONFIG_MAPPING, AutoConfig @auto_docstring(checkpoint="KamilaMila/FastVLM-7B") +@strict(accept_kwargs=True) class FastVlmConfig(PreTrainedConfig): r""" Example: @@ -47,39 +51,22 @@ class FastVlmConfig(PreTrainedConfig): } sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} - def __init__( - self, - vision_config=None, - text_config=None, - image_token_id=151646, - projector_hidden_act="gelu", - vision_feature_select_strategy="full", - vision_feature_layer=-1, - multimodal_projector_bias=True, - tie_word_embeddings=False, - **kwargs, - ): - self.image_token_id = image_token_id - self.projector_hidden_act = projector_hidden_act - - if vision_feature_select_strategy != "full": - raise ValueError( - f"Unexpected select feature strategy: {vision_feature_select_strategy}. Only 'full' is supported in FastVLM." - ) - - if vision_feature_layer != -1: - raise ValueError( - f"Unexpected vision feature layer: {vision_feature_layer}. Only -1 is supported in FastVLM." - ) - - self.vision_feature_select_strategy = vision_feature_select_strategy - self.vision_feature_layer = vision_feature_layer - - if isinstance(vision_config, dict): - vision_config["model_type"] = vision_config.get("model_type", "timm_wrapper") - vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) - elif vision_config is None: - vision_config = CONFIG_MAPPING["timm_wrapper"]( + vision_config: dict | PreTrainedConfig | None = None + text_config: dict | PreTrainedConfig | None = None + image_token_index: int = 151646 + image_seq_length: int = 576 + projector_hidden_act: str = "gelu" + vision_feature_select_strategy: str = "full" + vision_feature_layer: int | list[int] = -1 + multimodal_projector_bias: bool = True + tie_word_embeddings: bool = False + + def __post_init__(self, **kwargs): + if isinstance(self.vision_config, dict): + self.vision_config["model_type"] = self.vision_config.get("model_type", "timm_wrapper") + self.vision_config = CONFIG_MAPPING[self.vision_config["model_type"]](**self.vision_config) + elif self.vision_config is None: + self.vision_config = CONFIG_MAPPING["timm_wrapper"]( architecture="fastvit_mci3", do_pooling=True, global_pool="avg", @@ -88,13 +75,11 @@ def __init__( model_args={"inference_mode": True}, ) - self.vision_config = vision_config - - if isinstance(text_config, dict): - text_config["model_type"] = text_config.get("model_type", "qwen2") - text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) - elif text_config is None: - text_config = CONFIG_MAPPING["qwen2"]( + if isinstance(self.text_config, dict): + self.text_config["model_type"] = self.text_config.get("model_type", "qwen2") + self.text_config = CONFIG_MAPPING[self.text_config["model_type"]](**self.text_config) + elif self.text_config is None: + self.text_config = CONFIG_MAPPING["qwen2"]( hidden_size=3584, vocab_size=152128, intermediate_size=18944, @@ -102,18 +87,25 @@ def __init__( num_key_value_heads=4, num_hidden_layers=28, ) - - self.text_config = text_config - self.multimodal_projector_bias = multimodal_projector_bias - self.tie_word_embeddings = tie_word_embeddings - # The default value is `False` but this config is used with many model types # Attr `tie_word_embeddings` was saved in text config for those models, so we # need an ugly workaround and forward-pass the attr from text config - if not tie_word_embeddings and 
self.text_config.tie_word_embeddings: + if not self.tie_word_embeddings and self.text_config.tie_word_embeddings: self.tie_word_embeddings = self.text_config.tie_word_embeddings - super().__init__(**kwargs) + super().__post_init__(**kwargs) + + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + if self.vision_feature_select_strategy != "full": + raise ValueError( + f"Unexpected select feature strategy: {self.vision_feature_select_strategy}. Only 'full' is supported in FastVLM." + ) + + if self.vision_feature_layer != -1: + raise ValueError( + f"Unexpected vision feature layer: {self.vision_feature_layer}. Only -1 is supported in FastVLM." + ) __all__ = ["FastVlmConfig"] diff --git a/src/transformers/models/fast_vlm/modeling_fast_vlm.py b/src/transformers/models/fast_vlm/modeling_fast_vlm.py index c4b3013c855e..1865ad9800c2 100644 --- a/src/transformers/models/fast_vlm/modeling_fast_vlm.py +++ b/src/transformers/models/fast_vlm/modeling_fast_vlm.py @@ -18,6 +18,7 @@ # See the License for the specific language governing permissions and # limitations under the License. + from dataclasses import dataclass import torch @@ -122,7 +123,7 @@ def set_input_embeddings(self, value): def get_image_features( self, pixel_values: torch.FloatTensor, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | None = None, vision_feature_select_strategy: str | None = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple | BaseModelOutputWithPooling: @@ -180,7 +181,7 @@ def forward( position_ids: torch.LongTensor | None = None, past_key_values: Cache | None = None, inputs_embeds: torch.FloatTensor | None = None, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | None = None, vision_feature_select_strategy: str | None = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple | FastVlmModelOutputWithPast: @@ -285,7 +286,7 @@ def get_output_embeddings(self) -> nn.Module: def get_image_features( self, pixel_values: torch.FloatTensor, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | None = None, vision_feature_select_strategy: str | None = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple | BaseModelOutputWithPooling: @@ -306,7 +307,7 @@ def forward( position_ids: torch.LongTensor | None = None, past_key_values: Cache | None = None, inputs_embeds: torch.FloatTensor | None = None, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | None = None, vision_feature_select_strategy: str | None = None, labels: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, diff --git a/src/transformers/models/fast_vlm/modular_fast_vlm.py b/src/transformers/models/fast_vlm/modular_fast_vlm.py index 20d7a2279280..514f2b799892 100644 --- a/src/transformers/models/fast_vlm/modular_fast_vlm.py +++ b/src/transformers/models/fast_vlm/modular_fast_vlm.py @@ -12,7 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+ import torch +from huggingface_hub.dataclasses import strict from torch import nn from ...activations import ACT2FN @@ -35,6 +37,7 @@ @auto_docstring(checkpoint="KamilaMila/FastVLM-7B") +@strict(accept_kwargs=True) class FastVlmConfig(LlavaConfig): r""" Example: @@ -54,39 +57,21 @@ class FastVlmConfig(LlavaConfig): model_type = "fast_vlm" - def __init__( - self, - vision_config=None, - text_config=None, - image_token_id=151646, - projector_hidden_act="gelu", - vision_feature_select_strategy="full", - vision_feature_layer=-1, - multimodal_projector_bias=True, - tie_word_embeddings=False, - **kwargs, - ): - self.image_token_id = image_token_id - self.projector_hidden_act = projector_hidden_act - - if vision_feature_select_strategy != "full": - raise ValueError( - f"Unexpected select feature strategy: {vision_feature_select_strategy}. Only 'full' is supported in FastVLM." - ) - - if vision_feature_layer != -1: - raise ValueError( - f"Unexpected vision feature layer: {vision_feature_layer}. Only -1 is supported in FastVLM." - ) - - self.vision_feature_select_strategy = vision_feature_select_strategy - self.vision_feature_layer = vision_feature_layer - - if isinstance(vision_config, dict): - vision_config["model_type"] = vision_config.get("model_type", "timm_wrapper") - vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) - elif vision_config is None: - vision_config = CONFIG_MAPPING["timm_wrapper"]( + vision_config: dict | PreTrainedConfig | None = None + text_config: dict | PreTrainedConfig | None = None + image_token_index: int = 151646 + projector_hidden_act: str = "gelu" + vision_feature_select_strategy: str = "full" + vision_feature_layer: int | list[int] = -1 + multimodal_projector_bias: bool = True + tie_word_embeddings: bool = False + + def __post_init__(self, **kwargs): + if isinstance(self.vision_config, dict): + self.vision_config["model_type"] = self.vision_config.get("model_type", "timm_wrapper") + self.vision_config = CONFIG_MAPPING[self.vision_config["model_type"]](**self.vision_config) + elif self.vision_config is None: + self.vision_config = CONFIG_MAPPING["timm_wrapper"]( architecture="fastvit_mci3", do_pooling=True, global_pool="avg", @@ -95,13 +80,11 @@ def __init__( model_args={"inference_mode": True}, ) - self.vision_config = vision_config - - if isinstance(text_config, dict): - text_config["model_type"] = text_config.get("model_type", "qwen2") - text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) - elif text_config is None: - text_config = CONFIG_MAPPING["qwen2"]( + if isinstance(self.text_config, dict): + self.text_config["model_type"] = self.text_config.get("model_type", "qwen2") + self.text_config = CONFIG_MAPPING[self.text_config["model_type"]](**self.text_config) + elif self.text_config is None: + self.text_config = CONFIG_MAPPING["qwen2"]( hidden_size=3584, vocab_size=152128, intermediate_size=18944, @@ -109,18 +92,25 @@ def __init__( num_key_value_heads=4, num_hidden_layers=28, ) - - self.text_config = text_config - self.multimodal_projector_bias = multimodal_projector_bias - self.tie_word_embeddings = tie_word_embeddings - # The default value is `False` but this config is used with many model types # Attr `tie_word_embeddings` was saved in text config for those models, so we # need an ugly workaround and forward-pass the attr from text config - if not tie_word_embeddings and self.text_config.tie_word_embeddings: + if not self.tie_word_embeddings and self.text_config.tie_word_embeddings: self.tie_word_embeddings = 
self.text_config.tie_word_embeddings - PreTrainedConfig.__init__(**kwargs) + PreTrainedConfig.__post_init__(**kwargs) + + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + if self.vision_feature_select_strategy != "full": + raise ValueError( + f"Unexpected select feature strategy: {self.vision_feature_select_strategy}. Only 'full' is supported in FastVLM." + ) + + if self.vision_feature_layer != -1: + raise ValueError( + f"Unexpected vision feature layer: {self.vision_feature_layer}. Only -1 is supported in FastVLM." + ) class FastVlmMultiModalProjector(LlavaMultiModalProjector): @@ -159,7 +149,7 @@ def __init__(self, config: FastVlmConfig): def get_image_features( self, pixel_values: torch.FloatTensor, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | None = None, vision_feature_select_strategy: str | None = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple | BaseModelOutputWithPooling: @@ -193,7 +183,7 @@ def forward( position_ids: torch.LongTensor | None = None, past_key_values: Cache | None = None, inputs_embeds: torch.FloatTensor | None = None, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | None = None, vision_feature_select_strategy: str | None = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple | FastVlmModelOutputWithPast: @@ -262,7 +252,7 @@ def forward( position_ids: torch.LongTensor | None = None, past_key_values: Cache | None = None, inputs_embeds: torch.FloatTensor | None = None, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | None = None, vision_feature_select_strategy: str | None = None, labels: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, diff --git a/src/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py b/src/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py index 500db162cbff..e2e9cc1fadf3 100644 --- a/src/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +++ b/src/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py @@ -13,6 +13,8 @@ # limitations under the License. 
"""FastSpeech2Conformer model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring, logging @@ -21,6 +23,7 @@ @auto_docstring(checkpoint="espnet/fastspeech2_conformer") +@strict(accept_kwargs=True) class FastSpeech2ConformerConfig(PreTrainedConfig): r""" encoder_num_attention_heads (`int`, *optional*, defaults to 2): @@ -143,166 +146,124 @@ class FastSpeech2ConformerConfig(PreTrainedConfig): base_config_key = "model_config" attribute_map = {"num_hidden_layers": "encoder_layers", "num_attention_heads": "encoder_num_attention_heads"} - def __init__( - self, - hidden_size=384, - vocab_size=78, - num_mel_bins=80, - encoder_num_attention_heads=2, - encoder_layers=4, - encoder_linear_units=1536, - decoder_layers=4, - decoder_num_attention_heads=2, - decoder_linear_units=1536, - speech_decoder_postnet_layers=5, - speech_decoder_postnet_units=256, - speech_decoder_postnet_kernel=5, - positionwise_conv_kernel_size=3, - encoder_normalize_before=False, - decoder_normalize_before=False, - encoder_concat_after=False, - decoder_concat_after=False, - reduction_factor=1, - speaking_speed=1.0, - use_macaron_style_in_conformer=True, - use_cnn_in_conformer=True, - encoder_kernel_size=7, - decoder_kernel_size=31, - duration_predictor_layers=2, - duration_predictor_channels=256, - duration_predictor_kernel_size=3, - energy_predictor_layers=2, - energy_predictor_channels=256, - energy_predictor_kernel_size=3, - energy_predictor_dropout=0.5, - energy_embed_kernel_size=1, - energy_embed_dropout=0.0, - stop_gradient_from_energy_predictor=False, - pitch_predictor_layers=5, - pitch_predictor_channels=256, - pitch_predictor_kernel_size=5, - pitch_predictor_dropout=0.5, - pitch_embed_kernel_size=1, - pitch_embed_dropout=0.0, - stop_gradient_from_pitch_predictor=True, - encoder_dropout_rate=0.2, - encoder_positional_dropout_rate=0.2, - encoder_attention_dropout_rate=0.2, - decoder_dropout_rate=0.2, - decoder_positional_dropout_rate=0.2, - decoder_attention_dropout_rate=0.2, - duration_predictor_dropout_rate=0.2, - speech_decoder_postnet_dropout=0.5, - max_source_positions=5000, - use_masking=True, - use_weighted_masking=False, - num_speakers=None, - num_languages=None, - speaker_embed_dim=None, - is_encoder_decoder=True, - convolution_bias=True, - **kwargs, - ): - if positionwise_conv_kernel_size % 2 == 0: + hidden_size: int = 384 + vocab_size: int = 78 + num_mel_bins: int = 80 + encoder_num_attention_heads: int = 2 + encoder_layers: int = 4 + encoder_linear_units: int = 1536 + decoder_layers: int = 4 + decoder_num_attention_heads: int = 2 + decoder_linear_units: int = 1536 + speech_decoder_postnet_layers: int = 5 + speech_decoder_postnet_units: int = 256 + speech_decoder_postnet_kernel: int = 5 + positionwise_conv_kernel_size: int = 3 + encoder_normalize_before: bool = False + decoder_normalize_before: bool = False + encoder_concat_after: bool = False + decoder_concat_after: bool = False + reduction_factor: int = 1 + speaking_speed: float = 1.0 + use_macaron_style_in_conformer: bool = True + use_cnn_in_conformer: bool = True + encoder_kernel_size: int = 7 + decoder_kernel_size: int = 31 + duration_predictor_layers: int = 2 + duration_predictor_channels: int = 256 + duration_predictor_kernel_size: int = 3 + energy_predictor_layers: int = 2 + energy_predictor_channels: int = 256 + energy_predictor_kernel_size: int = 3 + energy_predictor_dropout: float | int = 0.5 + energy_embed_kernel_size: int = 1 + 
energy_embed_dropout: float | int = 0.0 + stop_gradient_from_energy_predictor: bool = False + pitch_predictor_layers: int = 5 + pitch_predictor_channels: int = 256 + pitch_predictor_kernel_size: int = 5 + pitch_predictor_dropout: float | int = 0.5 + pitch_embed_kernel_size: int = 1 + pitch_embed_dropout: float | int = 0.0 + stop_gradient_from_pitch_predictor: bool = True + encoder_dropout_rate: float = 0.2 + encoder_positional_dropout_rate: float = 0.2 + encoder_attention_dropout_rate: float = 0.2 + decoder_dropout_rate: float = 0.2 + decoder_positional_dropout_rate: float = 0.2 + decoder_attention_dropout_rate: float = 0.2 + duration_predictor_dropout_rate: float = 0.2 + speech_decoder_postnet_dropout: float | int = 0.5 + max_source_positions: int = 5000 + use_masking: bool = True + use_weighted_masking: bool = False + num_speakers: int | None = None + num_languages: int | None = None + speaker_embed_dim: int | None = None + is_encoder_decoder: bool = True + convolution_bias: bool = True + + def __post_init__(self, **kwargs): + self.encoder_config = { + "num_attention_heads": self.encoder_num_attention_heads, + "layers": self.encoder_layers, + "kernel_size": self.encoder_kernel_size, + "attention_dropout_rate": self.encoder_attention_dropout_rate, + "dropout_rate": self.encoder_dropout_rate, + "positional_dropout_rate": self.encoder_positional_dropout_rate, + "linear_units": self.encoder_linear_units, + "normalize_before": self.encoder_normalize_before, + "concat_after": self.encoder_concat_after, + } + self.decoder_config = { + "num_attention_heads": self.decoder_num_attention_heads, + "layers": self.decoder_layers, + "kernel_size": self.decoder_kernel_size, + "attention_dropout_rate": self.decoder_attention_dropout_rate, + "dropout_rate": self.decoder_dropout_rate, + "positional_dropout_rate": self.decoder_positional_dropout_rate, + "linear_units": self.decoder_linear_units, + "normalize_before": self.decoder_normalize_before, + "concat_after": self.decoder_concat_after, + } + super().__post_init__(**kwargs) + + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + if self.positionwise_conv_kernel_size % 2 == 0: raise ValueError( - f"positionwise_conv_kernel_size must be odd, but got {positionwise_conv_kernel_size} instead." + f"positionwise_conv_kernel_size must be odd, but got {self.positionwise_conv_kernel_size} instead." ) - if encoder_kernel_size % 2 == 0: - raise ValueError(f"encoder_kernel_size must be odd, but got {encoder_kernel_size} instead.") - if decoder_kernel_size % 2 == 0: - raise ValueError(f"decoder_kernel_size must be odd, but got {decoder_kernel_size} instead.") - if duration_predictor_kernel_size % 2 == 0: + if self.encoder_kernel_size % 2 == 0: + raise ValueError(f"encoder_kernel_size must be odd, but got {self.encoder_kernel_size} instead.") + if self.decoder_kernel_size % 2 == 0: + raise ValueError(f"decoder_kernel_size must be odd, but got {self.decoder_kernel_size} instead.") + if self.duration_predictor_kernel_size % 2 == 0: raise ValueError( - f"duration_predictor_kernel_size must be odd, but got {duration_predictor_kernel_size} instead." + f"duration_predictor_kernel_size must be odd, but got {self.duration_predictor_kernel_size} instead." ) - if energy_predictor_kernel_size % 2 == 0: + if self.energy_predictor_kernel_size % 2 == 0: raise ValueError( - f"energy_predictor_kernel_size must be odd, but got {energy_predictor_kernel_size} instead." 
+ f"energy_predictor_kernel_size must be odd, but got {self.energy_predictor_kernel_size} instead." ) - if energy_embed_kernel_size % 2 == 0: - raise ValueError(f"energy_embed_kernel_size must be odd, but got {energy_embed_kernel_size} instead.") - if pitch_predictor_kernel_size % 2 == 0: + if self.energy_embed_kernel_size % 2 == 0: + raise ValueError(f"energy_embed_kernel_size must be odd, but got {self.energy_embed_kernel_size} instead.") + if self.pitch_predictor_kernel_size % 2 == 0: raise ValueError( - f"pitch_predictor_kernel_size must be odd, but got {pitch_predictor_kernel_size} instead." + f"pitch_predictor_kernel_size must be odd, but got {self.pitch_predictor_kernel_size} instead." ) - if pitch_embed_kernel_size % 2 == 0: - raise ValueError(f"pitch_embed_kernel_size must be odd, but got {pitch_embed_kernel_size} instead.") - if hidden_size % encoder_num_attention_heads != 0: + if self.pitch_embed_kernel_size % 2 == 0: + raise ValueError(f"pitch_embed_kernel_size must be odd, but got {self.pitch_embed_kernel_size} instead.") + if self.hidden_size % self.encoder_num_attention_heads != 0: raise ValueError("The hidden_size must be evenly divisible by encoder_num_attention_heads.") - if hidden_size % decoder_num_attention_heads != 0: + if self.hidden_size % self.decoder_num_attention_heads != 0: raise ValueError("The hidden_size must be evenly divisible by decoder_num_attention_heads.") - if use_masking and use_weighted_masking: + if self.use_masking and self.use_weighted_masking: raise ValueError("Either use_masking or use_weighted_masking can be True, but not both.") - self.hidden_size = hidden_size - self.vocab_size = vocab_size - self.num_mel_bins = num_mel_bins - self.encoder_config = { - "num_attention_heads": encoder_num_attention_heads, - "layers": encoder_layers, - "kernel_size": encoder_kernel_size, - "attention_dropout_rate": encoder_attention_dropout_rate, - "dropout_rate": encoder_dropout_rate, - "positional_dropout_rate": encoder_positional_dropout_rate, - "linear_units": encoder_linear_units, - "normalize_before": encoder_normalize_before, - "concat_after": encoder_concat_after, - } - self.decoder_config = { - "num_attention_heads": decoder_num_attention_heads, - "layers": decoder_layers, - "kernel_size": decoder_kernel_size, - "attention_dropout_rate": decoder_attention_dropout_rate, - "dropout_rate": decoder_dropout_rate, - "positional_dropout_rate": decoder_positional_dropout_rate, - "linear_units": decoder_linear_units, - "normalize_before": decoder_normalize_before, - "concat_after": decoder_concat_after, - } - self.encoder_num_attention_heads = encoder_num_attention_heads - self.encoder_layers = encoder_layers - self.duration_predictor_channels = duration_predictor_channels - self.duration_predictor_kernel_size = duration_predictor_kernel_size - self.duration_predictor_layers = duration_predictor_layers - self.energy_embed_dropout = energy_embed_dropout - self.energy_embed_kernel_size = energy_embed_kernel_size - self.energy_predictor_channels = energy_predictor_channels - self.energy_predictor_dropout = energy_predictor_dropout - self.energy_predictor_kernel_size = energy_predictor_kernel_size - self.energy_predictor_layers = energy_predictor_layers - self.pitch_embed_dropout = pitch_embed_dropout - self.pitch_embed_kernel_size = pitch_embed_kernel_size - self.pitch_predictor_channels = pitch_predictor_channels - self.pitch_predictor_dropout = pitch_predictor_dropout - self.pitch_predictor_kernel_size = pitch_predictor_kernel_size - self.pitch_predictor_layers = 
pitch_predictor_layers - self.positionwise_conv_kernel_size = positionwise_conv_kernel_size - self.speech_decoder_postnet_units = speech_decoder_postnet_units - self.speech_decoder_postnet_dropout = speech_decoder_postnet_dropout - self.speech_decoder_postnet_kernel = speech_decoder_postnet_kernel - self.speech_decoder_postnet_layers = speech_decoder_postnet_layers - self.reduction_factor = reduction_factor - self.speaking_speed = speaking_speed - self.stop_gradient_from_energy_predictor = stop_gradient_from_energy_predictor - self.stop_gradient_from_pitch_predictor = stop_gradient_from_pitch_predictor - self.max_source_positions = max_source_positions - self.use_cnn_in_conformer = use_cnn_in_conformer - self.use_macaron_style_in_conformer = use_macaron_style_in_conformer - self.use_masking = use_masking - self.use_weighted_masking = use_weighted_masking - self.num_speakers = num_speakers - self.num_languages = num_languages - self.speaker_embed_dim = speaker_embed_dim - self.duration_predictor_dropout_rate = duration_predictor_dropout_rate - self.convolution_bias = convolution_bias - - super().__init__( - is_encoder_decoder=is_encoder_decoder, - **kwargs, - ) - @auto_docstring(checkpoint="espnet/fastspeech2_conformer") +@strict(accept_kwargs=True) class FastSpeech2ConformerHifiGanConfig(PreTrainedConfig): r""" model_in_dim (`int`, *optional*, defaults to 80): @@ -346,32 +307,24 @@ class FastSpeech2ConformerHifiGanConfig(PreTrainedConfig): model_type = "hifigan" base_config_key = "vocoder_config" - def __init__( - self, - model_in_dim=80, - upsample_initial_channel=512, - upsample_rates=[8, 8, 2, 2], - upsample_kernel_sizes=[16, 16, 4, 4], - resblock_kernel_sizes=[3, 7, 11], - resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]], - initializer_range=0.01, - leaky_relu_slope=0.1, - normalize_before=True, - **kwargs, - ): - self.model_in_dim = model_in_dim - self.upsample_initial_channel = upsample_initial_channel - self.upsample_rates = upsample_rates - self.upsample_kernel_sizes = upsample_kernel_sizes - self.resblock_kernel_sizes = resblock_kernel_sizes - self.resblock_dilation_sizes = resblock_dilation_sizes - self.initializer_range = initializer_range - self.leaky_relu_slope = leaky_relu_slope - self.normalize_before = normalize_before - super().__init__(**kwargs) + model_in_dim: int = 80 + upsample_initial_channel: int = 512 + upsample_rates: list[int] | tuple[int, ...] = (8, 8, 2, 2) + upsample_kernel_sizes: list[int] | tuple[int, ...] = (16, 16, 4, 4) + resblock_kernel_sizes: list[int] | tuple[int, ...] 
= (3, 7, 11) + resblock_dilation_sizes: list | tuple | None = None + initializer_range: float = 0.01 + leaky_relu_slope: float = 0.1 + normalize_before: bool = True + + def __post_init__(self, **kwargs): + if self.resblock_dilation_sizes is None: + self.resblock_dilation_sizes = [[1, 3, 5], [1, 3, 5], [1, 3, 5]] + super().__post_init__(**kwargs) @auto_docstring(checkpoint="espnet/fastspeech2_conformer") +@strict(accept_kwargs=True) class FastSpeech2ConformerWithHifiGanConfig(PreTrainedConfig): """ model_config ([`FastSpeech2ConformerConfig | dict`], *optional*): @@ -407,24 +360,23 @@ class FastSpeech2ConformerWithHifiGanConfig(PreTrainedConfig): model_type = "fastspeech2_conformer_with_hifigan" sub_configs = {"model_config": FastSpeech2ConformerConfig, "vocoder_config": FastSpeech2ConformerHifiGanConfig} - def __init__( - self, - model_config: dict | None = None, - vocoder_config: dict | None = None, - **kwargs, - ): - if model_config is None: - model_config = {} + model_config: dict | PreTrainedConfig | None = None + vocoder_config: dict | PreTrainedConfig | None = None + + def __post_init__(self, **kwargs): + if self.model_config is None: + self.model_config = FastSpeech2ConformerConfig() logger.info("model_config is None. initializing the model with default values.") + elif isinstance(self.model_config, dict): + self.model_config = FastSpeech2ConformerConfig(**self.model_config) - if vocoder_config is None: - vocoder_config = {} + if self.vocoder_config is None: + self.vocoder_config = FastSpeech2ConformerHifiGanConfig() logger.info("vocoder_config is None. initializing the coarse model with default values.") + elif isinstance(self.vocoder_config, dict): + self.vocoder_config = FastSpeech2ConformerHifiGanConfig(**self.vocoder_config) - self.model_config = FastSpeech2ConformerConfig(**model_config) - self.vocoder_config = FastSpeech2ConformerHifiGanConfig(**vocoder_config) - - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["FastSpeech2ConformerConfig", "FastSpeech2ConformerHifiGanConfig", "FastSpeech2ConformerWithHifiGanConfig"] diff --git a/src/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py b/src/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py index 10d4dce089f3..de07be32a115 100644 --- a/src/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +++ b/src/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py @@ -1169,7 +1169,7 @@ def forward( torch.Size([1, 49664]) ``` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -1563,7 +1563,7 @@ def forward( torch.Size([1, 49664]) ``` """ - return_dict = return_dict if return_dict is not None else self.config.model_config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.model_config.return_dict output_attentions = ( output_attentions if output_attentions is not None else self.config.model_config.output_attentions ) diff --git a/src/transformers/models/flaubert/configuration_flaubert.py b/src/transformers/models/flaubert/configuration_flaubert.py index fe78b970456e..87085038903d 100644 --- 
a/src/transformers/models/flaubert/configuration_flaubert.py +++ b/src/transformers/models/flaubert/configuration_flaubert.py @@ -13,14 +13,14 @@ # limitations under the License. """Flaubert configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="flaubert/flaubert_base_uncased") +@strict(accept_kwargs=True) class FlaubertConfig(PreTrainedConfig): r""" pre_norm (`bool`, *optional*, defaults to `False`): @@ -101,86 +101,43 @@ class FlaubertConfig(PreTrainedConfig): "pad_index": "pad_token_id", } - def __init__( - self, - pre_norm=False, - layerdrop=0.0, - vocab_size=30145, - emb_dim=2048, - n_layers=12, - n_heads=16, - dropout=0.1, - attention_dropout=0.1, - gelu_activation=True, - sinusoidal_embeddings=False, - causal=False, - asm=False, - n_langs=1, - use_lang_emb=True, - max_position_embeddings=512, - embed_init_std=2048**-0.5, - layer_norm_eps=1e-12, - init_std=0.02, - bos_index=0, - eos_index=1, - pad_index=2, - unk_index=3, - mask_index=5, - is_encoder=True, - summary_type="first", - summary_use_proj=True, - summary_activation=None, - summary_proj_to_labels=True, - summary_first_dropout=0.1, - start_n_top=5, - end_n_top=5, - mask_token_id=0, - lang_id=0, - pad_token_id=2, - bos_token_id=0, - eos_token_id=1, - tie_word_embeddings=True, - **kwargs, - ): - self.pre_norm = pre_norm - self.layerdrop = layerdrop - self.vocab_size = vocab_size - self.emb_dim = emb_dim - self.n_layers = n_layers - self.n_heads = n_heads - self.dropout = dropout - self.attention_dropout = attention_dropout - self.gelu_activation = gelu_activation - self.sinusoidal_embeddings = sinusoidal_embeddings - self.causal = causal - self.asm = asm - self.n_langs = n_langs - self.use_lang_emb = use_lang_emb - self.layer_norm_eps = layer_norm_eps - self.unk_index = unk_index - self.mask_index = mask_index - self.is_encoder = is_encoder - self.max_position_embeddings = max_position_embeddings - self.embed_init_std = embed_init_std - self.init_std = init_std - self.summary_type = summary_type - self.summary_use_proj = summary_use_proj - self.summary_activation = summary_activation - self.summary_proj_to_labels = summary_proj_to_labels - self.summary_first_dropout = summary_first_dropout - self.start_n_top = start_n_top - self.end_n_top = end_n_top - self.mask_token_id = mask_token_id - self.lang_id = lang_id - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - - if "n_words" in kwargs: - self.n_words = kwargs["n_words"] - - super().__init__(**kwargs) + pre_norm: bool = False + layerdrop: float | int = 0.0 + vocab_size: int = 30145 + emb_dim: int = 2048 + n_layers: int = 12 + n_heads: int = 16 + dropout: float | int = 0.1 + attention_dropout: float | int = 0.1 + gelu_activation: bool = True + sinusoidal_embeddings: bool = False + causal: bool = False + asm: bool = False + n_langs: int = 1 + use_lang_emb: bool = True + max_position_embeddings: int = 512 + embed_init_std: float = 2048**-0.5 + layer_norm_eps: float = 1e-12 + init_std: float = 0.02 + bos_index: int = 0 + eos_index: int = 1 + pad_index: int = 2 + unk_index: int = 3 + mask_index: int = 5 + is_encoder: bool = True + summary_type: str = "first" + summary_use_proj: bool = True + summary_activation: str | 
None = None + summary_proj_to_labels: bool = True + summary_first_dropout: float | int = 0.1 + start_n_top: int = 5 + end_n_top: int = 5 + mask_token_id: int = 0 + lang_id: int = 0 + pad_token_id: int | None = 2 + bos_token_id: int | None = 0 + eos_token_id: int | None = 1 + tie_word_embeddings: bool = True __all__ = ["FlaubertConfig"] diff --git a/src/transformers/models/flaubert/modeling_flaubert.py b/src/transformers/models/flaubert/modeling_flaubert.py index 2ed66c2e49a1..eaa9ecd9c980 100644 --- a/src/transformers/models/flaubert/modeling_flaubert.py +++ b/src/transformers/models/flaubert/modeling_flaubert.py @@ -812,7 +812,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict # removed: src_enc=None, src_len=None if input_ids is not None: @@ -1023,7 +1023,7 @@ def forward( `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict transformer_outputs = self.transformer( input_ids, @@ -1110,7 +1110,7 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict transformer_outputs = self.transformer( input_ids, @@ -1214,7 +1214,7 @@ def forward( labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.transformer( input_ids, @@ -1304,7 +1304,7 @@ def forward( Instance of `EncoderDecoderCache` that contains precomputed KV states. Can be used to speed up sequential decoding. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict transformer_outputs = self.transformer( input_ids, @@ -1468,7 +1468,7 @@ def forward( >>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions) >>> loss = outputs.loss ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict transformer_outputs = self.transformer( input_ids, @@ -1586,7 +1586,7 @@ def forward( num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. 
(See `input_ids` above) """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None diff --git a/src/transformers/models/flava/configuration_flava.py b/src/transformers/models/flava/configuration_flava.py index 1c64e83f7912..06594132182b 100644 --- a/src/transformers/models/flava/configuration_flava.py +++ b/src/transformers/models/flava/configuration_flava.py @@ -15,6 +15,8 @@ from typing import Any +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring, logging @@ -23,6 +25,7 @@ @auto_docstring(checkpoint="facebook/flava-full") +@strict(accept_kwargs=True) class FlavaImageConfig(PreTrainedConfig): r""" num_blocks_per_group (`int`, *optional*, defaults to 2): @@ -50,45 +53,25 @@ class FlavaImageConfig(PreTrainedConfig): model_type = "flava_image_model" base_config_key = "image_config" - def __init__( - self, - hidden_size: int = 768, - num_hidden_layers: int = 12, - num_attention_heads: int = 12, - intermediate_size: int = 3072, - hidden_act: int = "gelu", - hidden_dropout_prob: float = 0.0, - attention_probs_dropout_prob: float = 0.0, - initializer_range: float = 0.02, - layer_norm_eps: float = 1e-12, - image_size: int = 224, - patch_size: int = 16, - num_channels: int = 3, - qkv_bias: bool = True, - mask_token: bool = True, - vocab_size: int = 8192, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.qkv_bias = qkv_bias - self.mask_token = mask_token - self.vocab_size = vocab_size + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.0 + attention_probs_dropout_prob: float = 0.0 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + image_size: int | list[int] | tuple[int, int] = 224 + patch_size: int | list[int] | tuple[int, int] = 16 + num_channels: int = 3 + qkv_bias: bool = True + mask_token: bool = True + vocab_size: int = 8192 @auto_docstring(checkpoint="facebook/flava-full") +@strict(accept_kwargs=True) class FlavaTextConfig(PreTrainedConfig): r""" Example: @@ -109,43 +92,24 @@ class FlavaTextConfig(PreTrainedConfig): model_type = "flava_text_model" base_config_key = "text_config" - def __init__( - self, - vocab_size: int = 30522, - type_vocab_size: int = 2, - max_position_embeddings: int = 512, - hidden_size: int = 768, - num_hidden_layers: int = 12, - num_attention_heads: int = 12, - intermediate_size: int = 3072, - hidden_act: str = "gelu", - hidden_dropout_prob: float = 0.0, - attention_probs_dropout_prob: float = 0.0, - initializer_range: float = 0.02, - layer_norm_eps: float = 1e-12, - pad_token_id: int = 0, - qkv_bias: bool = True, - **kwargs, - ): - super().__init__(**kwargs) - - 
self.vocab_size = vocab_size - self.type_vocab_size = type_vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.qkv_bias = qkv_bias - self.pad_token_id = pad_token_id + vocab_size: int = 30522 + type_vocab_size: int = 2 + max_position_embeddings: int = 512 + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.0 + attention_probs_dropout_prob: float = 0.0 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + pad_token_id: int | None = 0 + qkv_bias: bool = True @auto_docstring(checkpoint="facebook/flava-full") +@strict(accept_kwargs=True) class FlavaMultimodalConfig(PreTrainedConfig): r""" use_cls_token (`bool`, *optional*, defaults to `True`): @@ -169,37 +133,21 @@ class FlavaMultimodalConfig(PreTrainedConfig): model_type = "flava_multimodal_model" base_config_key = "multimodal_config" - def __init__( - self, - hidden_size: int = 768, - num_hidden_layers: int = 6, - num_attention_heads: int = 12, - intermediate_size: int = 3072, - hidden_act: int = "gelu", - hidden_dropout_prob: int = 0.0, - attention_probs_dropout_prob: int = 0.0, - initializer_range: float = 0.02, - layer_norm_eps: float = 1e-12, - qkv_bias: bool = True, - use_cls_token: bool = True, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.qkv_bias = qkv_bias - self.use_cls_token = use_cls_token + hidden_size: int = 768 + num_hidden_layers: int = 6 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.0 + attention_probs_dropout_prob: float = 0.0 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + qkv_bias: bool = True + use_cls_token: bool = True @auto_docstring(checkpoint="facebook/flava-full") +@strict(accept_kwargs=True) class FlavaImageCodebookConfig(PreTrainedConfig): r""" num_groups (`int`, *optional*, defaults to 4): @@ -225,31 +173,17 @@ class FlavaImageCodebookConfig(PreTrainedConfig): ``` """ - model_type = "flava_image_codebook" - base_config_key = "image_codebook_config" - - def __init__( - self, - num_groups: int = 4, - input_channels: int = 3, - num_blocks_per_group: int = 2, - hidden_size: int = 256, - vocab_size: int = 8192, - freeze: int = True, - initializer_range: float = 0.02, - **kwargs, - ): - super().__init__(**kwargs) - self.num_groups = num_groups - self.input_channels = input_channels - self.num_blocks_per_group = num_blocks_per_group - self.hidden_size = hidden_size - self.vocab_size = vocab_size - self.freeze = freeze - self.initializer_range = initializer_range + num_groups: int = 4 + input_channels: int = 3 + num_blocks_per_group: int = 2 + hidden_size: int = 
256 + vocab_size: int = 8192 + freeze: bool = True + initializer_range: float = 0.02 @auto_docstring(checkpoint="facebook/flava-full") +@strict(accept_kwargs=True) class FlavaConfig(PreTrainedConfig): r""" image_config (`dict`, *optional*): @@ -310,34 +244,65 @@ class FlavaConfig(PreTrainedConfig): "image_codebook_config": FlavaImageCodebookConfig, } - def __init__( - self, - image_config: dict[str, Any] | None = None, - text_config: dict[str, Any] | None = None, - multimodal_config: dict[str, Any] | None = None, - image_codebook_config: dict[str, Any] | None = None, - hidden_size: int = 768, - layer_norm_eps: float = 1e-12, - projection_dim: int = 768, - init_codebook: bool = True, - logit_scale_init_value: float = 2.6592, - initializer_range: float = 0.02, - ce_ignore_index: int = -100, - mim_weight: float = 1.0, - mlm_weight: float = 1.0, - global_contrastive_weight: float = 1.0, - itm_weight: float = 1.0, - mmm_image_weight: float = 1.0, - mmm_text_weight: float = 1.0, - global_backprop_contrastive: bool = True, - skip_unmasked_multimodal_encoder: bool = True, - return_loss: bool = True, - tie_word_embeddings: bool | None = True, - **kwargs, - ): + image_config: dict[str, Any] | PreTrainedConfig | None = None + text_config: dict[str, Any] | PreTrainedConfig | None = None + multimodal_config: dict[str, Any] | PreTrainedConfig | None = None + image_codebook_config: dict[str, Any] | PreTrainedConfig | None = None + hidden_size: int = 768 + layer_norm_eps: float = 1e-12 + projection_dim: int = 768 + init_codebook: bool = True + logit_scale_init_value: float = 2.6592 + initializer_range: float = 0.02 + ce_ignore_index: int = -100 + mim_weight: float = 1.0 + mlm_weight: float = 1.0 + global_contrastive_weight: float = 1.0 + itm_weight: float = 1.0 + mmm_image_weight: float = 1.0 + mmm_text_weight: float = 1.0 + global_backprop_contrastive: bool = True + skip_unmasked_multimodal_encoder: bool = True + return_loss: bool = True + tie_word_embeddings: bool = True + initializer_factor: float = 1.0 + + def __post_init__(self, **kwargs): + if self.text_config is None: + text_config = {} + logger.info("`text_config` is `None`. Initializing the `FlavaTextConfig` with default values.") + elif isinstance(self.text_config, FlavaTextConfig): + text_config = self.text_config.to_dict() + else: + text_config = self.text_config + + if self.image_config is None: + image_config = {} + logger.info("`image_config` is `None`. initializing the `FlavaImageConfig` with default values.") + elif isinstance(self.image_config, FlavaImageConfig): + image_config = self.image_config.to_dict() + else: + image_config = self.image_config + + if self.multimodal_config is None: + multimodal_config = {} + logger.info("`multimodal_config` is `None`. Initializing the `FlavaMultimodalConfig` with default values.") + elif isinstance(self.multimodal_config, FlavaMultimodalConfig): + multimodal_config = self.multimodal_config.to_dict() + else: + multimodal_config = self.multimodal_config + + if self.image_codebook_config is None: + image_codebook_config = {} + logger.info( + "`image_codebook_config` is `None`. initializing the `FlavaImageCodebookConfig` with default values." + ) + elif isinstance(self.image_codebook_config, FlavaImageCodebookConfig): + image_codebook_config = self.image_codebook_config.to_dict() + else: + image_codebook_config = self.image_codebook_config + # If `_config_dict` exist, we use them for the backward compatibility. 
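The FLAVA conversion follows the same recipe as the configs earlier in this patch: every `__init__` parameter becomes a typed class-level field with its default, and logic that used to run inside `__init__` (turning `image_config`, `text_config`, etc. into sub-config objects) moves into `__post_init__`. The sketch below illustrates that idiom with plain standard-library dataclasses; the `Toy*` names are hypothetical, and the real classes additionally go through `PreTrainedConfig` and `huggingface_hub.dataclasses.strict`, which are not reproduced here.

from dataclasses import dataclass
from typing import Any

@dataclass
class ToySubConfig:
    hidden_size: int = 768
    num_hidden_layers: int = 12

@dataclass
class ToyCompositeConfig:
    # Sub-configs may arrive as None, a plain dict, or an already-built object.
    sub_config: ToySubConfig | dict[str, Any] | None = None
    projection_dim: int = 768

    def __post_init__(self) -> None:
        # Normalize once, after dataclass field assignment, loosely mirroring
        # what FlavaConfig.__post_init__ does for its four sub-configs.
        if self.sub_config is None:
            self.sub_config = ToySubConfig()
        elif isinstance(self.sub_config, dict):
            self.sub_config = ToySubConfig(**self.sub_config)

cfg = ToyCompositeConfig(sub_config={"hidden_size": 1024})
assert isinstance(cfg.sub_config, ToySubConfig) and cfg.sub_config.hidden_size == 1024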
- # We pop out these 2 attributes before calling `super().__init__` to avoid them being saved (which causes a lot - # of confusion!). text_config_dict = kwargs.pop("text_config_dict", None) image_config_dict = kwargs.pop("image_config_dict", None) multimodal_config_dict = kwargs.pop("multimodal_config_dict", None) @@ -347,9 +312,6 @@ def __init__( # `[text|vision]_config_dict` to update the values in `[text|vision]_config`. The values should be same in most # cases, but we don't want to break anything regarding `_config_dict` that existed before commit `8827e1b2`. if text_config_dict is not None: - if text_config is None: - text_config = {} - # This is the complete result when using `text_config_dict`. _text_config_dict = FlavaTextConfig(**text_config_dict).to_dict() @@ -374,9 +336,6 @@ def __init__( text_config.update(_text_config_dict) if image_config_dict is not None: - if image_config is None: - image_config = {} - # This is the complete result when using `image_config_dict`. _image_config_dict = FlavaImageConfig(**image_config_dict).to_dict() # convert keys to string instead of integer @@ -406,9 +365,6 @@ def __init__( image_config.update(_image_config_dict) if multimodal_config_dict is not None: - if multimodal_config is None: - multimodal_config = {} - # This is the complete result when using `multimodal_config_dict`. _multimodal_config_dict = FlavaMultimodalConfig(**multimodal_config_dict).to_dict() @@ -434,9 +390,6 @@ def __init__( multimodal_config.update(_multimodal_config_dict) if image_codebook_config_dict is not None: - if image_codebook_config is None: - image_codebook_config = {} - # This is the complete result when using `image_codebook_config_dict`. _image_codebook_config_dict = FlavaImageCodebookConfig(**image_codebook_config_dict).to_dict() @@ -466,55 +419,13 @@ def __init__( # Update all values in `image_codebook_config` with the ones in `_image_codebook_config_dict`. image_codebook_config.update(_image_codebook_config_dict) - if text_config is None: - text_config = FlavaTextConfig() - logger.info("`text_config` is `None`. initializing the `FlavaTextConfig` with default values.") - elif isinstance(text_config, dict): - text_config = FlavaTextConfig(**text_config) + # Finally we can convert back our unified text/vision configs to `PretrainedConfig` + self.text_config = FlavaTextConfig(**text_config) + self.image_config = FlavaImageConfig(**image_config) + self.multimodal_config = FlavaMultimodalConfig(**multimodal_config) + self.image_codebook_config = FlavaImageCodebookConfig(**image_codebook_config) - if image_config is None: - image_config = FlavaImageConfig() - logger.info("`image_config` is `None`. initializing the `FlavaImageConfig` with default values.") - elif isinstance(image_config, dict): - image_config = FlavaImageConfig(**image_config) - - if multimodal_config is None: - multimodal_config = FlavaMultimodalConfig() - logger.info("`image_config` is `None`. initializing the `FlavaMultimodalConfig` with default values.") - elif isinstance(multimodal_config, dict): - multimodal_config = FlavaMultimodalConfig(**multimodal_config) - - if image_codebook_config is None: - image_codebook_config = FlavaImageCodebookConfig() - logger.info("`image_config` is `None`. 
initializing the `FlavaImageCodebookConfig` with default values.") - elif isinstance(image_codebook_config, dict): - image_codebook_config = FlavaImageCodebookConfig(**image_codebook_config) - - self.text_config = text_config - self.image_config = image_config - self.multimodal_config = multimodal_config - self.image_codebook_config = image_codebook_config - - self.projection_dim = projection_dim - self.init_codebook = init_codebook - - self.hidden_size = hidden_size - self.layer_norm_eps = layer_norm_eps - self.initializer_range = initializer_range - self.logit_scale_init_value = logit_scale_init_value - self.initializer_factor = 1.0 - self.ce_ignore_index = ce_ignore_index - self.mim_weight = mim_weight - self.mlm_weight = mlm_weight - self.global_contrastive_weight = global_contrastive_weight - self.itm_weight = itm_weight - self.mmm_image_weight = mmm_image_weight - self.mmm_text_weight = mmm_text_weight - self.global_backprop_contrastive = global_backprop_contrastive - self.skip_unmasked_multimodal_encoder = skip_unmasked_multimodal_encoder - self.return_loss = return_loss - self.tie_word_embeddings = tie_word_embeddings - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["FlavaConfig", "FlavaImageCodebookConfig", "FlavaImageConfig", "FlavaMultimodalConfig", "FlavaTextConfig"] diff --git a/src/transformers/models/flava/modeling_flava.py b/src/transformers/models/flava/modeling_flava.py index 264528def22e..49e3b0f3f73c 100644 --- a/src/transformers/models/flava/modeling_flava.py +++ b/src/transformers/models/flava/modeling_flava.py @@ -337,7 +337,7 @@ class PatchEmbeddings(nn.Module): def __init__( self, - image_size: int = 224, + image_size: int | list[int] | tuple[int, int] = 224, patch_size: int | tuple[int, int] = 16, num_channels: int = 3, embed_dim: int = 768, @@ -738,7 +738,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -826,7 +826,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if input_ids is None: raise ValueError("You have to specify input_ids") @@ -912,7 +912,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict batch_size, seq_length, _ = hidden_states.size() @@ -1666,7 +1666,7 @@ def forward( >>> output = model(**inputs) ``` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict return_loss = return_loss if return_loss is not None else self.config.return_loss skip_unmasked_multimodal_encoder = ( diff --git a/src/transformers/models/flex_olmo/configuration_flex_olmo.py b/src/transformers/models/flex_olmo/configuration_flex_olmo.py index ba0e7ccc0b2f..c8ea40793292 100644 --- 
a/src/transformers/models/flex_olmo/configuration_flex_olmo.py +++ b/src/transformers/models/flex_olmo/configuration_flex_olmo.py @@ -19,12 +19,15 @@ # limitations under the License. +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring @auto_docstring(checkpoint="allenai/FlexOlmo-7x7B-1T") +@strict(accept_kwargs=True) class FlexOlmoConfig(PreTrainedConfig): r""" Example: @@ -61,63 +64,34 @@ class FlexOlmoConfig(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size: int | None = 100352, - hidden_size: int | None = 4096, - intermediate_size: int | None = 11008, - num_hidden_layers: int | None = 32, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = None, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 4096, - initializer_range: float | None = 0.02, - rms_norm_eps: float | None = 1e-06, - use_cache: bool | None = True, - pad_token_id: int | None = 100277, - bos_token_id: int | None = None, - eos_token_id: int | None = 100257, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - num_experts_per_tok: int | None = 5, - num_experts: int | None = 7, - output_router_logits: bool | None = False, - router_aux_loss_coef: float | None = 0.01, - norm_topk_prob: bool | None = False, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.num_experts_per_tok = num_experts_per_tok - self.num_experts = num_experts - self.output_router_logits = output_router_logits - self.router_aux_loss_coef = router_aux_loss_coef - self.norm_topk_prob = norm_topk_prob - self.rope_parameters = rope_parameters + vocab_size: int = 100352 + hidden_size: int = 4096 + intermediate_size: int = 11008 + num_hidden_layers: int = 32 + num_attention_heads: int = 32 + num_key_value_heads: int | None = None + hidden_act: str = "silu" + max_position_embeddings: int = 4096 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-06 + use_cache: bool = True + pad_token_id: int | None = 100277 + bos_token_id: int | None = None + eos_token_id: int | list[int] | None = 100257 + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + attention_dropout: float | None = 0.0 + num_experts_per_tok: int = 5 + num_experts: int = 7 + output_router_logits: bool = False + router_aux_loss_coef: float = 0.01 + norm_topk_prob: bool = False - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - super().__init__(**kwargs) + def __post_init__(self, **kwargs): + if self.num_key_value_heads is None: + 
self.num_key_value_heads = self.num_attention_heads + super().__post_init__(**kwargs) __all__ = ["FlexOlmoConfig"] diff --git a/src/transformers/models/flex_olmo/modular_flex_olmo.py b/src/transformers/models/flex_olmo/modular_flex_olmo.py index 92fc900d1748..477eb4112194 100644 --- a/src/transformers/models/flex_olmo/modular_flex_olmo.py +++ b/src/transformers/models/flex_olmo/modular_flex_olmo.py @@ -14,6 +14,7 @@ import torch +from huggingface_hub.dataclasses import strict from ...cache_utils import Cache, DynamicCache from ...configuration_utils import PreTrainedConfig @@ -36,6 +37,7 @@ @auto_docstring(checkpoint="allenai/FlexOlmo-7x7B-1T") +@strict(accept_kwargs=True) class FlexOlmoConfig(PreTrainedConfig): r""" Example: @@ -72,63 +74,34 @@ class FlexOlmoConfig(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size: int | None = 100352, - hidden_size: int | None = 4096, - intermediate_size: int | None = 11008, - num_hidden_layers: int | None = 32, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = None, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 4096, - initializer_range: float | None = 0.02, - rms_norm_eps: float | None = 1e-06, - use_cache: bool | None = True, - pad_token_id: int | None = 100277, - bos_token_id: int | None = None, - eos_token_id: int | None = 100257, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - num_experts_per_tok: int | None = 5, - num_experts: int | None = 7, - output_router_logits: bool | None = False, - router_aux_loss_coef: float | None = 0.01, - norm_topk_prob: bool | None = False, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.num_experts_per_tok = num_experts_per_tok - self.num_experts = num_experts - self.output_router_logits = output_router_logits - self.router_aux_loss_coef = router_aux_loss_coef - self.norm_topk_prob = norm_topk_prob - self.rope_parameters = rope_parameters - - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - super().__init__(**kwargs) + vocab_size: int = 100352 + hidden_size: int = 4096 + intermediate_size: int = 11008 + num_hidden_layers: int = 32 + num_attention_heads: int = 32 + num_key_value_heads: int | None = None + hidden_act: str = "silu" + max_position_embeddings: int = 4096 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-06 + use_cache: bool = True + pad_token_id: int | None = 100277 + bos_token_id: int | None = None + eos_token_id: int | list[int] | None = 100257 + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + attention_dropout: float | None = 0.0 + 
num_experts_per_tok: int = 5 + num_experts: int = 7 + output_router_logits: bool = False + router_aux_loss_coef: float = 0.01 + norm_topk_prob: bool = False + + def __post_init__(self, **kwargs): + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads + super().__post_init__(**kwargs) # FlexOlmo RMS norm reuses Olmo2 RMS norm, which handles low precision slightly differently than the original Olmoe. diff --git a/src/transformers/models/florence2/configuration_florence2.py b/src/transformers/models/florence2/configuration_florence2.py index c960942184a7..ea06027529c3 100644 --- a/src/transformers/models/florence2/configuration_florence2.py +++ b/src/transformers/models/florence2/configuration_florence2.py @@ -17,6 +17,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring, logging from ..auto import CONFIG_MAPPING, AutoConfig @@ -26,6 +28,7 @@ @auto_docstring(checkpoint="florence-community/Florence-2-base") +@strict(accept_kwargs=True) class Florence2VisionConfig(PreTrainedConfig): r""" window_size (`int`, *optional*, defaults to 12): @@ -60,51 +63,28 @@ class Florence2VisionConfig(PreTrainedConfig): model_type = "florence_vision" - def __init__( - self, - in_channels=3, - depths=(1, 1, 9, 1), - patch_size=(7, 3, 3, 3), - patch_stride=(4, 2, 2, 2), - patch_padding=(3, 1, 1, 1), - patch_prenorm=(False, True, True, True), - embed_dim=(128, 256, 512, 1024), - num_heads=(4, 8, 16, 32), - num_groups=(4, 8, 16, 32), - window_size=12, - drop_path_rate=0.1, - mlp_ratio=4.0, - qkv_bias=True, - activation_function="gelu", - projection_dim=1024, - max_temporal_embeddings=100, - max_position_embeddings=50, - initializer_range=0.02, - **kwargs, - ): - self.in_channels = in_channels - self.depths = list(depths) - self.patch_size = list(patch_size) - self.patch_stride = list(patch_stride) - self.patch_padding = list(patch_padding) - self.patch_prenorm = list(patch_prenorm) - self.embed_dim = list(embed_dim) - self.num_heads = list(num_heads) - self.num_groups = list(num_groups) - self.window_size = window_size - self.drop_path_rate = drop_path_rate - self.mlp_ratio = mlp_ratio - self.qkv_bias = qkv_bias - self.projection_dim = projection_dim - self.max_temporal_embeddings = max_temporal_embeddings - self.max_position_embeddings = max_position_embeddings - self.initializer_range = initializer_range - self.activation_function = activation_function - - super().__init__(**kwargs) + in_channels: int = 3 + depths: list[int] | tuple[int, ...] = (1, 1, 9, 1) + patch_size: list[int] | tuple[int, ...] = (7, 3, 3, 3) + patch_stride: list[int] | tuple[int, ...] = (4, 2, 2, 2) + patch_padding: list[int] | tuple[int, ...] = (3, 1, 1, 1) + patch_prenorm: list[bool] | tuple[bool, ...] = (False, True, True, True) + embed_dim: list[int] | tuple[int, ...] = (128, 256, 512, 1024) + num_heads: list[int] | tuple[int, ...] = (4, 8, 16, 32) + num_groups: list[int] | tuple[int, ...] 
= (4, 8, 16, 32) + window_size: int = 12 + drop_path_rate: float = 0.1 + mlp_ratio: float = 4.0 + qkv_bias: bool = True + activation_function: str = "gelu" + projection_dim: int = 1024 + max_temporal_embeddings: int = 100 + max_position_embeddings: int = 50 + initializer_range: float = 0.02 @auto_docstring(checkpoint="florence-community/Florence-2-base") +@strict(accept_kwargs=True) class Florence2Config(PreTrainedConfig): r""" Example: @@ -134,36 +114,26 @@ class Florence2Config(PreTrainedConfig): "vision_config": Florence2VisionConfig, } - def __init__( - self, - text_config=None, - vision_config=None, - image_token_id=51289, - is_encoder_decoder=True, - tie_word_embeddings=True, - **kwargs, - ): - if isinstance(text_config, dict): - text_config["model_type"] = text_config.get("model_type", "bart") - text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) - elif text_config is None: - text_config = CONFIG_MAPPING["bart"]() - - if isinstance(vision_config, dict): - vision_config = Florence2VisionConfig(**vision_config) - elif vision_config is None: + text_config: dict | PreTrainedConfig | None = None + vision_config: dict | PreTrainedConfig | None = None + image_token_id: int = 51289 + is_encoder_decoder: bool = True + tie_word_embeddings: bool = True + + def __post_init__(self, **kwargs): + if isinstance(self.text_config, dict): + self.text_config["model_type"] = self.text_config.get("model_type", "bart") + self.text_config = CONFIG_MAPPING[self.text_config["model_type"]](**self.text_config) + elif self.text_config is None: + self.text_config = CONFIG_MAPPING["bart"]() + + if isinstance(self.vision_config, dict): + self.vision_config = Florence2VisionConfig(**self.vision_config) + elif self.vision_config is None: logger.info("vision_config is None. Initializing the Florence2VisionConfig with default values.") - vision_config = Florence2VisionConfig() - - self.text_config = text_config - self.vision_config = vision_config - self.image_token_id = image_token_id - self.tie_word_embeddings = tie_word_embeddings + self.vision_config = Florence2VisionConfig() - super().__init__( - is_encoder_decoder=is_encoder_decoder, - **kwargs, - ) + super().__post_init__(**kwargs) __all__ = ["Florence2Config", "Florence2VisionConfig"] diff --git a/src/transformers/models/florence2/modular_florence2.py b/src/transformers/models/florence2/modular_florence2.py index 6aa1bcc6dfc5..334870730b53 100644 --- a/src/transformers/models/florence2/modular_florence2.py +++ b/src/transformers/models/florence2/modular_florence2.py @@ -20,6 +20,7 @@ import numpy as np import torch.nn as nn import torch.nn.functional as F +from huggingface_hub.dataclasses import strict from ... 
import initialization as init from ...activations import ACT2FN @@ -49,6 +50,7 @@ @auto_docstring(checkpoint="florence-community/Florence-2-base") +@strict(accept_kwargs=True) class Florence2VisionConfig(PreTrainedConfig): r""" window_size (`int`, *optional*, defaults to 12): @@ -83,51 +85,28 @@ class Florence2VisionConfig(PreTrainedConfig): model_type = "florence_vision" - def __init__( - self, - in_channels=3, - depths=(1, 1, 9, 1), - patch_size=(7, 3, 3, 3), - patch_stride=(4, 2, 2, 2), - patch_padding=(3, 1, 1, 1), - patch_prenorm=(False, True, True, True), - embed_dim=(128, 256, 512, 1024), - num_heads=(4, 8, 16, 32), - num_groups=(4, 8, 16, 32), - window_size=12, - drop_path_rate=0.1, - mlp_ratio=4.0, - qkv_bias=True, - activation_function="gelu", - projection_dim=1024, - max_temporal_embeddings=100, - max_position_embeddings=50, - initializer_range=0.02, - **kwargs, - ): - self.in_channels = in_channels - self.depths = list(depths) - self.patch_size = list(patch_size) - self.patch_stride = list(patch_stride) - self.patch_padding = list(patch_padding) - self.patch_prenorm = list(patch_prenorm) - self.embed_dim = list(embed_dim) - self.num_heads = list(num_heads) - self.num_groups = list(num_groups) - self.window_size = window_size - self.drop_path_rate = drop_path_rate - self.mlp_ratio = mlp_ratio - self.qkv_bias = qkv_bias - self.projection_dim = projection_dim - self.max_temporal_embeddings = max_temporal_embeddings - self.max_position_embeddings = max_position_embeddings - self.initializer_range = initializer_range - self.activation_function = activation_function - - super().__init__(**kwargs) + in_channels: int = 3 + depths: list[int] | tuple[int, ...] = (1, 1, 9, 1) + patch_size: list[int] | tuple[int, ...] = (7, 3, 3, 3) + patch_stride: list[int] | tuple[int, ...] = (4, 2, 2, 2) + patch_padding: list[int] | tuple[int, ...] = (3, 1, 1, 1) + patch_prenorm: list[bool] | tuple[bool, ...] = (False, True, True, True) + embed_dim: list[int] | tuple[int, ...] = (128, 256, 512, 1024) + num_heads: list[int] | tuple[int, ...] = (4, 8, 16, 32) + num_groups: list[int] | tuple[int, ...] 
= (4, 8, 16, 32) + window_size: int = 12 + drop_path_rate: float = 0.1 + mlp_ratio: float = 4.0 + qkv_bias: bool = True + activation_function: str = "gelu" + projection_dim: int = 1024 + max_temporal_embeddings: int = 100 + max_position_embeddings: int = 50 + initializer_range: float = 0.02 @auto_docstring(checkpoint="florence-community/Florence-2-base") +@strict(accept_kwargs=True) class Florence2Config(PreTrainedConfig): r""" Example: @@ -157,36 +136,26 @@ class Florence2Config(PreTrainedConfig): "vision_config": Florence2VisionConfig, } - def __init__( - self, - text_config=None, - vision_config=None, - image_token_id=51289, - is_encoder_decoder=True, - tie_word_embeddings=True, - **kwargs, - ): - if isinstance(text_config, dict): - text_config["model_type"] = text_config.get("model_type", "bart") - text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) - elif text_config is None: - text_config = CONFIG_MAPPING["bart"]() - - if isinstance(vision_config, dict): - vision_config = Florence2VisionConfig(**vision_config) - elif vision_config is None: + text_config: dict | PreTrainedConfig | None = None + vision_config: dict | PreTrainedConfig | None = None + image_token_id: int = 51289 + is_encoder_decoder: bool = True + tie_word_embeddings: bool = True + + def __post_init__(self, **kwargs): + if isinstance(self.text_config, dict): + self.text_config["model_type"] = self.text_config.get("model_type", "bart") + self.text_config = CONFIG_MAPPING[self.text_config["model_type"]](**self.text_config) + elif self.text_config is None: + self.text_config = CONFIG_MAPPING["bart"]() + + if isinstance(self.vision_config, dict): + self.vision_config = Florence2VisionConfig(**self.vision_config) + elif self.vision_config is None: logger.info("vision_config is None. Initializing the Florence2VisionConfig with default values.") - vision_config = Florence2VisionConfig() - - self.text_config = text_config - self.vision_config = vision_config - self.image_token_id = image_token_id - self.tie_word_embeddings = tie_word_embeddings + self.vision_config = Florence2VisionConfig() - super().__init__( - is_encoder_decoder=is_encoder_decoder, - **kwargs, - ) + super().__post_init__(**kwargs) class Florence2ProcessorKwargs(LlavaProcessorKwargs): diff --git a/src/transformers/models/fnet/configuration_fnet.py b/src/transformers/models/fnet/configuration_fnet.py index f81194414371..90586bc6f91c 100644 --- a/src/transformers/models/fnet/configuration_fnet.py +++ b/src/transformers/models/fnet/configuration_fnet.py @@ -13,14 +13,14 @@ # limitations under the License. 
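Florence2Config resolves its `text_config` through the `CONFIG_MAPPING` registry rather than a fixed class, so a dict can carry its own `model_type` and still be turned into the right sub-config inside `__post_init__`. Below is a minimal stand-in for that dispatch, using a toy registry and toy config classes instead of the real `CONFIG_MAPPING`; it is a sketch of the pattern, not the actual transformers machinery.

from dataclasses import dataclass
from typing import Any

@dataclass
class ToyBartConfig:
    d_model: int = 1024

@dataclass
class ToyT5Config:
    d_model: int = 512

# Hypothetical registry standing in for transformers' CONFIG_MAPPING.
TOY_CONFIG_MAPPING = {"bart": ToyBartConfig, "t5": ToyT5Config}

@dataclass
class ToyMultimodalConfig:
    text_config: dict[str, Any] | ToyBartConfig | ToyT5Config | None = None

    def __post_init__(self) -> None:
        if isinstance(self.text_config, dict):
            # Default to "bart" when the dict does not say otherwise, loosely
            # mirroring Florence2Config.__post_init__ above (the toy pops the
            # key only so the plain dataclasses here accept the remaining kwargs).
            model_type = self.text_config.pop("model_type", "bart")
            self.text_config = TOY_CONFIG_MAPPING[model_type](**self.text_config)
        elif self.text_config is None:
            self.text_config = TOY_CONFIG_MAPPING["bart"]()

cfg = ToyMultimodalConfig(text_config={"model_type": "t5", "d_model": 256})
assert isinstance(cfg.text_config, ToyT5Config) and cfg.text_config.d_model == 256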
"""FNet model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="google/fnet-base") +@strict(accept_kwargs=True) class FNetConfig(PreTrainedConfig): r""" use_tpu_fourier_optimizations (`bool`, *optional*, defaults to `False`): @@ -48,44 +48,22 @@ class FNetConfig(PreTrainedConfig): model_type = "fnet" - def __init__( - self, - vocab_size=32000, - hidden_size=768, - num_hidden_layers=12, - intermediate_size=3072, - hidden_act="gelu_new", - hidden_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=4, - initializer_range=0.02, - layer_norm_eps=1e-12, - use_tpu_fourier_optimizations=False, - tpu_short_seq_length=512, - pad_token_id=3, - bos_token_id=1, - eos_token_id=2, - tie_word_embeddings=True, - **kwargs, - ): - super().__init__(**kwargs) - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.initializer_range = initializer_range - self.type_vocab_size = type_vocab_size - self.layer_norm_eps = layer_norm_eps - self.use_tpu_fourier_optimizations = use_tpu_fourier_optimizations - self.tpu_short_seq_length = tpu_short_seq_length + vocab_size: int = 32000 + hidden_size: int = 768 + num_hidden_layers: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu_new" + hidden_dropout_prob: float = 0.1 + max_position_embeddings: int = 512 + type_vocab_size: int = 4 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + use_tpu_fourier_optimizations: bool = False + tpu_short_seq_length: int = 512 + pad_token_id: int | None = 3 + bos_token_id: int | None = 1 + eos_token_id: int | None = 2 + tie_word_embeddings: bool = True __all__ = ["FNetConfig"] diff --git a/src/transformers/models/fnet/modeling_fnet.py b/src/transformers/models/fnet/modeling_fnet.py index 2fe0823dd8fe..c5b08bd94050 100755 --- a/src/transformers/models/fnet/modeling_fnet.py +++ b/src/transformers/models/fnet/modeling_fnet.py @@ -448,7 +448,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -572,7 +572,7 @@ def forward( >>> prediction_logits = outputs.prediction_logits >>> seq_relationship_logits = outputs.seq_relationship_logits ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.fnet( input_ids, @@ -646,7 +646,7 @@ def forward( config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., 
config.vocab_size]`. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.fnet( input_ids, @@ -723,7 +723,7 @@ def forward( >>> assert logits[0, 0] < logits[0, 1] # next sentence was random ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.fnet( input_ids, @@ -790,7 +790,7 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.fnet( input_ids, @@ -888,7 +888,7 @@ def forward( num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above) """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None @@ -957,7 +957,7 @@ def forward( labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.fnet( input_ids, @@ -1012,7 +1012,7 @@ def forward( return_dict: bool | None = None, **kwargs, ) -> tuple | QuestionAnsweringModelOutput: - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.fnet( input_ids, diff --git a/src/transformers/models/focalnet/configuration_focalnet.py b/src/transformers/models/focalnet/configuration_focalnet.py index 799a5ad788fd..74c18910d4ba 100644 --- a/src/transformers/models/focalnet/configuration_focalnet.py +++ b/src/transformers/models/focalnet/configuration_focalnet.py @@ -13,15 +13,15 @@ # limitations under the License. 
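The modeling-side hunks in this patch are mechanical: every `forward()` keeps the same fallback idiom and only the config attribute it reads changes, from `use_return_dict` to `return_dict`. Schematically (a toy sketch of the idiom, not any specific model's forward):

from dataclasses import dataclass

@dataclass
class ToyConfig:
    return_dict: bool = True          # formerly exposed as `use_return_dict`
    output_attentions: bool = False
    output_hidden_states: bool = False

class ToyModel:
    def __init__(self, config: ToyConfig) -> None:
        self.config = config

    def forward(self, return_dict: bool | None = None) -> bool:
        # An explicit call-site argument wins; otherwise fall back to the config,
        # which is exactly the line rewritten throughout the modeling files above.
        return_dict = return_dict if return_dict is not None else self.config.return_dict
        return return_dict

model = ToyModel(ToyConfig(return_dict=False))
assert model.forward() is False                 # falls back to the config value
assert model.forward(return_dict=True) is True  # per-call override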
"""FocalNet model configuration""" +from huggingface_hub.dataclasses import strict + from ...backbone_utils import BackboneConfigMixin from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - - -logger = logging.get_logger(__name__) +from ...utils import auto_docstring @auto_docstring(checkpoint="microsoft/focalnet-tiny") +@strict(accept_kwargs=True) class FocalNetConfig(BackboneConfigMixin, PreTrainedConfig): r""" use_conv_embed (`bool`, *optional*, defaults to `False`): @@ -63,58 +63,36 @@ class FocalNetConfig(BackboneConfigMixin, PreTrainedConfig): model_type = "focalnet" - def __init__( - self, - image_size=224, - patch_size=4, - num_channels=3, - embed_dim=96, - use_conv_embed=False, - hidden_sizes=[192, 384, 768, 768], - depths=[2, 2, 6, 2], - focal_levels=[2, 2, 2, 2], - focal_windows=[3, 3, 3, 3], - hidden_act="gelu", - mlp_ratio=4.0, - hidden_dropout_prob=0.0, - drop_path_rate=0.1, - use_layerscale=False, - layerscale_value=1e-4, - use_post_layernorm=False, - use_post_layernorm_in_modulation=False, - normalize_modulator=False, - initializer_range=0.02, - layer_norm_eps=1e-5, - encoder_stride=32, - out_features=None, - out_indices=None, - **kwargs, - ): - super().__init__(**kwargs) + image_size: int | list[int] | tuple[int, int] = 224 + patch_size: int | list[int] | tuple[int, int] = 4 + num_channels: int = 3 + embed_dim: int = 96 + use_conv_embed: bool = False + hidden_sizes: list[int] | tuple[int, ...] = (192, 384, 768, 768) + depths: list[int] | tuple[int, ...] = (2, 2, 6, 2) + focal_levels: list[int] | tuple[int, ...] = (2, 2, 2, 2) + focal_windows: list[int] | tuple[int, ...] = (3, 3, 3, 3) + hidden_act: str = "gelu" + mlp_ratio: float = 4.0 + hidden_dropout_prob: float = 0.0 + drop_path_rate: float = 0.1 + use_layerscale: bool = False + layerscale_value: float = 1e-4 + use_post_layernorm: bool = False + use_post_layernorm_in_modulation: bool = False + normalize_modulator: bool = False + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-5 + encoder_stride: int = 32 + _out_features: list[str] | None = None + _out_indices: list[int] | None = None - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.embed_dim = embed_dim - self.use_conv_embed = use_conv_embed - self.hidden_sizes = hidden_sizes - self.depths = depths - self.focal_levels = focal_levels - self.focal_windows = focal_windows - self.hidden_act = hidden_act - self.mlp_ratio = mlp_ratio - self.hidden_dropout_prob = hidden_dropout_prob - self.drop_path_rate = drop_path_rate - self.use_layerscale = use_layerscale - self.layerscale_value = layerscale_value - self.use_post_layernorm = use_post_layernorm - self.use_post_layernorm_in_modulation = use_post_layernorm_in_modulation - self.normalize_modulator = normalize_modulator - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.encoder_stride = encoder_stride + def __post_init__(self, **kwargs): self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(self.depths) + 1)] - self.set_output_features_output_indices(out_indices=out_indices, out_features=out_features) + self.set_output_features_output_indices( + out_indices=kwargs.pop("out_indices", None), out_features=kwargs.pop("out_features", None) + ) + super().__post_init__(**kwargs) __all__ = ["FocalNetConfig"] diff --git a/src/transformers/models/focalnet/modeling_focalnet.py b/src/transformers/models/focalnet/modeling_focalnet.py index 42c388af5cc7..b4e03c8884d5 100644 
--- a/src/transformers/models/focalnet/modeling_focalnet.py +++ b/src/transformers/models/focalnet/modeling_focalnet.py @@ -636,7 +636,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -742,7 +742,7 @@ def forward( >>> list(reconstructed_pixel_values.shape) [1, 3, 192, 192] ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.focalnet( pixel_values, @@ -823,7 +823,7 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.focalnet( pixel_values, @@ -898,7 +898,7 @@ def forward( >>> inputs = processor(image, return_tensors="pt") >>> outputs = model(**inputs) ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) diff --git a/src/transformers/models/fsmt/configuration_fsmt.py b/src/transformers/models/fsmt/configuration_fsmt.py index 769e8fee9797..e889e65a5219 100644 --- a/src/transformers/models/fsmt/configuration_fsmt.py +++ b/src/transformers/models/fsmt/configuration_fsmt.py @@ -13,24 +13,14 @@ # limitations under the License. 
"""FSMT configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - - -logger = logging.get_logger(__name__) +from huggingface_hub.dataclasses import strict - -class DecoderConfig(PreTrainedConfig): - model_type = "fsmt_decoder" - - def __init__(self, vocab_size=0, bos_token_id=0, is_encoder_decoder=True, **kwargs): - super().__init__(**kwargs) - self.vocab_size = vocab_size - self.bos_token_id = bos_token_id - self.is_encoder_decoder = is_encoder_decoder +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="facebook/wmt19-en-ru") +@strict(accept_kwargs=True) class FSMTConfig(PreTrainedConfig): r""" langs (`list[str]`): @@ -75,85 +65,44 @@ class FSMTConfig(PreTrainedConfig): "num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model", "vocab_size": "tgt_vocab_size", + "num_hidden_layers": "encoder_layers", } - # update the defaults from config file - def __init__( - self, - langs=["en", "de"], - src_vocab_size=42024, - tgt_vocab_size=42024, - activation_function="relu", - d_model=1024, - max_length=200, - max_position_embeddings=1024, - encoder_ffn_dim=4096, - encoder_layers=12, - encoder_attention_heads=16, - encoder_layerdrop=0.0, - decoder_ffn_dim=4096, - decoder_layers=12, - decoder_attention_heads=16, - decoder_layerdrop=0.0, - attention_dropout=0.0, - dropout=0.1, - activation_dropout=0.0, - init_std=0.02, - decoder_start_token_id=2, - is_encoder_decoder=True, - scale_embedding=True, - tie_word_embeddings=False, - num_beams=5, - length_penalty=1.0, - early_stopping=False, - use_cache=True, - pad_token_id=1, - bos_token_id=0, - eos_token_id=2, - forced_eos_token_id=2, - **common_kwargs, - ): - self.langs = langs - self.src_vocab_size = src_vocab_size - self.tgt_vocab_size = tgt_vocab_size - self.d_model = d_model # encoder_embed_dim and decoder_embed_dim - - self.encoder_ffn_dim = encoder_ffn_dim - self.encoder_layers = self.num_hidden_layers = encoder_layers - self.encoder_attention_heads = encoder_attention_heads - self.encoder_layerdrop = encoder_layerdrop - self.decoder_layerdrop = decoder_layerdrop - self.decoder_ffn_dim = decoder_ffn_dim - self.decoder_layers = decoder_layers - self.decoder_attention_heads = decoder_attention_heads - self.max_position_embeddings = max_position_embeddings - self.init_std = init_std # Normal(0, this parameter) - self.activation_function = activation_function - - common_kwargs.pop("decoder", None) # delete unused kwargs - self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True - - # 3 Types of Dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.dropout = dropout - - self.use_cache = use_cache - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.decoder_start_token_id = decoder_start_token_id - self.tie_word_embeddings = tie_word_embeddings - - super().__init__( - is_encoder_decoder=is_encoder_decoder, - forced_eos_token_id=forced_eos_token_id, - max_length=max_length, - num_beams=num_beams, - length_penalty=length_penalty, - early_stopping=early_stopping, - **common_kwargs, - ) + langs: list[str] | tuple[str, ...] 
= ("en", "de") + src_vocab_size: int = 42024 + tgt_vocab_size: int = 42024 + activation_function: str = "relu" + d_model: int = 1024 + max_length: int = 200 + max_position_embeddings: int = 1024 + encoder_ffn_dim: int = 4096 + encoder_layers: int = 12 + encoder_attention_heads: int = 16 + encoder_layerdrop: float | int = 0.0 + decoder_ffn_dim: int = 4096 + decoder_layers: int = 12 + decoder_attention_heads: int = 16 + decoder_layerdrop: float | int = 0.0 + attention_dropout: float | int = 0.0 + dropout: float | int = 0.1 + activation_dropout: float | int = 0.0 + init_std: float = 0.02 + decoder_start_token_id: int | None = 2 + is_encoder_decoder: bool = True + scale_embedding: bool = True + tie_word_embeddings: bool = False + num_beams: int = 5 + length_penalty: float = 1.0 + early_stopping: bool = False + use_cache: bool = True + pad_token_id: int | None = 1 + bos_token_id: int | None = 0 + eos_token_id: int | None = 2 + forced_eos_token_id: int | None = 2 + + def __post_init__(self, **kwargs): + kwargs.pop("decoder", None) # delete unused kwargs + super().__post_init__(**kwargs) __all__ = ["FSMTConfig"] diff --git a/src/transformers/models/fsmt/modeling_fsmt.py b/src/transformers/models/fsmt/modeling_fsmt.py index 9572b8e0723b..8bbc0a592b1c 100644 --- a/src/transformers/models/fsmt/modeling_fsmt.py +++ b/src/transformers/models/fsmt/modeling_fsmt.py @@ -867,7 +867,7 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict # make masks if user doesn't supply if not use_cache and input_ids is not None: @@ -1017,7 +1017,7 @@ def forward( "Machine learning is great, isn't it?" ``` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if labels is not None: use_cache = False diff --git a/src/transformers/models/funnel/configuration_funnel.py b/src/transformers/models/funnel/configuration_funnel.py index faf365f8b0bc..195ec081f8b0 100644 --- a/src/transformers/models/funnel/configuration_funnel.py +++ b/src/transformers/models/funnel/configuration_funnel.py @@ -13,14 +13,14 @@ # limitations under the License. 
"""Funnel Transformer model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="funnel-transformer/small") +@strict(accept_kwargs=True) class FunnelConfig(PreTrainedConfig): r""" block_sizes (`list[int]`, *optional*, defaults to `[4, 4, 4]`): @@ -49,67 +49,49 @@ class FunnelConfig(PreTrainedConfig): "num_attention_heads": "n_head", } - def __init__( - self, - vocab_size=30522, - block_sizes=[4, 4, 4], - block_repeats=None, - num_decoder_layers=2, - d_model=768, - n_head=12, - d_head=64, - d_inner=3072, - hidden_act="gelu_new", - hidden_dropout=0.1, - attention_dropout=0.1, - activation_dropout=0.0, - initializer_range=0.1, - initializer_std=None, - layer_norm_eps=1e-9, - pooling_type="mean", - attention_type="relative_shift", - separate_cls=True, - truncate_seq=True, - pool_q_only=True, - pad_token_id=None, - tie_word_embeddings=True, - **kwargs, - ): - self.pad_token_id = pad_token_id - self.tie_word_embeddings = tie_word_embeddings - self.vocab_size = vocab_size - self.block_sizes = block_sizes - self.block_repeats = [1] * len(block_sizes) if block_repeats is None else block_repeats - assert len(block_sizes) == len(self.block_repeats), ( - "`block_sizes` and `block_repeats` should have the same length." - ) - self.num_decoder_layers = num_decoder_layers - self.d_model = d_model - self.n_head = n_head - self.d_head = d_head - self.d_inner = d_inner - self.hidden_act = hidden_act - self.hidden_dropout = hidden_dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.initializer_range = initializer_range - self.initializer_std = initializer_std - self.layer_norm_eps = layer_norm_eps - assert pooling_type in [ + vocab_size: int = 30522 + block_sizes: list[int] | tuple[int, ...] = (4, 4, 4) + block_repeats: list[int] | None = None + num_decoder_layers: int = 2 + d_model: int = 768 + n_head: int = 12 + d_head: int = 64 + d_inner: int = 3072 + hidden_act: str = "gelu_new" + hidden_dropout: float | int = 0.1 + attention_dropout: float | int = 0.1 + activation_dropout: float | int = 0.0 + initializer_range: float = 0.1 + initializer_std: float | None = None + layer_norm_eps: float = 1e-9 + pooling_type: str = "mean" + attention_type: str = "relative_shift" + separate_cls: bool = True + truncate_seq: bool = True + pool_q_only: bool = True + pad_token_id: int | None = None + tie_word_embeddings: bool = True + + def __post_init__(self, **kwargs): + self.block_repeats = [1] * len(self.block_sizes) if self.block_repeats is None else self.block_repeats + super().__post_init__(**kwargs) + + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + if len(self.block_sizes) != len(self.block_repeats): + raise ValueError("`block_sizes` and `block_repeats` should have the same length.") + if self.pooling_type not in [ "mean", "max", - ], f"Got {pooling_type} for `pooling_type` but only 'mean' and 'max' are supported." 
- self.pooling_type = pooling_type - assert attention_type in [ + ]: + raise ValueError(f"Got {self.pooling_type} for `pooling_type` but only 'mean' and 'max' are supported.") + if self.attention_type not in [ "relative_shift", "factorized", - ], f"Got {attention_type} for `attention_type` but only 'relative_shift' and 'factorized' are supported." - self.attention_type = attention_type - self.separate_cls = separate_cls - self.truncate_seq = truncate_seq - self.pool_q_only = pool_q_only - - super().__init__(**kwargs) + ]: + raise ValueError( + f"Got {self.attention_type} for `attention_type` but only 'relative_shift' and 'factorized' are supported." + ) @property def num_hidden_layers(self): diff --git a/src/transformers/models/funnel/modeling_funnel.py b/src/transformers/models/funnel/modeling_funnel.py index b973fc473b45..eb378ebf7b09 100644 --- a/src/transformers/models/funnel/modeling_funnel.py +++ b/src/transformers/models/funnel/modeling_funnel.py @@ -770,7 +770,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -837,7 +837,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -945,7 +945,7 @@ def forward( >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> logits = model(**inputs).logits ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict discriminator_hidden_states = self.funnel( input_ids, @@ -1021,7 +1021,7 @@ def forward( config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.funnel( input_ids, @@ -1089,7 +1089,7 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.funnel( input_ids, @@ -1169,7 +1169,7 @@ def forward( num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. 
(See `input_ids` above) """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None @@ -1243,7 +1243,7 @@ def forward( labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.funnel( input_ids, @@ -1302,7 +1302,7 @@ def forward( return_dict: bool | None = None, **kwargs, ) -> tuple | QuestionAnsweringModelOutput: - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.funnel( input_ids, diff --git a/src/transformers/models/fuyu/configuration_fuyu.py b/src/transformers/models/fuyu/configuration_fuyu.py index 8feffc824ab0..f61d14213a45 100644 --- a/src/transformers/models/fuyu/configuration_fuyu.py +++ b/src/transformers/models/fuyu/configuration_fuyu.py @@ -13,6 +13,8 @@ # limitations under the License. """Fuyu model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring, logging @@ -23,6 +25,7 @@ @auto_docstring(checkpoint="adept/fuyu-8b") +@strict(accept_kwargs=True) class FuyuConfig(PreTrainedConfig): r""" Example: @@ -39,82 +42,59 @@ class FuyuConfig(PreTrainedConfig): keys_to_ignore_at_inference = ["past_key_values"] default_theta = 25000.0 - def __init__( - self, - vocab_size: int | None = 262144, - hidden_size: int | None = 4096, - intermediate_size: int | None = 16384, - num_hidden_layers: int | None = 36, - num_attention_heads: int | None = 64, - hidden_act: str | None = "relu2", - max_position_embeddings: int | None = 16384, - image_size: int | None = 300, - patch_size: int | None = 30, - num_channels: int | None = 3, - initializer_range: float | None = 0.02, - layer_norm_eps: int | None = 1e-5, - use_cache: bool | None = True, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - qk_layernorm: bool | None = True, - hidden_dropout: float | None = 0.0, - attention_dropout: float | None = 0.0, - pad_token_id: int | None = None, - bos_token_id: int | None = 1, - eos_token_id: int | None = 2, - image_token_id: int | None = 71011, - text_config: dict | None = None, - **kwargs, - ): - if text_config is None: + vocab_size: int = 262144 + hidden_size: int = 4096 + intermediate_size: int = 16384 + num_hidden_layers: int = 36 + num_attention_heads: int = 64 + hidden_act: str = "relu2" + max_position_embeddings: int = 16384 + image_size: int | None = 300 + patch_size: int | None = 30 + num_channels: int | None = 3 + initializer_range: float = 0.02 + layer_norm_eps: float | None = 1e-5 + use_cache: bool = True + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + qk_layernorm: bool | None = True + hidden_dropout: float | int | None = 0.0 + attention_dropout: float | int | None = 0.0 + pad_token_id: int | None 
= None + bos_token_id: int | None = 1 + eos_token_id: int | list[int] | None = 2 + image_token_id: int | None = 71011 + text_config: dict | PreTrainedConfig | None = None + + def __post_init__(self, **kwargs): + if self.text_config is None: text_config = { - "vocab_size": vocab_size, - "max_position_embeddings": max_position_embeddings, - "hidden_size": hidden_size, - "intermediate_size": intermediate_size, - "num_hidden_layers": num_hidden_layers, - "num_attention_heads": num_attention_heads, - "hidden_act": hidden_act, - "initializer_range": initializer_range, - "layer_norm_eps": layer_norm_eps, - "use_cache": use_cache, - "rope_parameters": rope_parameters, - "qk_layernorm": qk_layernorm, - "hidden_dropout": hidden_dropout, - "attention_dropout": attention_dropout, - "pad_token_id": pad_token_id, - "bos_token_id": bos_token_id, - "eos_token_id": eos_token_id, + "vocab_size": self.vocab_size, + "max_position_embeddings": self.max_position_embeddings, + "hidden_size": self.hidden_size, + "intermediate_size": self.intermediate_size, + "num_hidden_layers": self.num_hidden_layers, + "num_attention_heads": self.num_attention_heads, + "hidden_act": self.hidden_act, + "initializer_range": self.initializer_range, + "layer_norm_eps": self.layer_norm_eps, + "use_cache": self.use_cache, + "rope_parameters": self.rope_parameters, + "qk_layernorm": self.qk_layernorm, + "hidden_dropout": self.hidden_dropout, + "attention_dropout": self.attention_dropout, + "pad_token_id": self.pad_token_id, + "bos_token_id": self.bos_token_id, + "eos_token_id": self.eos_token_id, } logger.info("text_config is None. initializing the text model with default values.") - text_model_type = text_config.get("model_type", "persimmon") - self.text_config = CONFIG_MAPPING[text_model_type](**text_config) + self.text_config = CONFIG_MAPPING["persimmon"](**text_config) + elif isinstance(self.text_config, dict): + text_model_type = self.text_config.get("model_type", "persimmon") + self.text_config = CONFIG_MAPPING[text_model_type](**self.text_config) - self._vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.use_cache = use_cache - self.qk_layernorm = qk_layernorm - self.hidden_dropout = hidden_dropout - self.attention_dropout = attention_dropout - self.image_token_id = image_token_id - self.rope_parameters = rope_parameters kwargs.setdefault("partial_rotary_factor", 0.5) # assign default for BC - - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["FuyuConfig"] diff --git a/src/transformers/models/gemma/configuration_gemma.py b/src/transformers/models/gemma/configuration_gemma.py index c42c908b09fd..001b1f108ea7 100644 --- a/src/transformers/models/gemma/configuration_gemma.py +++ b/src/transformers/models/gemma/configuration_gemma.py @@ -18,12 +18,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ + +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring @auto_docstring(checkpoint="google/gemma-7b") +@strict(accept_kwargs=True) class GemmaConfig(PreTrainedConfig): r""" use_bidirectional_attention (`bool`, *optional*): @@ -56,52 +61,26 @@ class GemmaConfig(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size: int | None = 256000, - hidden_size: int | None = 3072, - intermediate_size: int | None = 24576, - num_hidden_layers: int | None = 28, - num_attention_heads: int | None = 16, - num_key_value_heads: int | None = 16, - head_dim: int | None = 256, - hidden_act: str | None = "gelu_pytorch_tanh", - max_position_embeddings: int | None = 8192, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-6, - use_cache: bool | None = True, - pad_token_id: int | None = 0, - eos_token_id: int | None = 1, - bos_token_id: int | None = 2, - tie_word_embeddings: bool | None = True, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - use_bidirectional_attention: bool | None = None, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.head_dim = head_dim - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.use_bidirectional_attention = use_bidirectional_attention - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - self.rope_parameters = rope_parameters - - super().__init__(**kwargs) + vocab_size: int = 256000 + hidden_size: int = 3072 + intermediate_size: int = 24576 + num_hidden_layers: int = 28 + num_attention_heads: int = 16 + num_key_value_heads: int = 16 + head_dim: int = 256 + hidden_act: str = "gelu_pytorch_tanh" + max_position_embeddings: int = 8192 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-6 + use_cache: bool = True + pad_token_id: int | None = 0 + eos_token_id: int | list[int] | None = 1 + bos_token_id: int | None = 2 + tie_word_embeddings: bool = True + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + attention_dropout: float | int = 0.0 + use_bidirectional_attention: bool | None = None __all__ = ["GemmaConfig"] diff --git a/src/transformers/models/gemma/modeling_gemma.py b/src/transformers/models/gemma/modeling_gemma.py index cb849150fc62..c6c5a55b8790 100644 --- a/src/transformers/models/gemma/modeling_gemma.py +++ b/src/transformers/models/gemma/modeling_gemma.py @@ -19,6 +19,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+ from collections.abc import Callable from typing import Optional diff --git a/src/transformers/models/gemma/modular_gemma.py b/src/transformers/models/gemma/modular_gemma.py index 1a3529d69ae6..06b7d0709825 100644 --- a/src/transformers/models/gemma/modular_gemma.py +++ b/src/transformers/models/gemma/modular_gemma.py @@ -12,9 +12,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import TYPE_CHECKING + import torch +from huggingface_hub.dataclasses import strict from torch import nn from ... import initialization as init @@ -38,18 +39,15 @@ ) -if TYPE_CHECKING: - pass - VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"} SPIECE_UNDERLINE = "▁" - logger = logging.get_logger(__name__) @auto_docstring(checkpoint="google/gemma-7b") +@strict(accept_kwargs=True) class GemmaConfig(PreTrainedConfig): r""" use_bidirectional_attention (`bool`, *optional*): @@ -82,52 +80,26 @@ class GemmaConfig(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size: int | None = 256000, - hidden_size: int | None = 3072, - intermediate_size: int | None = 24576, - num_hidden_layers: int | None = 28, - num_attention_heads: int | None = 16, - num_key_value_heads: int | None = 16, - head_dim: int | None = 256, - hidden_act: str | None = "gelu_pytorch_tanh", - max_position_embeddings: int | None = 8192, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-6, - use_cache: bool | None = True, - pad_token_id: int | None = 0, - eos_token_id: int | None = 1, - bos_token_id: int | None = 2, - tie_word_embeddings: bool | None = True, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - use_bidirectional_attention: bool | None = None, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.head_dim = head_dim - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.use_bidirectional_attention = use_bidirectional_attention - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - self.rope_parameters = rope_parameters - - super().__init__(**kwargs) + vocab_size: int = 256000 + hidden_size: int = 3072 + intermediate_size: int = 24576 + num_hidden_layers: int = 28 + num_attention_heads: int = 16 + num_key_value_heads: int = 16 + head_dim: int = 256 + hidden_act: str = "gelu_pytorch_tanh" + max_position_embeddings: int = 8192 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-6 + use_cache: bool = True + pad_token_id: int | None = 0 + eos_token_id: int | list[int] | None = 1 + bos_token_id: int | None = 2 + tie_word_embeddings: bool = True + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + attention_dropout: float | int = 0.0 + use_bidirectional_attention: bool | None = None class GemmaTextScaledWordEmbedding(nn.Embedding): 
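For orientation while reading the remaining hunks: every configuration touched by this diff is converted to the same shape, so a minimal sketch of that target pattern is shown below. `ToyConfig`, its fields, and the example values are hypothetical and exist only for illustration; the sketch assumes the `strict` decorator imported from `huggingface_hub.dataclasses` and the `__post_init__` / `validate_architecture` hooks that this patch introduces on `PreTrainedConfig`, and it is not guaranteed to run against any released version of either library.

```python
from huggingface_hub.dataclasses import strict

from transformers.configuration_utils import PreTrainedConfig


@strict(accept_kwargs=True)
class ToyConfig(PreTrainedConfig):
    model_type = "toy"

    # Typed class-level fields replace the hand-written __init__ arguments.
    hidden_size: int = 64
    num_attention_heads: int = 4
    layer_types: list[str] | None = None

    def __post_init__(self, **kwargs):
        # Derived defaults that used to be computed in __init__ move here.
        if self.layer_types is None:
            self.layer_types = ["full_attention"] * 2
        super().__post_init__(**kwargs)

    def validate_architecture(self):
        """Part of `@strict`-powered validation, mirroring the checks added in this diff."""
        if self.hidden_size % self.num_attention_heads != 0:
            raise ValueError("hidden_size must be a multiple of num_attention_heads")


# accept_kwargs=True keeps unknown checkpoint keys tolerated, as before.
config = ToyConfig(hidden_size=128, some_legacy_checkpoint_flag=True)
```

In short: per-argument `self.x = x` assignments become typed dataclass fields, the remaining `__init__` body becomes `__post_init__`, and the former inline `assert`s and ad-hoc checks move into `validate_architecture`.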
diff --git a/src/transformers/models/gemma2/configuration_gemma2.py b/src/transformers/models/gemma2/configuration_gemma2.py index 059e659fe6e8..93713d6df477 100644 --- a/src/transformers/models/gemma2/configuration_gemma2.py +++ b/src/transformers/models/gemma2/configuration_gemma2.py @@ -18,12 +18,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from ...configuration_utils import PreTrainedConfig, layer_type_validation +from huggingface_hub.dataclasses import strict + +from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring @auto_docstring(checkpoint="google/gemma2-7b") +@strict(accept_kwargs=True) class Gemma2Config(PreTrainedConfig): r""" query_pre_attn_scalar (`float`, *optional*, defaults to 256): @@ -62,69 +65,47 @@ class Gemma2Config(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size: int | None = 256000, - hidden_size: int | None = 2304, - intermediate_size: int | None = 9216, - num_hidden_layers: int | None = 26, - num_attention_heads: int | None = 8, - num_key_value_heads: int | None = 4, - head_dim: int | None = 256, - hidden_activation: str | None = "gelu_pytorch_tanh", - max_position_embeddings: int | None = 8192, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-6, - use_cache: bool | None = True, - pad_token_id: int | None = 0, - eos_token_id: int | None = 1, - bos_token_id: int | None = 2, - tie_word_embeddings: bool | None = True, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - query_pre_attn_scalar: int | None = 256, - sliding_window: int | None = 4096, - layer_types: list[str] | None = None, - final_logit_softcapping: float | None = 30.0, - attn_logit_softcapping: float | None = 50.0, - use_bidirectional_attention: bool | None = None, - **kwargs, - ): - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.head_dim = head_dim - self.num_key_value_heads = num_key_value_heads - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.hidden_activation = hidden_activation - self.query_pre_attn_scalar = query_pre_attn_scalar - self.sliding_window = sliding_window - self.final_logit_softcapping = final_logit_softcapping - self.attn_logit_softcapping = attn_logit_softcapping - self.layer_types = layer_types - self.use_bidirectional_attention = use_bidirectional_attention + vocab_size: int = 256000 + hidden_size: int = 2304 + intermediate_size: int = 9216 + num_hidden_layers: int = 26 + num_attention_heads: int = 8 + num_key_value_heads: int = 4 + head_dim: int = 256 + hidden_activation: str = "gelu_pytorch_tanh" + max_position_embeddings: int = 8192 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-6 + use_cache: bool = True + pad_token_id: int | None = 0 + 
eos_token_id: int | list[int] | None = 1 + bos_token_id: int | None = 2 + tie_word_embeddings: bool = True + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + attention_dropout: int | float | None = 0.0 + query_pre_attn_scalar: int = 256 + sliding_window: int | None = 4096 + layer_types: list[str] | None = None + final_logit_softcapping: float | None = 30.0 + attn_logit_softcapping: float | None = 50.0 + use_bidirectional_attention: bool | None = None + def __post_init__(self, **kwargs): if self.layer_types is None: self.layer_types = [ "sliding_attention" if bool((i + 1) % 2) else "full_attention" for i in range(self.num_hidden_layers) ] - layer_type_validation(self.layer_types, self.num_hidden_layers) - self.rope_parameters = rope_parameters + super().__post_init__(**kwargs) - super().__init__(**kwargs) + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + if self.hidden_size % self.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({self.hidden_size}) is not a multiple of the number of attention " + f"heads ({self.num_attention_heads})." + ) __all__ = ["Gemma2Config"] diff --git a/src/transformers/models/gemma2/modeling_gemma2.py b/src/transformers/models/gemma2/modeling_gemma2.py index 63ee2874a4a4..5b68f113aa51 100644 --- a/src/transformers/models/gemma2/modeling_gemma2.py +++ b/src/transformers/models/gemma2/modeling_gemma2.py @@ -198,7 +198,7 @@ def eager_attention_forward( key: torch.Tensor, value: torch.Tensor, attention_mask: torch.Tensor | None, - dropout: float = 0.0, + dropout: float | int = 0.0, scaling: float | None = None, softcap: float | None = None, **kwargs, diff --git a/src/transformers/models/gemma2/modular_gemma2.py b/src/transformers/models/gemma2/modular_gemma2.py index a6c1c4e758c5..73ae5ba66208 100644 --- a/src/transformers/models/gemma2/modular_gemma2.py +++ b/src/transformers/models/gemma2/modular_gemma2.py @@ -16,10 +16,11 @@ import torch import torch.nn as nn +from huggingface_hub.dataclasses import strict from ...activations import ACT2FN from ...cache_utils import Cache, DynamicCache -from ...configuration_utils import PreTrainedConfig, layer_type_validation +from ...configuration_utils import PreTrainedConfig from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_layers import GradientCheckpointingLayer @@ -52,6 +53,7 @@ @auto_docstring(checkpoint="google/gemma2-7b") +@strict(accept_kwargs=True) class Gemma2Config(PreTrainedConfig): r""" query_pre_attn_scalar (`float`, *optional*, defaults to 256): @@ -90,69 +92,47 @@ class Gemma2Config(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size: int | None = 256000, - hidden_size: int | None = 2304, - intermediate_size: int | None = 9216, - num_hidden_layers: int | None = 26, - num_attention_heads: int | None = 8, - num_key_value_heads: int | None = 4, - head_dim: int | None = 256, - hidden_activation: str | None = "gelu_pytorch_tanh", - max_position_embeddings: int | None = 8192, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-6, - use_cache: bool | None = True, - pad_token_id: int | None = 0, - eos_token_id: int | None = 1, - bos_token_id: int | None = 2, - tie_word_embeddings: bool | None = True, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None 
= False, - attention_dropout: float | None = 0.0, - query_pre_attn_scalar: int | None = 256, - sliding_window: int | None = 4096, - layer_types: list[str] | None = None, - final_logit_softcapping: float | None = 30.0, - attn_logit_softcapping: float | None = 50.0, - use_bidirectional_attention: bool | None = None, - **kwargs, - ): - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.head_dim = head_dim - self.num_key_value_heads = num_key_value_heads - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.hidden_activation = hidden_activation - self.query_pre_attn_scalar = query_pre_attn_scalar - self.sliding_window = sliding_window - self.final_logit_softcapping = final_logit_softcapping - self.attn_logit_softcapping = attn_logit_softcapping - self.layer_types = layer_types - self.use_bidirectional_attention = use_bidirectional_attention - + vocab_size: int = 256000 + hidden_size: int = 2304 + intermediate_size: int = 9216 + num_hidden_layers: int = 26 + num_attention_heads: int = 8 + num_key_value_heads: int = 4 + head_dim: int = 256 + hidden_activation: str = "gelu_pytorch_tanh" + max_position_embeddings: int = 8192 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-6 + use_cache: bool = True + pad_token_id: int | None = 0 + eos_token_id: int | list[int] | None = 1 + bos_token_id: int | None = 2 + tie_word_embeddings: bool = True + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + attention_dropout: int | float | None = 0.0 + query_pre_attn_scalar: int = 256 + sliding_window: int | None = 4096 + layer_types: list[str] | None = None + final_logit_softcapping: float | None = 30.0 + attn_logit_softcapping: float | None = 50.0 + use_bidirectional_attention: bool | None = None + + def __post_init__(self, **kwargs): if self.layer_types is None: self.layer_types = [ "sliding_attention" if bool((i + 1) % 2) else "full_attention" for i in range(self.num_hidden_layers) ] - layer_type_validation(self.layer_types, self.num_hidden_layers) - self.rope_parameters = rope_parameters + super().__post_init__(**kwargs) - super().__init__(**kwargs) + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + if self.hidden_size % self.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({self.hidden_size}) is not a multiple of the number of attention " + f"heads ({self.num_attention_heads})." 
+ ) class Gemma2RMSNorm(GemmaRMSNorm): @@ -204,7 +184,7 @@ def eager_attention_forward( key: torch.Tensor, value: torch.Tensor, attention_mask: torch.Tensor | None, - dropout: float = 0.0, + dropout: float | int = 0.0, scaling: float | None = None, softcap: float | None = None, **kwargs, diff --git a/src/transformers/models/gemma3/configuration_gemma3.py b/src/transformers/models/gemma3/configuration_gemma3.py index 6fc37c42c156..8ad43ed6cf5c 100644 --- a/src/transformers/models/gemma3/configuration_gemma3.py +++ b/src/transformers/models/gemma3/configuration_gemma3.py @@ -18,10 +18,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Any, Literal +from typing import Any -from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters +from huggingface_hub.dataclasses import strict + +from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring, logging from ..siglip import SiglipVisionConfig @@ -30,6 +31,7 @@ @auto_docstring(checkpoint="google/gemma-3-4b-it") +@strict(accept_kwargs=True) class Gemma3TextConfig(PreTrainedConfig): r""" final_logit_softcapping (`float`, *optional*): @@ -71,63 +73,36 @@ class Gemma3TextConfig(PreTrainedConfig): "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), "norm": (["hidden_states"], ["hidden_states"]), } + + vocab_size: int = 262_208 + hidden_size: int = 2304 + intermediate_size: int = 9216 + num_hidden_layers: int = 26 + num_attention_heads: int = 8 + num_key_value_heads: int = 4 + head_dim: int = 256 + hidden_activation: str = "gelu_pytorch_tanh" + max_position_embeddings: int = 131_072 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-6 + use_cache: bool = True + pad_token_id: int | None = 0 + eos_token_id: int | list[int] | None = 1 + bos_token_id: int | None = 2 + tie_word_embeddings: bool = True + rope_parameters: dict | None = None + attention_bias: bool = False + attention_dropout: int | float | None = 0.0 + query_pre_attn_scalar: int = 256 + sliding_window: int | None = 4096 + layer_types: list[str] | None = None + final_logit_softcapping: float | None = None + attn_logit_softcapping: float | None = None + use_bidirectional_attention: bool | None = False default_theta = {"global": 1_000_000.0, "local": 10_000.0} - def __init__( - self, - vocab_size: int | None = 262_208, - hidden_size: int | None = 2304, - intermediate_size: int | None = 9216, - num_hidden_layers: int | None = 26, - num_attention_heads: int | None = 8, - num_key_value_heads: int | None = 4, - head_dim: int | None = 256, - hidden_activation: str | None = "gelu_pytorch_tanh", - max_position_embeddings: int | None = 131_072, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-6, - use_cache: bool | None = True, - pad_token_id: int | None = 0, - eos_token_id: int | None = 1, - bos_token_id: int | None = 2, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - query_pre_attn_scalar: int | None = 256, - sliding_window: int | None = 4096, - layer_types: list[str] | None = None, - final_logit_softcapping: float | None = None, - attn_logit_softcapping: float | None = None, - rope_parameters: dict[Literal["full_attention", "sliding_attention"], RopeParameters] | None = None, - use_bidirectional_attention: bool | None = False, - tie_word_embeddings: bool | None = True, - 
**kwargs, - ): - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.head_dim = head_dim - self.num_key_value_heads = num_key_value_heads - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.hidden_activation = hidden_activation - self.query_pre_attn_scalar = query_pre_attn_scalar - self.sliding_window = sliding_window - self.final_logit_softcapping = final_logit_softcapping - self.attn_logit_softcapping = attn_logit_softcapping - self.layer_types = layer_types - - self.use_bidirectional_attention = use_bidirectional_attention - if use_bidirectional_attention: + def __post_init__(self, **kwargs): + if self.use_bidirectional_attention: self.sliding_window = (self.sliding_window // 2) + 1 # due to fa we set exclusive bounds # BC -> the pattern used to be a simple int, and it's still present in configs on the Hub @@ -138,12 +113,18 @@ def __init__( "sliding_attention" if bool((i + 1) % self._sliding_window_pattern) else "full_attention" for i in range(self.num_hidden_layers) ] - layer_type_validation(self.layer_types, self.num_hidden_layers) - self.rope_parameters = rope_parameters - super().__init__(**kwargs) + super().__post_init__(**kwargs) - def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation=None, **kwargs): + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + if self.hidden_size % self.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({self.hidden_size}) is not a multiple of the number of attention " + f"heads ({self.num_attention_heads})." + ) + + def convert_rope_params_to_dict(self, **kwargs): rope_scaling = kwargs.pop("rope_scaling", None) # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. 
If we find `rope_parameters` @@ -170,11 +151,11 @@ def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation=None, **kwa # Standardize and validate the correctness of rotary position embeddings parameters self.standardize_rope_params() - self.validate_rope(ignore_keys=ignore_keys_at_rope_validation) return kwargs @auto_docstring(checkpoint="google/gemma-3-4b-it") +@strict(accept_kwargs=True) class Gemma3Config(PreTrainedConfig): r""" mm_tokens_per_image (`int`, *optional*, defaults to 256): @@ -216,40 +197,29 @@ class Gemma3Config(PreTrainedConfig): "vision_config": SiglipVisionConfig, } - def __init__( - self, - text_config: Gemma3TextConfig | dict[str, Any] | None = None, - vision_config: SiglipVisionConfig | dict[str, Any] | None = None, - mm_tokens_per_image: int | None = 256, - boi_token_index: int | None = 255_999, - eoi_token_index: int | None = 256_000, - image_token_index: int | None = 262_144, - initializer_range: float | None = 0.02, - tie_word_embeddings: bool | None = True, - **kwargs, - ): - if text_config is None: - text_config = Gemma3TextConfig() + text_config: Gemma3TextConfig | dict[str, Any] | None = None + vision_config: SiglipVisionConfig | dict[str, Any] | None = None + mm_tokens_per_image: int | None = 256 + boi_token_index: int | None = 255_999 + eoi_token_index: int | None = 256_000 + image_token_index: int | None = 262_144 + initializer_range: float | None = 0.02 + tie_word_embeddings: bool | None = True + + def __post_init__(self, **kwargs): + if self.text_config is None: + self.text_config = Gemma3TextConfig() logger.info("text_config is None, using default Gemma3TextConfig text config.") - elif isinstance(text_config, dict): - text_config = Gemma3TextConfig(**text_config) + elif isinstance(self.text_config, dict): + self.text_config = Gemma3TextConfig(**self.text_config) - if isinstance(vision_config, dict): - vision_config = SiglipVisionConfig(**vision_config) - elif vision_config is None: - vision_config = SiglipVisionConfig() + if isinstance(self.vision_config, dict): + self.vision_config = SiglipVisionConfig(**self.vision_config) + elif self.vision_config is None: + self.vision_config = SiglipVisionConfig() logger.info("vision_config is None, using default SiglipVisionConfig vision config.") - self.text_config = text_config - self.vision_config = vision_config - self.mm_tokens_per_image = mm_tokens_per_image - self.boi_token_index = boi_token_index - self.eoi_token_index = eoi_token_index - self.image_token_index = image_token_index - self.initializer_range = initializer_range - self.tie_word_embeddings = tie_word_embeddings - - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["Gemma3Config", "Gemma3TextConfig"] diff --git a/src/transformers/models/gemma3/modeling_gemma3.py b/src/transformers/models/gemma3/modeling_gemma3.py index a83e3332c1fb..aa1473c69a57 100644 --- a/src/transformers/models/gemma3/modeling_gemma3.py +++ b/src/transformers/models/gemma3/modeling_gemma3.py @@ -281,7 +281,7 @@ def eager_attention_forward( key: torch.Tensor, value: torch.Tensor, attention_mask: torch.Tensor | None, - dropout: float = 0.0, + dropout: float | int = 0.0, scaling: float | None = None, softcap: float | None = None, **kwargs, diff --git a/src/transformers/models/gemma3/modular_gemma3.py b/src/transformers/models/gemma3/modular_gemma3.py index 3ffcd97373cd..f01ea9c33f68 100644 --- a/src/transformers/models/gemma3/modular_gemma3.py +++ b/src/transformers/models/gemma3/modular_gemma3.py @@ -13,20 +13,20 @@ # See the License for the 
specific language governing permissions and # limitations under the License. from collections.abc import Callable -from typing import Any, Literal, Optional +from typing import Any, Optional import torch import torch.nn as nn +from huggingface_hub.dataclasses import strict from ... import initialization as init from ...cache_utils import Cache, DynamicCache -from ...configuration_utils import PreTrainedConfig, layer_type_validation +from ...configuration_utils import PreTrainedConfig from ...masking_utils import create_causal_mask, create_masks_for_generate, create_sliding_window_causal_mask from ...modeling_layers import GenericForSequenceClassification, GradientCheckpointingLayer from ...modeling_outputs import BaseModelOutputWithPast, BaseModelOutputWithPooling, SequenceClassifierOutputWithPast from ...modeling_rope_utils import ( ROPE_INIT_FUNCTIONS, - RopeParameters, dynamic_rope_update, ) from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel @@ -60,6 +60,7 @@ @auto_docstring(checkpoint="google/gemma-3-4b-it") +@strict(accept_kwargs=True) class Gemma3TextConfig(Gemma2Config, PreTrainedConfig): r""" final_logit_softcapping (`float`, *optional*): @@ -97,61 +98,16 @@ class Gemma3TextConfig(Gemma2Config, PreTrainedConfig): } default_theta = {"global": 1_000_000.0, "local": 10_000.0} - def __init__( - self, - vocab_size: int | None = 262_208, - hidden_size: int | None = 2304, - intermediate_size: int | None = 9216, - num_hidden_layers: int | None = 26, - num_attention_heads: int | None = 8, - num_key_value_heads: int | None = 4, - head_dim: int | None = 256, - hidden_activation: str | None = "gelu_pytorch_tanh", - max_position_embeddings: int | None = 131_072, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-6, - use_cache: bool | None = True, - pad_token_id: int | None = 0, - eos_token_id: int | None = 1, - bos_token_id: int | None = 2, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - query_pre_attn_scalar: int | None = 256, - sliding_window: int | None = 4096, - layer_types: list[str] | None = None, - final_logit_softcapping: float | None = None, - attn_logit_softcapping: float | None = None, - rope_parameters: dict[Literal["full_attention", "sliding_attention"], RopeParameters] | None = None, - use_bidirectional_attention: bool | None = False, - tie_word_embeddings: bool | None = True, - **kwargs, - ): - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.head_dim = head_dim - self.num_key_value_heads = num_key_value_heads - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.hidden_activation = hidden_activation - self.query_pre_attn_scalar = query_pre_attn_scalar - self.sliding_window = sliding_window - self.final_logit_softcapping = final_logit_softcapping - self.attn_logit_softcapping = attn_logit_softcapping - self.layer_types = layer_types - - self.use_bidirectional_attention = use_bidirectional_attention - if use_bidirectional_attention: + vocab_size: int = 262_208 + max_position_embeddings: int = 
131_072 + layer_types: list[str] | None = None + final_logit_softcapping: float | None = None + attn_logit_softcapping: float | None = None + rope_parameters: dict | None = None + use_bidirectional_attention: bool | None = False + + def __post_init__(self, **kwargs): + if self.use_bidirectional_attention: self.sliding_window = (self.sliding_window // 2) + 1 # due to fa we set exclusive bounds # BC -> the pattern used to be a simple int, and it's still present in configs on the Hub @@ -162,12 +118,10 @@ def __init__( "sliding_attention" if bool((i + 1) % self._sliding_window_pattern) else "full_attention" for i in range(self.num_hidden_layers) ] - layer_type_validation(self.layer_types, self.num_hidden_layers) - self.rope_parameters = rope_parameters - PreTrainedConfig.__init__(**kwargs) + PreTrainedConfig.__post_init__(**kwargs) - def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation=None, **kwargs): + def convert_rope_params_to_dict(self, **kwargs): rope_scaling = kwargs.pop("rope_scaling", None) # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` @@ -194,11 +148,11 @@ def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation=None, **kwa # Standardize and validate the correctness of rotary position embeddings parameters self.standardize_rope_params() - self.validate_rope(ignore_keys=ignore_keys_at_rope_validation) return kwargs @auto_docstring(checkpoint="google/gemma-3-4b-it") +@strict(accept_kwargs=True) class Gemma3Config(PreTrainedConfig): r""" mm_tokens_per_image (`int`, *optional*, defaults to 256): @@ -240,40 +194,29 @@ class Gemma3Config(PreTrainedConfig): "vision_config": SiglipVisionConfig, } - def __init__( - self, - text_config: Gemma3TextConfig | dict[str, Any] | None = None, - vision_config: SiglipVisionConfig | dict[str, Any] | None = None, - mm_tokens_per_image: int | None = 256, - boi_token_index: int | None = 255_999, - eoi_token_index: int | None = 256_000, - image_token_index: int | None = 262_144, - initializer_range: float | None = 0.02, - tie_word_embeddings: bool | None = True, - **kwargs, - ): - if text_config is None: - text_config = Gemma3TextConfig() + text_config: Gemma3TextConfig | dict[str, Any] | None = None + vision_config: SiglipVisionConfig | dict[str, Any] | None = None + mm_tokens_per_image: int | None = 256 + boi_token_index: int | None = 255_999 + eoi_token_index: int | None = 256_000 + image_token_index: int | None = 262_144 + initializer_range: float | None = 0.02 + tie_word_embeddings: bool | None = True + + def __post_init__(self, **kwargs): + if self.text_config is None: + self.text_config = Gemma3TextConfig() logger.info("text_config is None, using default Gemma3TextConfig text config.") - elif isinstance(text_config, dict): - text_config = Gemma3TextConfig(**text_config) + elif isinstance(self.text_config, dict): + self.text_config = Gemma3TextConfig(**self.text_config) - if isinstance(vision_config, dict): - vision_config = SiglipVisionConfig(**vision_config) - elif vision_config is None: - vision_config = SiglipVisionConfig() + if isinstance(self.vision_config, dict): + self.vision_config = SiglipVisionConfig(**self.vision_config) + elif self.vision_config is None: + self.vision_config = SiglipVisionConfig() logger.info("vision_config is None, using default SiglipVisionConfig vision config.") - self.text_config = text_config - self.vision_config = vision_config - self.mm_tokens_per_image = mm_tokens_per_image - self.boi_token_index = boi_token_index - 
self.eoi_token_index = eoi_token_index - self.image_token_index = image_token_index - self.initializer_range = initializer_range - self.tie_word_embeddings = tie_word_embeddings - - super().__init__(**kwargs) + super().__post_init__(**kwargs) class Gemma3ModelOutputWithPast(PaligemmaModelOutputWithPast): diff --git a/src/transformers/models/gemma3n/configuration_gemma3n.py b/src/transformers/models/gemma3n/configuration_gemma3n.py index dd8663be96b4..8aa57366e718 100644 --- a/src/transformers/models/gemma3n/configuration_gemma3n.py +++ b/src/transformers/models/gemma3n/configuration_gemma3n.py @@ -19,10 +19,11 @@ # See the License for the specific language governing permissions and # limitations under the License. from collections.abc import Sequence -from typing import Any, Literal +from typing import Any -from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters +from huggingface_hub.dataclasses import strict + +from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring, is_timm_available, logging, requires_backends @@ -34,10 +35,9 @@ @auto_docstring(checkpoint="google/gemma-3n-E4B") +@strict(accept_kwargs=True) class Gemma3nTextConfig(PreTrainedConfig): r""" - query_pre_attn_scalar (`float`, *optional*, defaults to 256): - scaling factor used on the attention scores vocab_size_per_layer_input (`int`, *optional*, defaults to 262144): Vocabulary size of the per-layer text embeddings that augment the standard embeddings. hidden_size_per_layer_input (`int`, *optional*, defaults to 256): @@ -95,110 +95,80 @@ class Gemma3nTextConfig(PreTrainedConfig): "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), "norm": (["hidden_states"], ["hidden_states"]), } - default_theta = {"global": 1_000_000.0, "local": 10_000.0} - def __init__( - self, - vocab_size: int = 262_400, - vocab_size_per_layer_input: int = 262_144, - hidden_size: int = 2048, - hidden_size_per_layer_input: int = 256, - intermediate_size: int | Sequence[int] = 16_384, - num_hidden_layers: int = 35, - num_attention_heads: int = 8, - num_key_value_heads: int = 2, - head_dim: int = 256, - hidden_activation: str = "gelu_pytorch_tanh", - max_position_embeddings: int = 32_768, - initializer_range: float = 0.02, - rms_norm_eps: float = 1e-6, - use_cache: bool = True, - pad_token_id: int = 0, - eos_token_id: int = 1, - bos_token_id: int = 2, - rope_parameters: dict[Literal["sliding_attention", "full_attention"], RopeParameters] | None = None, - attention_bias: bool = False, - attention_dropout: float = 0.0, - sliding_window: int = 512, - layer_types: Sequence[str] | None = None, - final_logit_softcapping: float = 30.0, - altup_active_idx: int = 0, - altup_coef_clip: float = 120.0, - altup_correct_scale: bool = True, - altup_num_inputs: int = 4, - num_kv_shared_layers: int = 15, - laurel_rank: int = 64, - activation_sparsity_pattern: float | Sequence[float] | None = None, - tie_word_embeddings: bool | None = True, - **kwargs, - ): - if isinstance(intermediate_size, Sequence) and (intsize_len := len(intermediate_size)) != num_hidden_layers: + vocab_size: int = 262_400 + hidden_size: int = 2048 + intermediate_size: int | list[int] = 16_384 + num_hidden_layers: int = 35 + num_attention_heads: int = 8 + num_key_value_heads: int = 2 + head_dim: int = 256 + hidden_activation: str = "gelu_pytorch_tanh" + max_position_embeddings: int = 32_768 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-6 + use_cache: bool = True + 
pad_token_id: int | None = 0 + eos_token_id: int | list[int] | None = 1 + bos_token_id: int | None = 2 + tie_word_embeddings: bool = True + rope_parameters: dict | None = None + attention_bias: bool = False + attention_dropout: int | float | None = 0.0 + sliding_window: int = 512 + layer_types: list[str] | None = None + final_logit_softcapping: float = 30.0 + default_theta = {"global": 1_000_000.0, "local": 10_000.0} + vocab_size_per_layer_input: int = 262_144 + hidden_size_per_layer_input: int = 256 + altup_active_idx: int = 0 + altup_coef_clip: float = 120.0 + altup_correct_scale: bool = True + altup_num_inputs: int = 4 + num_kv_shared_layers: int = 15 + laurel_rank: int = 64 + activation_sparsity_pattern: float | list[float] | None = None + + def __post_init__(self, **kwargs): + if ( + isinstance(self.intermediate_size, Sequence) + and (intsize_len := len(self.intermediate_size)) != self.num_hidden_layers + ): raise ValueError( "intermediate_size must have an explicit intermediate size for every layer or one for all layers. " - f"Expected {num_hidden_layers} values but got {intsize_len}." + f"Expected {self.num_hidden_layers} values but got {intsize_len}." ) - elif not isinstance(intermediate_size, Sequence): - intermediate_size = [intermediate_size] * num_hidden_layers - - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.vocab_size = vocab_size - self.vocab_size_per_layer_input = vocab_size_per_layer_input - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.head_dim = head_dim - self.num_key_value_heads = num_key_value_heads - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.hidden_activation = hidden_activation - self.sliding_window = sliding_window - self.final_logit_softcapping = final_logit_softcapping - self.layer_types = layer_types - - if layer_types is None: + elif not isinstance(self.intermediate_size, Sequence): + self.intermediate_size = [self.intermediate_size] * self.num_hidden_layers + + if self.layer_types is None: self.layer_types = [ "full_attention" if (i + 1) % 5 == 0 else "sliding_attention" for i in range(self.num_hidden_layers) ] - else: - self.layer_types = layer_types - - layer_type_validation(self.layer_types, self.num_hidden_layers) - - self.hidden_size_per_layer_input = hidden_size_per_layer_input - self.num_kv_shared_layers = num_kv_shared_layers - self.altup_active_idx = altup_active_idx - self.altup_coef_clip = altup_coef_clip - self.altup_correct_scale = altup_correct_scale - self.altup_num_inputs = altup_num_inputs + if self.activation_sparsity_pattern is None: + num_sparse_layers = 10 if self.num_hidden_layers > 10 else 0 + self.activation_sparsity_pattern = [0.95] * num_sparse_layers + [0.0] * ( + self.num_hidden_layers - num_sparse_layers + ) - self.laurel_rank = laurel_rank + if (len_asp := len(self.activation_sparsity_pattern)) != self.num_hidden_layers: + raise ValueError( + "activation_sparsity_pattern must have an explicit activation sparsity value for every layer." + f"Expected {self.num_hidden_layers} values but got {len_asp}." 
+ ) - if activation_sparsity_pattern is None: - num_sparse_layers = 10 if num_hidden_layers > 10 else 0 - activation_sparsity_pattern = [0.95] * num_sparse_layers + [0.0] * (num_hidden_layers - num_sparse_layers) + super().__post_init__(**kwargs) - if (len_asp := len(activation_sparsity_pattern)) != num_hidden_layers: + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + if self.hidden_size % self.num_attention_heads != 0: raise ValueError( - "activation_sparsity_pattern must have an explicit activation sparsity value for every layer." - f"Expected {num_hidden_layers} values but got {len_asp}." + f"The hidden size ({self.hidden_size}) is not a multiple of the number of attention " + f"heads ({self.num_attention_heads})." ) - self.activation_sparsity_pattern = activation_sparsity_pattern - self.rope_parameters = rope_parameters - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - super().__init__(**kwargs) - - def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation=None, **kwargs): + + def convert_rope_params_to_dict(self, **kwargs): rope_scaling = kwargs.pop("rope_scaling", None) # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` @@ -225,11 +195,11 @@ def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation=None, **kwa # Standardize and validate the correctness of rotary position embeddings parameters self.standardize_rope_params() - self.validate_rope(ignore_keys=ignore_keys_at_rope_validation) return kwargs @auto_docstring(checkpoint="google/gemma-3n-E4B") +@strict(accept_kwargs=True) class Gemma3nAudioConfig(PreTrainedConfig): r""" vocab_offset (`int`, *optional*, defaults to 262272): @@ -301,58 +271,35 @@ class Gemma3nAudioConfig(PreTrainedConfig): model_type = "gemma3n_audio" - def __init__( - self, - vocab_size: int = 128, - vocab_offset: int = 262_144 + 128, # text vocab size + vision vocab size - input_feat_size: int = 128, - hidden_size: int = 1536, - rms_norm_eps: float = 1e-6, - gradient_clipping: float = 10_000_000_000.0, - conf_attention_chunk_size: int = 12, - conf_attention_context_left: int = 13, - conf_attention_context_right: int = 0, - conf_attention_logit_cap: float = 50.0, - conf_num_attention_heads: int = 8, - conf_num_hidden_layers: int = 12, - conf_conv_kernel_size: int = 5, - conf_reduction_factor: int = 4, - conf_residual_weight: float = 0.5, - sscp_conv_channel_size: tuple[int, int] = (128, 32), - sscp_conv_group_norm_eps: float = 1e-3, - sscp_conv_kernel_size: tuple[tuple[int, int], tuple[int, int]] = ( - (3, 3), - (3, 3), - ), - sscp_conv_stride_size: tuple[tuple[int, int], tuple[int, int]] = ( - (2, 2), - (2, 2), - ), - **kwargs, - ): - super().__init__(**kwargs) - self.input_feat_size = input_feat_size - self.hidden_size = hidden_size - self.rms_norm_eps = rms_norm_eps - self.vocab_size = vocab_size - self.vocab_offset = vocab_offset - self.gradient_clipping = gradient_clipping - self.conf_attention_chunk_size = conf_attention_chunk_size - self.conf_attention_context_left = conf_attention_context_left - self.conf_attention_context_right = conf_attention_context_right - self.conf_attention_logit_cap = conf_attention_logit_cap - self.conf_num_attention_heads = conf_num_attention_heads - self.conf_num_hidden_layers = conf_num_hidden_layers - self.conf_conv_kernel_size = conf_conv_kernel_size - 
self.conf_reduction_factor = conf_reduction_factor - self.conf_residual_weight = conf_residual_weight - self.sscp_conv_channel_size = sscp_conv_channel_size - self.sscp_conv_group_norm_eps = sscp_conv_group_norm_eps - self.sscp_conv_kernel_size = sscp_conv_kernel_size - self.sscp_conv_stride_size = sscp_conv_stride_size + vocab_size: int = 128 + vocab_offset: int = 262_144 + 128 # text vocab size + vision vocab size + input_feat_size: int = 128 + hidden_size: int = 1536 + rms_norm_eps: float = 1e-6 + gradient_clipping: float = 10_000_000_000.0 + conf_attention_chunk_size: int = 12 + conf_attention_context_left: int = 13 + conf_attention_context_right: int = 0 + conf_attention_logit_cap: float = 50.0 + conf_num_attention_heads: int = 8 + conf_num_hidden_layers: int = 12 + conf_conv_kernel_size: int = 5 + conf_reduction_factor: int = 4 + conf_residual_weight: float = 0.5 + sscp_conv_channel_size: list[int] | tuple[int, int] = (128, 32) + sscp_conv_group_norm_eps: float = 1e-3 + sscp_conv_kernel_size: list | tuple[tuple[int, int], tuple[int, int]] = ( + (3, 3), + (3, 3), + ) + sscp_conv_stride_size: list | tuple[tuple[int, int], tuple[int, int]] = ( + (2, 2), + (2, 2), + ) @auto_docstring(checkpoint="google/gemma-3n-E4B") +@strict(accept_kwargs=True) class Gemma3nVisionConfig(PreTrainedConfig): r""" architecture (`str`, *optional*, defaults to `"resnet50"`): @@ -382,31 +329,15 @@ class Gemma3nVisionConfig(PreTrainedConfig): """ model_type = "gemma3n_vision" + architecture: str = "mobilenetv5_300m_enc" - def __init__( - self, - initializer_range: float = 0.02, - do_pooling: bool = False, - architecture: str = "mobilenetv5_300m_enc", - hidden_size: int = 2048, - vocab_size: int = 128, - vocab_offset: int = 262_144, - rms_norm_eps: float = 1e-06, - model_args: dict | None = None, - **kwargs, - ): - self.architecture = architecture - self.initializer_range = initializer_range - self.do_pooling = do_pooling - self.hidden_size = hidden_size - self.vocab_size = vocab_size - self.vocab_offset = vocab_offset - self.rms_norm_eps = rms_norm_eps - self.architecture = architecture - self.initializer_range = initializer_range - self.do_pooling = do_pooling - self.model_args = model_args # named "model_args" for BC with timm - super().__init__(**kwargs) + initializer_range: float = 0.02 + do_pooling: bool = False + model_args: dict | None = None + hidden_size: int = 2048 + vocab_size: int = 128 + vocab_offset: int = 262_144 + rms_norm_eps: float = 1e-06 @classmethod def from_dict(cls, config_dict: dict[str, Any], **kwargs): @@ -461,6 +392,7 @@ def to_dict(self) -> dict[str, Any]: @auto_docstring(checkpoint="google/gemma-3n-E4B") +@strict(accept_kwargs=True) class Gemma3nConfig(PreTrainedConfig): r""" audio_soft_tokens_per_image (`int`, *optional*, defaults to 188): @@ -507,56 +439,40 @@ class Gemma3nConfig(PreTrainedConfig): "audio_config": Gemma3nAudioConfig, } - def __init__( - self, - text_config: Gemma3nTextConfig | dict[str, Any] | None = None, - vision_config: Gemma3nVisionConfig | dict[str, Any] | None = None, - audio_config: Gemma3nAudioConfig | dict[str, Any] | None = None, - audio_soft_tokens_per_image: int | None = 188, - vision_soft_tokens_per_image: int | None = 256, - boi_token_id: int | None = 255_999, - eoi_token_id: int | None = 262_144, - image_token_id: int | None = 262_145, - boa_token_id: int | None = 256_000, - eoa_token_id: int | None = 262_272, - audio_token_id: int | None = 262_273, - initializer_range: float | None = 0.02, - tie_word_embeddings: bool | None = True, - **kwargs, - ): 
- if isinstance(text_config, dict): - text_config = Gemma3nTextConfig(**text_config) - elif text_config is None: - text_config = Gemma3nTextConfig() - logger.info("text_config is None. Using default Gemma3nTextConfig.") - - if isinstance(vision_config, dict): - vision_config = Gemma3nVisionConfig(**vision_config) - elif vision_config is None: - vision_config = Gemma3nVisionConfig() - logger.info("vision_config is None. Using default Gemma3nVisionConfig.") - - if isinstance(audio_config, dict): - audio_config = Gemma3nAudioConfig(**audio_config) - elif audio_config is None: - audio_config = Gemma3nAudioConfig() + text_config: Gemma3nTextConfig | dict[str, Any] | None = None + vision_config: Gemma3nVisionConfig | dict[str, Any] | None = None + audio_config: Gemma3nAudioConfig | dict[str, Any] | None = None + audio_soft_tokens_per_image: int | None = 188 + vision_soft_tokens_per_image: int | None = 256 + boi_token_id: int | None = 255_999 + eoi_token_id: int | None = 262_144 + image_token_id: int | None = 262_145 + boa_token_id: int | None = 256_000 + eoa_token_id: int | None = 262_272 + audio_token_id: int | None = 262_273 + initializer_range: float | None = 0.02 + tie_word_embeddings: bool | None = True + + def __post_init__(self, **kwargs): + if self.text_config is None: + self.text_config = Gemma3nTextConfig() + logger.info("text_config is None, using default Gemma3nTextConfig text config.") + elif isinstance(self.text_config, dict): + self.text_config = Gemma3nTextConfig(**self.text_config) + + if isinstance(self.vision_config, dict): + self.vision_config = Gemma3nVisionConfig(**self.vision_config) + elif self.vision_config is None: + self.vision_config = Gemma3nVisionConfig() + logger.info("vision_config is None, using default Gemma3nVisionConfig vision config.") + + if isinstance(self.audio_config, dict): + self.audio_config = Gemma3nAudioConfig(**self.audio_config) + elif self.audio_config is None: + self.audio_config = Gemma3nAudioConfig() logger.info("audio_config is None. 
Using default Gemma3nAudioConfig.") - self.text_config = text_config - self.vision_config = vision_config - self.audio_config = audio_config - - self.audio_soft_tokens_per_image = audio_soft_tokens_per_image - self.vision_soft_tokens_per_image = vision_soft_tokens_per_image - self.boi_token_id = boi_token_id - self.eoi_token_id = eoi_token_id - self.image_token_id = image_token_id - self.boa_token_id = boa_token_id - self.eoa_token_id = eoa_token_id - self.audio_token_id = audio_token_id - self.initializer_range = initializer_range - self.tie_word_embeddings = tie_word_embeddings - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["Gemma3nAudioConfig", "Gemma3nConfig", "Gemma3nTextConfig", "Gemma3nVisionConfig"] diff --git a/src/transformers/models/gemma3n/modeling_gemma3n.py b/src/transformers/models/gemma3n/modeling_gemma3n.py index 27c9163e3e4b..afa727ca6a4d 100644 --- a/src/transformers/models/gemma3n/modeling_gemma3n.py +++ b/src/transformers/models/gemma3n/modeling_gemma3n.py @@ -1116,7 +1116,7 @@ def eager_attention_forward( key: torch.Tensor, value: torch.Tensor, attention_mask: torch.Tensor | None, - dropout: float = 0.0, + dropout: float | int = 0.0, scaling: float | None = None, softcap: float | None = None, **kwargs, diff --git a/src/transformers/models/gemma3n/modular_gemma3n.py b/src/transformers/models/gemma3n/modular_gemma3n.py index f55fead4560c..524773209d9f 100644 --- a/src/transformers/models/gemma3n/modular_gemma3n.py +++ b/src/transformers/models/gemma3n/modular_gemma3n.py @@ -15,32 +15,33 @@ import math from collections.abc import Callable, Sequence from dataclasses import dataclass -from typing import Any, Literal +from typing import Any import torch import torch.nn as nn import torch.nn.functional as F +from huggingface_hub.dataclasses import strict from ... import initialization as init from ...activations import ACT2FN from ...cache_utils import Cache, DynamicCache -from ...configuration_utils import PreTrainedConfig, layer_type_validation +from ...configuration_utils import PreTrainedConfig from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask from ...modeling_outputs import BaseModelOutputWithPast, BaseModelOutputWithPooling -from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, RopeParameters +from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging, torch_compilable_check from ...utils.generic import merge_with_config_defaults from ...utils.output_capturing import capture_outputs from ..auto import AutoModel -from ..gemma2.configuration_gemma2 import Gemma2Config from ..gemma2.modeling_gemma2 import ( Gemma2MLP, Gemma2PreTrainedModel, eager_attention_forward, rotate_half, ) +from ..gemma3.configuration_gemma3 import Gemma3TextConfig from ..gemma3.modeling_gemma3 import ( Gemma3Attention, Gemma3DecoderLayer, @@ -63,10 +64,9 @@ @auto_docstring(checkpoint="google/gemma-3n-E4B") -class Gemma3nTextConfig(Gemma2Config, PreTrainedConfig): +@strict(accept_kwargs=True) +class Gemma3nTextConfig(Gemma3TextConfig): r""" - query_pre_attn_scalar (`float`, *optional*, defaults to 256): - scaling factor used on the attention scores vocab_size_per_layer_input (`int`, *optional*, defaults to 262144): Vocabulary size of the per-layer text embeddings that augment the standard embeddings. 
hidden_size_per_layer_input (`int`, *optional*, defaults to 256): @@ -120,108 +120,60 @@ class Gemma3nTextConfig(Gemma2Config, PreTrainedConfig): } default_theta = {"global": 1_000_000.0, "local": 10_000.0} - def __init__( - self, - vocab_size: int = 262_400, - vocab_size_per_layer_input: int = 262_144, - hidden_size: int = 2048, - hidden_size_per_layer_input: int = 256, - intermediate_size: int | Sequence[int] = 16_384, - num_hidden_layers: int = 35, - num_attention_heads: int = 8, - num_key_value_heads: int = 2, - head_dim: int = 256, - hidden_activation: str = "gelu_pytorch_tanh", - max_position_embeddings: int = 32_768, - initializer_range: float = 0.02, - rms_norm_eps: float = 1e-6, - use_cache: bool = True, - pad_token_id: int = 0, - eos_token_id: int = 1, - bos_token_id: int = 2, - rope_parameters: dict[Literal["sliding_attention", "full_attention"], RopeParameters] | None = None, - attention_bias: bool = False, - attention_dropout: float = 0.0, - sliding_window: int = 512, - layer_types: Sequence[str] | None = None, - final_logit_softcapping: float = 30.0, - altup_active_idx: int = 0, - altup_coef_clip: float = 120.0, - altup_correct_scale: bool = True, - altup_num_inputs: int = 4, - num_kv_shared_layers: int = 15, - laurel_rank: int = 64, - activation_sparsity_pattern: float | Sequence[float] | None = None, - tie_word_embeddings: bool | None = True, - **kwargs, - ): - if isinstance(intermediate_size, Sequence) and (intsize_len := len(intermediate_size)) != num_hidden_layers: + vocab_size: int = 262_400 + vocab_size_per_layer_input: int = 262_144 + hidden_size: int = 2048 + hidden_size_per_layer_input: int = 256 + intermediate_size: int | list[int] = 16_384 + num_hidden_layers: int = 35 + num_key_value_heads: int = 2 + max_position_embeddings: int = 32_768 + sliding_window: int = 512 + layer_types: list[str] | None = None + final_logit_softcapping: float = 30.0 + altup_active_idx: int = 0 + altup_coef_clip: float = 120.0 + altup_correct_scale: bool = True + altup_num_inputs: int = 4 + num_kv_shared_layers: int = 15 + laurel_rank: int = 64 + activation_sparsity_pattern: float | list[float] | None = None + attn_logit_softcapping = AttributeError() + use_bidirectional_attention = AttributeError() + query_pre_attn_scalar = AttributeError() + + def __post_init__(self, **kwargs): + if ( + isinstance(self.intermediate_size, Sequence) + and (intsize_len := len(self.intermediate_size)) != self.num_hidden_layers + ): raise ValueError( "intermediate_size must have an explicit intermediate size for every layer or one for all layers. " - f"Expected {num_hidden_layers} values but got {intsize_len}." + f"Expected {self.num_hidden_layers} values but got {intsize_len}." 
) - elif not isinstance(intermediate_size, Sequence): - intermediate_size = [intermediate_size] * num_hidden_layers - - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.vocab_size = vocab_size - self.vocab_size_per_layer_input = vocab_size_per_layer_input - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.head_dim = head_dim - self.num_key_value_heads = num_key_value_heads - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.hidden_activation = hidden_activation - self.sliding_window = sliding_window - self.final_logit_softcapping = final_logit_softcapping - self.layer_types = layer_types - - if layer_types is None: + elif not isinstance(self.intermediate_size, Sequence): + self.intermediate_size = [self.intermediate_size] * self.num_hidden_layers + + if self.layer_types is None: self.layer_types = [ "full_attention" if (i + 1) % 5 == 0 else "sliding_attention" for i in range(self.num_hidden_layers) ] - else: - self.layer_types = layer_types - - layer_type_validation(self.layer_types, self.num_hidden_layers) - - self.hidden_size_per_layer_input = hidden_size_per_layer_input - self.num_kv_shared_layers = num_kv_shared_layers - - self.altup_active_idx = altup_active_idx - self.altup_coef_clip = altup_coef_clip - self.altup_correct_scale = altup_correct_scale - self.altup_num_inputs = altup_num_inputs - self.laurel_rank = laurel_rank - - if activation_sparsity_pattern is None: - num_sparse_layers = 10 if num_hidden_layers > 10 else 0 - activation_sparsity_pattern = [0.95] * num_sparse_layers + [0.0] * (num_hidden_layers - num_sparse_layers) + if self.activation_sparsity_pattern is None: + num_sparse_layers = 10 if self.num_hidden_layers > 10 else 0 + self.activation_sparsity_pattern = [0.95] * num_sparse_layers + [0.0] * ( + self.num_hidden_layers - num_sparse_layers + ) - if (len_asp := len(activation_sparsity_pattern)) != num_hidden_layers: + if (len_asp := len(self.activation_sparsity_pattern)) != self.num_hidden_layers: raise ValueError( "activation_sparsity_pattern must have an explicit activation sparsity value for every layer." - f"Expected {num_hidden_layers} values but got {len_asp}." + f"Expected {self.num_hidden_layers} values but got {len_asp}." ) - self.activation_sparsity_pattern = activation_sparsity_pattern - self.rope_parameters = rope_parameters - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - PreTrainedConfig.__init__(**kwargs) - - def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation=None, **kwargs): + + PreTrainedConfig.__post_init__(**kwargs) + + def convert_rope_params_to_dict(self, **kwargs): rope_scaling = kwargs.pop("rope_scaling", None) # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. 
If we find `rope_parameters` @@ -248,11 +200,11 @@ def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation=None, **kwa # Standardize and validate the correctness of rotary position embeddings parameters self.standardize_rope_params() - self.validate_rope(ignore_keys=ignore_keys_at_rope_validation) return kwargs @auto_docstring(checkpoint="google/gemma-3n-E4B") +@strict(accept_kwargs=True) class Gemma3nAudioConfig(PreTrainedConfig): r""" vocab_offset (`int`, *optional*, defaults to 262272): @@ -324,58 +276,35 @@ class Gemma3nAudioConfig(PreTrainedConfig): model_type = "gemma3n_audio" - def __init__( - self, - vocab_size: int = 128, - vocab_offset: int = 262_144 + 128, # text vocab size + vision vocab size - input_feat_size: int = 128, - hidden_size: int = 1536, - rms_norm_eps: float = 1e-6, - gradient_clipping: float = 10_000_000_000.0, - conf_attention_chunk_size: int = 12, - conf_attention_context_left: int = 13, - conf_attention_context_right: int = 0, - conf_attention_logit_cap: float = 50.0, - conf_num_attention_heads: int = 8, - conf_num_hidden_layers: int = 12, - conf_conv_kernel_size: int = 5, - conf_reduction_factor: int = 4, - conf_residual_weight: float = 0.5, - sscp_conv_channel_size: tuple[int, int] = (128, 32), - sscp_conv_group_norm_eps: float = 1e-3, - sscp_conv_kernel_size: tuple[tuple[int, int], tuple[int, int]] = ( - (3, 3), - (3, 3), - ), - sscp_conv_stride_size: tuple[tuple[int, int], tuple[int, int]] = ( - (2, 2), - (2, 2), - ), - **kwargs, - ): - super().__init__(**kwargs) - self.input_feat_size = input_feat_size - self.hidden_size = hidden_size - self.rms_norm_eps = rms_norm_eps - self.vocab_size = vocab_size - self.vocab_offset = vocab_offset - self.gradient_clipping = gradient_clipping - self.conf_attention_chunk_size = conf_attention_chunk_size - self.conf_attention_context_left = conf_attention_context_left - self.conf_attention_context_right = conf_attention_context_right - self.conf_attention_logit_cap = conf_attention_logit_cap - self.conf_num_attention_heads = conf_num_attention_heads - self.conf_num_hidden_layers = conf_num_hidden_layers - self.conf_conv_kernel_size = conf_conv_kernel_size - self.conf_reduction_factor = conf_reduction_factor - self.conf_residual_weight = conf_residual_weight - self.sscp_conv_channel_size = sscp_conv_channel_size - self.sscp_conv_group_norm_eps = sscp_conv_group_norm_eps - self.sscp_conv_kernel_size = sscp_conv_kernel_size - self.sscp_conv_stride_size = sscp_conv_stride_size + vocab_size: int = 128 + vocab_offset: int = 262_144 + 128 # text vocab size + vision vocab size + input_feat_size: int = 128 + hidden_size: int = 1536 + rms_norm_eps: float = 1e-6 + gradient_clipping: float = 10_000_000_000.0 + conf_attention_chunk_size: int = 12 + conf_attention_context_left: int = 13 + conf_attention_context_right: int = 0 + conf_attention_logit_cap: float = 50.0 + conf_num_attention_heads: int = 8 + conf_num_hidden_layers: int = 12 + conf_conv_kernel_size: int = 5 + conf_reduction_factor: int = 4 + conf_residual_weight: float = 0.5 + sscp_conv_channel_size: list[int] | tuple[int, int] = (128, 32) + sscp_conv_group_norm_eps: float = 1e-3 + sscp_conv_kernel_size: list | tuple[tuple[int, int], tuple[int, int]] = ( + (3, 3), + (3, 3), + ) + sscp_conv_stride_size: list | tuple[tuple[int, int], tuple[int, int]] = ( + (2, 2), + (2, 2), + ) @auto_docstring(checkpoint="google/gemma-3n-E4B") +@strict(accept_kwargs=True) class Gemma3nVisionConfig(TimmWrapperConfig): r""" architecture (`str`, *optional*, defaults to `"resnet50"`): 
@@ -406,29 +335,18 @@ class Gemma3nVisionConfig(TimmWrapperConfig): model_type = "gemma3n_vision" - def __init__( - self, - initializer_range: float = 0.02, - do_pooling: bool = False, - architecture: str = "mobilenetv5_300m_enc", - hidden_size: int = 2048, - vocab_size: int = 128, - vocab_offset: int = 262_144, - rms_norm_eps: float = 1e-06, - model_args: dict | None = None, - **kwargs, - ): - self.architecture = architecture - self.initializer_range = initializer_range - self.do_pooling = do_pooling - self.hidden_size = hidden_size - self.vocab_size = vocab_size - self.vocab_offset = vocab_offset - self.rms_norm_eps = rms_norm_eps - super().__init__(**kwargs) + initializer_range: float = 0.02 + do_pooling: bool = False + architecture: str = "mobilenetv5_300m_enc" + hidden_size: int = 2048 + vocab_size: int = 128 + vocab_offset: int = 262_144 + rms_norm_eps: float = 1e-06 + model_args: dict | None = None @auto_docstring(checkpoint="google/gemma-3n-E4B") +@strict(accept_kwargs=True) class Gemma3nConfig(PreTrainedConfig): r""" audio_soft_tokens_per_image (`int`, *optional*, defaults to 188): @@ -475,56 +393,40 @@ class Gemma3nConfig(PreTrainedConfig): "audio_config": Gemma3nAudioConfig, } - def __init__( - self, - text_config: Gemma3nTextConfig | dict[str, Any] | None = None, - vision_config: Gemma3nVisionConfig | dict[str, Any] | None = None, - audio_config: Gemma3nAudioConfig | dict[str, Any] | None = None, - audio_soft_tokens_per_image: int | None = 188, - vision_soft_tokens_per_image: int | None = 256, - boi_token_id: int | None = 255_999, - eoi_token_id: int | None = 262_144, - image_token_id: int | None = 262_145, - boa_token_id: int | None = 256_000, - eoa_token_id: int | None = 262_272, - audio_token_id: int | None = 262_273, - initializer_range: float | None = 0.02, - tie_word_embeddings: bool | None = True, - **kwargs, - ): - if isinstance(text_config, dict): - text_config = Gemma3nTextConfig(**text_config) - elif text_config is None: - text_config = Gemma3nTextConfig() - logger.info("text_config is None. Using default Gemma3nTextConfig.") - - if isinstance(vision_config, dict): - vision_config = Gemma3nVisionConfig(**vision_config) - elif vision_config is None: - vision_config = Gemma3nVisionConfig() - logger.info("vision_config is None. 
Using default Gemma3nVisionConfig.") - - if isinstance(audio_config, dict): - audio_config = Gemma3nAudioConfig(**audio_config) - elif audio_config is None: - audio_config = Gemma3nAudioConfig() + text_config: Gemma3nTextConfig | dict[str, Any] | None = None + vision_config: Gemma3nVisionConfig | dict[str, Any] | None = None + audio_config: Gemma3nAudioConfig | dict[str, Any] | None = None + audio_soft_tokens_per_image: int | None = 188 + vision_soft_tokens_per_image: int | None = 256 + boi_token_id: int | None = 255_999 + eoi_token_id: int | None = 262_144 + image_token_id: int | None = 262_145 + boa_token_id: int | None = 256_000 + eoa_token_id: int | None = 262_272 + audio_token_id: int | None = 262_273 + initializer_range: float | None = 0.02 + tie_word_embeddings: bool | None = True + + def __post_init__(self, **kwargs): + if self.text_config is None: + self.text_config = Gemma3nTextConfig() + logger.info("text_config is None, using default Gemma3nTextConfig text config.") + elif isinstance(self.text_config, dict): + self.text_config = Gemma3nTextConfig(**self.text_config) + + if isinstance(self.vision_config, dict): + self.vision_config = Gemma3nVisionConfig(**self.vision_config) + elif self.vision_config is None: + self.vision_config = Gemma3nVisionConfig() + logger.info("vision_config is None, using default Gemma3nVisionConfig vision config.") + + if isinstance(self.audio_config, dict): + self.audio_config = Gemma3nAudioConfig(**self.audio_config) + elif self.audio_config is None: + self.audio_config = Gemma3nAudioConfig() logger.info("audio_config is None. Using default Gemma3nAudioConfig.") - self.text_config = text_config - self.vision_config = vision_config - self.audio_config = audio_config - - self.audio_soft_tokens_per_image = audio_soft_tokens_per_image - self.vision_soft_tokens_per_image = vision_soft_tokens_per_image - self.boi_token_id = boi_token_id - self.eoi_token_id = eoi_token_id - self.image_token_id = image_token_id - self.boa_token_id = boa_token_id - self.eoa_token_id = eoa_token_id - self.audio_token_id = audio_token_id - self.initializer_range = initializer_range - self.tie_word_embeddings = tie_word_embeddings - super().__init__(**kwargs) + super().__post_init__(**kwargs) @dataclass diff --git a/src/transformers/models/git/configuration_git.py b/src/transformers/models/git/configuration_git.py index e5ffd9c8037c..804642bbe052 100644 --- a/src/transformers/models/git/configuration_git.py +++ b/src/transformers/models/git/configuration_git.py @@ -13,6 +13,8 @@ # limitations under the License. 
+from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring, logging @@ -21,6 +23,7 @@ @auto_docstring(checkpoint="microsoft/git-base") +@strict(accept_kwargs=True) class GitVisionConfig(PreTrainedConfig): r""" Example: @@ -41,37 +44,21 @@ class GitVisionConfig(PreTrainedConfig): model_type = "git_vision_model" base_config_key = "vision_config" - def __init__( - self, - hidden_size=768, - intermediate_size=3072, - num_hidden_layers=12, - num_attention_heads=12, - num_channels=3, - image_size=224, - patch_size=16, - hidden_act="quick_gelu", - layer_norm_eps=1e-5, - attention_dropout=0.0, - initializer_range=0.02, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_channels = num_channels - self.patch_size = patch_size - self.image_size = image_size - self.initializer_range = initializer_range - self.attention_dropout = attention_dropout - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act + hidden_size: int = 768 + intermediate_size: int = 3072 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + num_channels: int = 3 + image_size: int | list[int] | tuple[int, int] = 224 + patch_size: int | list[int] | tuple[int, int] = 16 + hidden_act: str = "quick_gelu" + layer_norm_eps: float = 1e-5 + attention_dropout: float | int = 0.0 + initializer_range: float = 0.02 @auto_docstring(checkpoint="microsoft/git-base") +@strict(accept_kwargs=True) class GitConfig(PreTrainedConfig): r""" num_image_with_embedding (`int`, *optional*): @@ -95,52 +82,32 @@ class GitConfig(PreTrainedConfig): model_type = "git" sub_configs = {"vision_config": GitVisionConfig} - def __init__( - self, - vision_config=None, - vocab_size=30522, - hidden_size=768, - num_hidden_layers=6, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=1024, - initializer_range=0.02, - layer_norm_eps=1e-12, - pad_token_id=0, - use_cache=True, - tie_word_embeddings=False, - bos_token_id=101, - eos_token_id=102, - num_image_with_embedding=None, - **kwargs, - ): - if vision_config is None: - vision_config = {} + vision_config: dict | GitVisionConfig | None = None + vocab_size: int = 30522 + hidden_size: int = 768 + num_hidden_layers: int = 6 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.1 + attention_probs_dropout_prob: float = 0.1 + max_position_embeddings: int = 1024 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + pad_token_id: int | None = 0 + use_cache: bool = True + tie_word_embeddings: bool = False + bos_token_id: int | None = 101 + eos_token_id: int | None = 102 + num_image_with_embedding: int | None = None + + def __post_init__(self, **kwargs): + if self.vision_config is None: + self.vision_config = GitVisionConfig() logger.info("vision_config is None. 
initializing the GitVisionConfig with default values.") - - self.vision_config = GitVisionConfig(**vision_config) - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.use_cache = use_cache - self.num_image_with_embedding = num_image_with_embedding - - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.tie_word_embeddings = tie_word_embeddings - super().__init__(**kwargs) + elif isinstance(self.vision_config, dict): + self.vision_config = GitVisionConfig(**self.vision_config) + super().__post_init__(**kwargs) __all__ = ["GitConfig", "GitVisionConfig"] diff --git a/src/transformers/models/glm/configuration_glm.py b/src/transformers/models/glm/configuration_glm.py index c7cdf06f5049..8dca70bb7b7c 100644 --- a/src/transformers/models/glm/configuration_glm.py +++ b/src/transformers/models/glm/configuration_glm.py @@ -14,12 +14,15 @@ # limitations under the License. +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring @auto_docstring(checkpoint="THUDM/glm-4-9b-chat") +@strict(accept_kwargs=True) class GlmConfig(PreTrainedConfig): r""" Example: @@ -50,51 +53,31 @@ class GlmConfig(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size: int | None = 151552, - hidden_size: int | None = 4096, - intermediate_size: int | None = 13696, - num_hidden_layers: int | None = 40, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = 2, - head_dim: int | None = 128, - hidden_act: str | None = "silu", - attention_dropout: float | None = 0.0, - max_position_embeddings: int | None = 131072, - initializer_range: float | None = 0.02, - rms_norm_eps: float | None = 0.00000015625, - use_cache: bool | None = True, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - pad_token_id: int | None = 151329, - eos_token_id: list[int] | None = [151329, 151336, 151338], - bos_token_id: int | None = None, - attention_bias: bool | None = True, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.head_dim = head_dim - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.rope_parameters = rope_parameters - kwargs.setdefault("partial_rotary_factor", 0.5) # assign default for BC + vocab_size: int = 151552 + hidden_size: int = 4096 + intermediate_size: int = 13696 + num_hidden_layers: int = 40 + num_attention_heads: int = 32 + num_key_value_heads: int | None = 2 + head_dim: int | None = 128 + hidden_act: str = "silu" + 
attention_dropout: float | int | None = 0.0 + max_position_embeddings: int = 131072 + initializer_range: float = 0.02 + rms_norm_eps: float = 0.00000015625 + use_cache: bool = True + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + pad_token_id: int | None = 151329 + eos_token_id: int | list[int] | None = None + bos_token_id: int | None = None + attention_bias: bool = True - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - super().__init__(**kwargs) + def __post_init__(self, **kwargs): + kwargs.setdefault("partial_rotary_factor", 0.5) # assign default for BC + if self.eos_token_id is None: + self.eos_token_id = [151329, 151336, 151338] + super().__post_init__(**kwargs) __all__ = ["GlmConfig"] diff --git a/src/transformers/models/glm4/configuration_glm4.py b/src/transformers/models/glm4/configuration_glm4.py index 275d0c9fb0a9..477b7d22ae46 100644 --- a/src/transformers/models/glm4/configuration_glm4.py +++ b/src/transformers/models/glm4/configuration_glm4.py @@ -14,12 +14,15 @@ # limitations under the License. +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring @auto_docstring(checkpoint="zai-org/GLM-OCR") +@strict(accept_kwargs=True) class Glm4Config(PreTrainedConfig): r""" Example: @@ -50,51 +53,31 @@ class Glm4Config(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size: int | None = 151552, - hidden_size: int | None = 4096, - intermediate_size: int | None = 13696, - num_hidden_layers: int | None = 40, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = 2, - head_dim: int | None = 128, - hidden_act: str | None = "silu", - attention_dropout: float | None = 0.0, - max_position_embeddings: int | None = 131072, - initializer_range: float | None = 0.02, - rms_norm_eps: float | None = 0.00000015625, - use_cache: bool | None = True, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - pad_token_id: int | None = 151329, - eos_token_id: list[int] | None = [151329, 151336, 151338], - bos_token_id: int | None = None, - attention_bias: bool | None = True, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.head_dim = head_dim - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.rope_parameters = rope_parameters - kwargs.setdefault("partial_rotary_factor", 0.5) # assign default for BC + vocab_size: int = 151552 + hidden_size: int = 4096 + intermediate_size: int = 13696 + num_hidden_layers: int = 40 + num_attention_heads: int = 32 + num_key_value_heads: int = 2 + head_dim: int = 128 + hidden_act: str = "silu" + attention_dropout: float | int = 0.0 + max_position_embeddings: int = 131072 + initializer_range: float = 0.02 + rms_norm_eps: float = 0.00000015625 + use_cache: bool = True + tie_word_embeddings: bool = False + 
rope_parameters: RopeParameters | dict | None = None + pad_token_id: int | None = 151329 + eos_token_id: int | list[int] | None = None + bos_token_id: int | None = None + attention_bias: bool = True - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - super().__init__(**kwargs) + def __post_init__(self, **kwargs): + kwargs.setdefault("partial_rotary_factor", 0.5) # assign default for BC + if self.eos_token_id is None: + self.eos_token_id = [151329, 151336, 151338] + super().__post_init__(**kwargs) __all__ = ["Glm4Config"] diff --git a/src/transformers/models/glm46v/configuration_glm46v.py b/src/transformers/models/glm46v/configuration_glm46v.py index 24c0a20a1182..aba781ae03a1 100644 --- a/src/transformers/models/glm46v/configuration_glm46v.py +++ b/src/transformers/models/glm46v/configuration_glm46v.py @@ -19,12 +19,15 @@ # limitations under the License. +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring from ..auto import CONFIG_MAPPING, AutoConfig @auto_docstring(checkpoint="zai-org/GLM-4.1V-9B-Thinking") +@strict(accept_kwargs=True) class Glm46VConfig(PreTrainedConfig): r""" image_start_token_id (`int`, *optional*, defaults to 151339): @@ -53,40 +56,30 @@ class Glm46VConfig(PreTrainedConfig): sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} keys_to_ignore_at_inference = ["past_key_values"] - def __init__( - self, - text_config=None, - vision_config=None, - image_token_id=151343, - video_token_id=151344, - image_start_token_id=151339, - image_end_token_id=151340, - video_start_token_id=151361, - video_end_token_id=151362, - tie_word_embeddings=False, - **kwargs, - ): - if isinstance(vision_config, dict): - vision_config["model_type"] = vision_config.get("model_type", "glm4v_vision") - self.vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) - elif vision_config is None: + text_config: dict | PreTrainedConfig | None = None + vision_config: dict | PreTrainedConfig | None = None + image_token_id: int = 151343 + video_token_id: int = 151344 + image_start_token_id: int = 151339 + image_end_token_id: int = 151340 + video_start_token_id: int = 151361 + video_end_token_id: int = 151362 + tie_word_embeddings: bool = False + + def __post_init__(self, **kwargs): + if isinstance(self.vision_config, dict): + self.vision_config["model_type"] = self.vision_config.get("model_type", "glm4v_vision") + self.vision_config = CONFIG_MAPPING[self.vision_config["model_type"]](**self.vision_config) + elif self.vision_config is None: self.vision_config = CONFIG_MAPPING["glm4v_vision"]() - if isinstance(text_config, dict): - text_config["model_type"] = text_config.get("model_type", "glm4v_text") - self.text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) - elif text_config is None: + if isinstance(self.text_config, dict): + self.text_config["model_type"] = self.text_config.get("model_type", "glm4v_text") + self.text_config = CONFIG_MAPPING[self.text_config["model_type"]](**self.text_config) + elif self.text_config is None: self.text_config = CONFIG_MAPPING["glm4v_text"]() - self.image_token_id = image_token_id - self.video_token_id = video_token_id - self.video_start_token_id = video_start_token_id - self.video_end_token_id = video_end_token_id - self.image_start_token_id = image_start_token_id - self.image_end_token_id = image_end_token_id - 
self.tie_word_embeddings = tie_word_embeddings - - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["Glm46VConfig"] diff --git a/src/transformers/models/glm46v/modular_glm46v.py b/src/transformers/models/glm46v/modular_glm46v.py index 190713ade44f..10bac94be0fe 100644 --- a/src/transformers/models/glm46v/modular_glm46v.py +++ b/src/transformers/models/glm46v/modular_glm46v.py @@ -14,6 +14,7 @@ import numpy as np +from huggingface_hub.dataclasses import strict from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring @@ -27,6 +28,7 @@ @auto_docstring(checkpoint="zai-org/GLM-4.1V-9B-Thinking") +@strict(accept_kwargs=True) class Glm46VConfig(PreTrainedConfig): r""" image_start_token_id (`int`, *optional*, defaults to 151339): @@ -55,40 +57,30 @@ class Glm46VConfig(PreTrainedConfig): sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} keys_to_ignore_at_inference = ["past_key_values"] - def __init__( - self, - text_config=None, - vision_config=None, - image_token_id=151343, - video_token_id=151344, - image_start_token_id=151339, - image_end_token_id=151340, - video_start_token_id=151361, - video_end_token_id=151362, - tie_word_embeddings=False, - **kwargs, - ): - if isinstance(vision_config, dict): - vision_config["model_type"] = vision_config.get("model_type", "glm4v_vision") - self.vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) - elif vision_config is None: + text_config: dict | PreTrainedConfig | None = None + vision_config: dict | PreTrainedConfig | None = None + image_token_id: int = 151343 + video_token_id: int = 151344 + image_start_token_id: int = 151339 + image_end_token_id: int = 151340 + video_start_token_id: int = 151361 + video_end_token_id: int = 151362 + tie_word_embeddings: bool = False + + def __post_init__(self, **kwargs): + if isinstance(self.vision_config, dict): + self.vision_config["model_type"] = self.vision_config.get("model_type", "glm4v_vision") + self.vision_config = CONFIG_MAPPING[self.vision_config["model_type"]](**self.vision_config) + elif self.vision_config is None: self.vision_config = CONFIG_MAPPING["glm4v_vision"]() - if isinstance(text_config, dict): - text_config["model_type"] = text_config.get("model_type", "glm4v_text") - self.text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) - elif text_config is None: + if isinstance(self.text_config, dict): + self.text_config["model_type"] = self.text_config.get("model_type", "glm4v_text") + self.text_config = CONFIG_MAPPING[self.text_config["model_type"]](**self.text_config) + elif self.text_config is None: self.text_config = CONFIG_MAPPING["glm4v_text"]() - self.image_token_id = image_token_id - self.video_token_id = video_token_id - self.video_start_token_id = video_start_token_id - self.video_end_token_id = video_end_token_id - self.image_start_token_id = image_start_token_id - self.image_end_token_id = image_end_token_id - self.tie_word_embeddings = tie_word_embeddings - - super().__init__(**kwargs) + super().__post_init__(**kwargs) class Glm46VPreTrainedModel(Glm4vPreTrainedModel): diff --git a/src/transformers/models/glm4_moe/configuration_glm4_moe.py b/src/transformers/models/glm4_moe/configuration_glm4_moe.py index 27d869713053..2599a59c3272 100644 --- a/src/transformers/models/glm4_moe/configuration_glm4_moe.py +++ b/src/transformers/models/glm4_moe/configuration_glm4_moe.py @@ -17,6 +17,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. +from huggingface_hub.dataclasses import strict from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters @@ -24,6 +25,7 @@ @auto_docstring(checkpoint="zai-org/GLM-4.5") +@strict(accept_kwargs=True) class Glm4MoeConfig(PreTrainedConfig): r""" n_group (`int`, *optional*, defaults to 1): @@ -75,72 +77,38 @@ class Glm4MoeConfig(PreTrainedConfig): "num_local_experts": "n_routed_experts", } - def __init__( - self, - vocab_size: int | None = 151552, - hidden_size: int | None = 4096, - intermediate_size: int | None = 10944, - num_hidden_layers: int | None = 46, - num_attention_heads: int | None = 96, - num_key_value_heads: int | None = 8, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 131072, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-5, - use_cache: bool | None = True, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - moe_intermediate_size: int | None = 1408, - num_experts_per_tok: int | None = 8, - n_shared_experts: int | None = 1, - n_routed_experts: int | None = 128, - routed_scaling_factor: float | None = 1.0, - n_group: int | None = 1, - topk_group: int | None = 1, - first_k_dense_replace: int | None = 1, - norm_topk_prob: bool | None = True, - use_qk_norm: bool | None = False, - bos_token_id: int | None = None, - eos_token_id: int | None = None, - pad_token_id: int | None = None, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.rope_parameters = rope_parameters + vocab_size: int = 151552 + hidden_size: int = 4096 + intermediate_size: int = 10944 + num_hidden_layers: int = 46 + num_attention_heads: int = 96 + num_key_value_heads: int = 8 + hidden_act: str = "silu" + max_position_embeddings: int = 131072 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-5 + use_cache: bool = True + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + attention_dropout: float | int = 0.0 + moe_intermediate_size: int = 1408 + num_experts_per_tok: int = 8 + n_shared_experts: int = 1 + n_routed_experts: int = 128 + routed_scaling_factor: float = 1.0 + n_group: int = 1 + topk_group: int = 1 + first_k_dense_replace: int = 1 + norm_topk_prob: bool = True + use_qk_norm: bool = False + bos_token_id: int | None = None + eos_token_id: int | list[int] | None = None + pad_token_id: int | None = None + + def __post_init__(self, **kwargs): kwargs.setdefault("partial_rotary_factor", 0.5) # assign default for BC - - # MoE arguments - self.moe_intermediate_size = moe_intermediate_size - self.num_experts_per_tok = num_experts_per_tok - self.n_group = n_group - self.topk_group = topk_group - self.n_shared_experts = n_shared_experts - self.n_routed_experts = n_routed_experts - 
self.routed_scaling_factor = routed_scaling_factor - self.first_k_dense_replace = first_k_dense_replace - self.norm_topk_prob = norm_topk_prob - self.use_qk_norm = use_qk_norm - self.tie_word_embeddings = tie_word_embeddings - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["Glm4MoeConfig"] diff --git a/src/transformers/models/glm4_moe/modular_glm4_moe.py b/src/transformers/models/glm4_moe/modular_glm4_moe.py index 26776dd46b9a..26e4c5e09024 100644 --- a/src/transformers/models/glm4_moe/modular_glm4_moe.py +++ b/src/transformers/models/glm4_moe/modular_glm4_moe.py @@ -14,6 +14,7 @@ """PyTorch GLM-4-MOE model.""" import torch +from huggingface_hub.dataclasses import strict from torch import nn from ...configuration_utils import PreTrainedConfig @@ -37,6 +38,7 @@ @auto_docstring(checkpoint="zai-org/GLM-4.5") +@strict(accept_kwargs=True) class Glm4MoeConfig(PreTrainedConfig): r""" n_group (`int`, *optional*, defaults to 1): @@ -88,72 +90,38 @@ class Glm4MoeConfig(PreTrainedConfig): "num_local_experts": "n_routed_experts", } - def __init__( - self, - vocab_size: int | None = 151552, - hidden_size: int | None = 4096, - intermediate_size: int | None = 10944, - num_hidden_layers: int | None = 46, - num_attention_heads: int | None = 96, - num_key_value_heads: int | None = 8, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 131072, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-5, - use_cache: bool | None = True, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - moe_intermediate_size: int | None = 1408, - num_experts_per_tok: int | None = 8, - n_shared_experts: int | None = 1, - n_routed_experts: int | None = 128, - routed_scaling_factor: float | None = 1.0, - n_group: int | None = 1, - topk_group: int | None = 1, - first_k_dense_replace: int | None = 1, - norm_topk_prob: bool | None = True, - use_qk_norm: bool | None = False, - bos_token_id: int | None = None, - eos_token_id: int | None = None, - pad_token_id: int | None = None, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.rope_parameters = rope_parameters + vocab_size: int = 151552 + hidden_size: int = 4096 + intermediate_size: int = 10944 + num_hidden_layers: int = 46 + num_attention_heads: int = 96 + num_key_value_heads: int = 8 + hidden_act: str = "silu" + max_position_embeddings: int = 131072 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-5 + use_cache: bool = True + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + attention_dropout: float | int = 0.0 + moe_intermediate_size: int = 1408 + num_experts_per_tok: int = 8 + n_shared_experts: int = 1 + n_routed_experts: int = 128 + routed_scaling_factor: float = 1.0 + n_group: 
int = 1 + topk_group: int = 1 + first_k_dense_replace: int = 1 + norm_topk_prob: bool = True + use_qk_norm: bool = False + bos_token_id: int | None = None + eos_token_id: int | list[int] | None = None + pad_token_id: int | None = None + + def __post_init__(self, **kwargs): kwargs.setdefault("partial_rotary_factor", 0.5) # assign default for BC - - # MoE arguments - self.moe_intermediate_size = moe_intermediate_size - self.num_experts_per_tok = num_experts_per_tok - self.n_group = n_group - self.topk_group = topk_group - self.n_shared_experts = n_shared_experts - self.n_routed_experts = n_routed_experts - self.routed_scaling_factor = routed_scaling_factor - self.first_k_dense_replace = first_k_dense_replace - self.norm_topk_prob = norm_topk_prob - self.use_qk_norm = use_qk_norm - self.tie_word_embeddings = tie_word_embeddings - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - - super().__init__(**kwargs) + super().__post_init__(**kwargs) class Glm4MoeRotaryEmbedding(GlmRotaryEmbedding): diff --git a/src/transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py b/src/transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py index 591fb808e8f9..c1a82f04315f 100644 --- a/src/transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py +++ b/src/transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py @@ -19,12 +19,15 @@ # limitations under the License. -from ...configuration_utils import PreTrainedConfig, layer_type_validation +from huggingface_hub.dataclasses import strict + +from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring @auto_docstring(checkpoint="zai-org/GLM-4.5") +@strict(accept_kwargs=True) class Glm4MoeLiteConfig(PreTrainedConfig): r""" rope_interleave (`bool`, *optional*, defaults to `True`): @@ -67,90 +70,50 @@ class Glm4MoeLiteConfig(PreTrainedConfig): } attribute_map = { "num_local_experts": "n_routed_experts", + "head_dim": "qk_rope_head_dim", } - def __init__( - self, - vocab_size: int | None = 154880, - hidden_size: int | None = 2048, - intermediate_size: int | None = 10240, - moe_intermediate_size: int | None = 1536, - num_hidden_layers: int | None = 47, - num_attention_heads: int | None = 20, - num_key_value_heads: int | None = 20, - n_shared_experts: int | None = 1, - n_routed_experts: int | None = 64, - routed_scaling_factor: float | None = 1.8, - kv_lora_rank: int | None = 512, - q_lora_rank: int | None = 768, - qk_rope_head_dim: int | None = 64, - v_head_dim: int | None = 256, - qk_nope_head_dim: int | None = 192, - n_group: int | None = 1, - topk_group: int | None = 1, - num_experts_per_tok: int | None = 4, - norm_topk_prob: bool | None = True, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 202752, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-5, - use_cache: bool | None = True, - pad_token_id: int | None = None, - bos_token_id: int | None = 0, - eos_token_id: int | None = 1, - pretraining_tp: int | None = 1, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - rope_interleave: bool | None = True, - mlp_layer_types=None, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size 
- self.num_hidden_layers = num_hidden_layers + vocab_size: int = 154880 + hidden_size: int = 2048 + intermediate_size: int = 10240 + moe_intermediate_size: int = 1536 + num_hidden_layers: int = 47 + num_attention_heads: int = 20 + num_key_value_heads: int = 20 + n_shared_experts: int = 1 + n_routed_experts: int = 64 + routed_scaling_factor: float = 1.8 + kv_lora_rank: int = 512 + q_lora_rank: int = 768 + qk_rope_head_dim: int = 64 + v_head_dim: int = 256 + qk_nope_head_dim: int = 192 + n_group: int = 1 + topk_group: int = 1 + num_experts_per_tok: int = 4 + norm_topk_prob: bool = True + hidden_act: str = "silu" + max_position_embeddings: int = 202752 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-5 + use_cache: bool = True + pad_token_id: int | None = None + bos_token_id: int | None = 0 + eos_token_id: int | list[int] | None = 1 + pretraining_tp: int = 1 + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + rope_interleave: bool = True + mlp_layer_types: list[str] | None = None + attention_bias: bool = False + attention_dropout: float | int = 0.0 + def __post_init__(self, **kwargs): # Default to MoE from the second layer and on - self.mlp_layer_types = mlp_layer_types if self.mlp_layer_types is None: self.mlp_layer_types = ["dense"] + ["sparse"] * (self.num_hidden_layers - 1) - layer_type_validation(self.mlp_layer_types, self.num_hidden_layers, attention=False) - - self.moe_intermediate_size = moe_intermediate_size - self.num_attention_heads = num_attention_heads - self.n_shared_experts = n_shared_experts - self.n_routed_experts = n_routed_experts - self.routed_scaling_factor = routed_scaling_factor - self.kv_lora_rank = kv_lora_rank - self.q_lora_rank = q_lora_rank - self.qk_rope_head_dim = qk_rope_head_dim - self.v_head_dim = v_head_dim - self.qk_nope_head_dim = qk_nope_head_dim - self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim - self.head_dim = qk_rope_head_dim - self.n_group = n_group - self.topk_group = topk_group - self.num_experts_per_tok = num_experts_per_tok - self.norm_topk_prob = norm_topk_prob - self.rope_interleave = rope_interleave - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.pretraining_tp = pretraining_tp - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.rope_parameters = rope_parameters - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - - super().__init__(**kwargs) + self.qk_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim + super().__post_init__(**kwargs) __all__ = ["Glm4MoeLiteConfig"] diff --git a/src/transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py b/src/transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py index 504d11fb6435..1794df505be8 100644 --- a/src/transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py +++ b/src/transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py @@ -14,8 +14,9 @@ import torch.nn as nn +from huggingface_hub.dataclasses import strict -from ...configuration_utils import PreTrainedConfig, layer_type_validation +from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring from ..deepseek_v3.modeling_deepseek_v3 import DeepseekV3Attention @@ -34,6 +35,7 @@ 
@auto_docstring(checkpoint="zai-org/GLM-4.5") +@strict(accept_kwargs=True) class Glm4MoeLiteConfig(PreTrainedConfig): r""" rope_interleave (`bool`, *optional*, defaults to `True`): @@ -76,90 +78,50 @@ class Glm4MoeLiteConfig(PreTrainedConfig): } attribute_map = { "num_local_experts": "n_routed_experts", + "head_dim": "qk_rope_head_dim", } - def __init__( - self, - vocab_size: int | None = 154880, - hidden_size: int | None = 2048, - intermediate_size: int | None = 10240, - moe_intermediate_size: int | None = 1536, - num_hidden_layers: int | None = 47, - num_attention_heads: int | None = 20, - num_key_value_heads: int | None = 20, - n_shared_experts: int | None = 1, - n_routed_experts: int | None = 64, - routed_scaling_factor: float | None = 1.8, - kv_lora_rank: int | None = 512, - q_lora_rank: int | None = 768, - qk_rope_head_dim: int | None = 64, - v_head_dim: int | None = 256, - qk_nope_head_dim: int | None = 192, - n_group: int | None = 1, - topk_group: int | None = 1, - num_experts_per_tok: int | None = 4, - norm_topk_prob: bool | None = True, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 202752, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-5, - use_cache: bool | None = True, - pad_token_id: int | None = None, - bos_token_id: int | None = 0, - eos_token_id: int | None = 1, - pretraining_tp: int | None = 1, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - rope_interleave: bool | None = True, - mlp_layer_types=None, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - + vocab_size: int = 154880 + hidden_size: int = 2048 + intermediate_size: int = 10240 + moe_intermediate_size: int = 1536 + num_hidden_layers: int = 47 + num_attention_heads: int = 20 + num_key_value_heads: int = 20 + n_shared_experts: int = 1 + n_routed_experts: int = 64 + routed_scaling_factor: float = 1.8 + kv_lora_rank: int = 512 + q_lora_rank: int = 768 + qk_rope_head_dim: int = 64 + v_head_dim: int = 256 + qk_nope_head_dim: int = 192 + n_group: int = 1 + topk_group: int = 1 + num_experts_per_tok: int = 4 + norm_topk_prob: bool = True + hidden_act: str = "silu" + max_position_embeddings: int = 202752 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-5 + use_cache: bool = True + pad_token_id: int | None = None + bos_token_id: int | None = 0 + eos_token_id: int | list[int] | None = 1 + pretraining_tp: int = 1 + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + rope_interleave: bool = True + mlp_layer_types: list[str] | None = None + attention_bias: bool = False + attention_dropout: float | int = 0.0 + + def __post_init__(self, **kwargs): # Default to MoE from the second layer and on - self.mlp_layer_types = mlp_layer_types if self.mlp_layer_types is None: self.mlp_layer_types = ["dense"] + ["sparse"] * (self.num_hidden_layers - 1) - layer_type_validation(self.mlp_layer_types, self.num_hidden_layers, attention=False) - - self.moe_intermediate_size = moe_intermediate_size - self.num_attention_heads = num_attention_heads - self.n_shared_experts = n_shared_experts - self.n_routed_experts = n_routed_experts - self.routed_scaling_factor = routed_scaling_factor - self.kv_lora_rank = kv_lora_rank 
- self.q_lora_rank = q_lora_rank - self.qk_rope_head_dim = qk_rope_head_dim - self.v_head_dim = v_head_dim - self.qk_nope_head_dim = qk_nope_head_dim - self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim - self.head_dim = qk_rope_head_dim - self.n_group = n_group - self.topk_group = topk_group - self.num_experts_per_tok = num_experts_per_tok - self.norm_topk_prob = norm_topk_prob - self.rope_interleave = rope_interleave - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.pretraining_tp = pretraining_tp - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.rope_parameters = rope_parameters - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - - super().__init__(**kwargs) + self.qk_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim + super().__post_init__(**kwargs) class Glm4MoeLiteRotaryEmbedding(Glm4MoeRotaryEmbedding): diff --git a/src/transformers/models/glm4v/configuration_glm4v.py b/src/transformers/models/glm4v/configuration_glm4v.py index d1751f9ed1f5..341d9c59e207 100644 --- a/src/transformers/models/glm4v/configuration_glm4v.py +++ b/src/transformers/models/glm4v/configuration_glm4v.py @@ -17,12 +17,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring @auto_docstring(checkpoint="zai-org/GLM-4.1V-9B-Thinking") +@strict(accept_kwargs=True) class Glm4vVisionConfig(PreTrainedConfig): r""" out_hidden_size (`int`, *optional*, defaults to 4096): @@ -46,45 +49,25 @@ class Glm4vVisionConfig(PreTrainedConfig): model_type = "glm4v_vision" base_config_key = "vision_config" - def __init__( - self, - depth=24, - hidden_size=1536, - hidden_act="silu", - attention_bias=False, - attention_dropout=0.0, - num_heads=12, - in_channels=3, - image_size=336, - patch_size=14, - rms_norm_eps=1e-05, - spatial_merge_size=2, - temporal_patch_size=2, - out_hidden_size=4096, - intermediate_size=13696, - initializer_range=0.02, - **kwargs, - ): - super().__init__(**kwargs) - - self.depth = depth - self.hidden_size = hidden_size - self.hidden_act = hidden_act - self.num_heads = num_heads - self.in_channels = in_channels - self.image_size = image_size - self.patch_size = patch_size - self.spatial_merge_size = spatial_merge_size - self.temporal_patch_size = temporal_patch_size - self.out_hidden_size = out_hidden_size - self.intermediate_size = intermediate_size - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout + depth: int = 24 + hidden_size: int = 1536 + hidden_act: str = "silu" + attention_bias: bool = False + attention_dropout: float | int = 0.0 + num_heads: int = 12 + in_channels: int = 3 + image_size: int | list[int] | tuple[int, int] = 336 + patch_size: int | list[int] | tuple[int, int] = 14 + rms_norm_eps: float = 1e-05 + spatial_merge_size: int = 2 + temporal_patch_size: int | list[int] | tuple[int, int] = 2 + out_hidden_size: int = 4096 + intermediate_size: int = 13696 + initializer_range: float 
= 0.02 @auto_docstring(checkpoint="zai-org/GLM-4.1V-9B-Thinking") +@strict(accept_kwargs=True) class Glm4vTextConfig(PreTrainedConfig): r""" Example: @@ -119,49 +102,32 @@ class Glm4vTextConfig(PreTrainedConfig): "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), "norm": (["hidden_states"], ["hidden_states"]), } - - def __init__( - self, - vocab_size: int | None = 151552, - hidden_size: int | None = 4096, - intermediate_size: int | None = 13696, - num_hidden_layers: int | None = 40, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = 2, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 32768, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-05, - use_cache: bool | None = True, - attention_dropout: float | None = 0.0, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - pad_token_id: int | None = None, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_dropout = attention_dropout - self.rope_parameters = rope_parameters - self.pad_token_id = pad_token_id - - super().__init__(ignore_keys_at_rope_validation={"mrope_section"}, **kwargs) + ignore_keys_at_rope_validation = {"mrope_section"} + + vocab_size: int = 151552 + hidden_size: int = 4096 + intermediate_size: int = 13696 + num_hidden_layers: int = 40 + num_attention_heads: int = 32 + num_key_value_heads: int | None = 2 + hidden_act: str = "silu" + max_position_embeddings: int = 32768 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-05 + use_cache: bool = True + attention_dropout: float | int = 0.0 + rope_parameters: RopeParameters | dict | None = None + pad_token_id: int | None = None + + def __post_init__(self, **kwargs): + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads + + super().__post_init__(**kwargs) @auto_docstring(checkpoint="zai-org/GLM-4.1V-9B-Thinking") +@strict(accept_kwargs=True) class Glm4vConfig(PreTrainedConfig): r""" image_start_token_id (`int`, *optional*, defaults to 151339): @@ -191,38 +157,28 @@ class Glm4vConfig(PreTrainedConfig): sub_configs = {"vision_config": Glm4vVisionConfig, "text_config": Glm4vTextConfig} keys_to_ignore_at_inference = ["past_key_values"] - def __init__( - self, - text_config=None, - vision_config=None, - image_token_id=151343, - video_token_id=151344, - image_start_token_id=151339, - image_end_token_id=151340, - video_start_token_id=151341, - video_end_token_id=151342, - tie_word_embeddings=False, - **kwargs, - ): - if isinstance(vision_config, dict): - self.vision_config = self.sub_configs["vision_config"](**vision_config) - elif vision_config is None: - self.vision_config = self.sub_configs["vision_config"]() - - if isinstance(text_config, dict): - self.text_config = self.sub_configs["text_config"](**text_config) - elif text_config is None: + text_config: dict | PreTrainedConfig | None = None + vision_config: dict | PreTrainedConfig | None = None + image_token_id: int = 151343 + 
video_token_id: int = 151344 + image_start_token_id: int = 151339 + image_end_token_id: int = 151340 + video_start_token_id: int = 151341 + video_end_token_id: int = 151342 + tie_word_embeddings: bool = False + + def __post_init__(self, **kwargs): + if isinstance(self.vision_config, dict): + self.vision_config = self.sub_configs["vision_config"](**self.vision_config) + elif self.vision_config is None: + self.vision_config = self.sub_configs["vision_config"](**kwargs) + + if isinstance(self.text_config, dict): + self.text_config = self.sub_configs["text_config"](**self.text_config) + elif self.text_config is None: self.text_config = self.sub_configs["text_config"](**kwargs) - self.image_token_id = image_token_id - self.video_token_id = video_token_id - self.video_start_token_id = video_start_token_id - self.video_end_token_id = video_end_token_id - self.image_start_token_id = image_start_token_id - self.image_end_token_id = image_end_token_id - self.tie_word_embeddings = tie_word_embeddings - - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["Glm4vConfig", "Glm4vTextConfig", "Glm4vVisionConfig"] diff --git a/src/transformers/models/glm4v/modular_glm4v.py b/src/transformers/models/glm4v/modular_glm4v.py index f57b77bb99d1..7af59c1916c6 100644 --- a/src/transformers/models/glm4v/modular_glm4v.py +++ b/src/transformers/models/glm4v/modular_glm4v.py @@ -18,6 +18,7 @@ import torch import torch.nn as nn import torch.nn.functional as F +from huggingface_hub.dataclasses import strict from torch.nn import LayerNorm from ... import initialization as init @@ -68,6 +69,7 @@ @auto_docstring(checkpoint="zai-org/GLM-4.1V-9B-Thinking") +@strict(accept_kwargs=True) class Glm4vVisionConfig(PreTrainedConfig): r""" out_hidden_size (`int`, *optional*, defaults to 4096): @@ -91,45 +93,25 @@ class Glm4vVisionConfig(PreTrainedConfig): model_type = "glm4v_vision" base_config_key = "vision_config" - def __init__( - self, - depth=24, - hidden_size=1536, - hidden_act="silu", - attention_bias=False, - attention_dropout=0.0, - num_heads=12, - in_channels=3, - image_size=336, - patch_size=14, - rms_norm_eps=1e-05, - spatial_merge_size=2, - temporal_patch_size=2, - out_hidden_size=4096, - intermediate_size=13696, - initializer_range=0.02, - **kwargs, - ): - super().__init__(**kwargs) - - self.depth = depth - self.hidden_size = hidden_size - self.hidden_act = hidden_act - self.num_heads = num_heads - self.in_channels = in_channels - self.image_size = image_size - self.patch_size = patch_size - self.spatial_merge_size = spatial_merge_size - self.temporal_patch_size = temporal_patch_size - self.out_hidden_size = out_hidden_size - self.intermediate_size = intermediate_size - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout + depth: int = 24 + hidden_size: int = 1536 + hidden_act: str = "silu" + attention_bias: bool = False + attention_dropout: float | int = 0.0 + num_heads: int = 12 + in_channels: int = 3 + image_size: int | list[int] | tuple[int, int] = 336 + patch_size: int | list[int] | tuple[int, int] = 14 + rms_norm_eps: float = 1e-05 + spatial_merge_size: int = 2 + temporal_patch_size: int | list[int] | tuple[int, int] = 2 + out_hidden_size: int = 4096 + intermediate_size: int = 13696 + initializer_range: float = 0.02 @auto_docstring(checkpoint="zai-org/GLM-4.1V-9B-Thinking") +@strict(accept_kwargs=True) class Glm4vTextConfig(PreTrainedConfig): r""" Example: @@ -164,49 +146,32 @@ class 
Glm4vTextConfig(PreTrainedConfig): "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), "norm": (["hidden_states"], ["hidden_states"]), } - - def __init__( - self, - vocab_size: int | None = 151552, - hidden_size: int | None = 4096, - intermediate_size: int | None = 13696, - num_hidden_layers: int | None = 40, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = 2, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 32768, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-05, - use_cache: bool | None = True, - attention_dropout: float | None = 0.0, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - pad_token_id: int | None = None, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_dropout = attention_dropout - self.rope_parameters = rope_parameters - self.pad_token_id = pad_token_id - - super().__init__(ignore_keys_at_rope_validation={"mrope_section"}, **kwargs) + ignore_keys_at_rope_validation = {"mrope_section"} + + vocab_size: int = 151552 + hidden_size: int = 4096 + intermediate_size: int = 13696 + num_hidden_layers: int = 40 + num_attention_heads: int = 32 + num_key_value_heads: int | None = 2 + hidden_act: str = "silu" + max_position_embeddings: int = 32768 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-05 + use_cache: bool = True + attention_dropout: float | int = 0.0 + rope_parameters: RopeParameters | dict | None = None + pad_token_id: int | None = None + + def __post_init__(self, **kwargs): + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads + + super().__post_init__(**kwargs) @auto_docstring(checkpoint="zai-org/GLM-4.1V-9B-Thinking") +@strict(accept_kwargs=True) class Glm4vConfig(PreTrainedConfig): r""" image_start_token_id (`int`, *optional*, defaults to 151339): @@ -236,38 +201,28 @@ class Glm4vConfig(PreTrainedConfig): sub_configs = {"vision_config": Glm4vVisionConfig, "text_config": Glm4vTextConfig} keys_to_ignore_at_inference = ["past_key_values"] - def __init__( - self, - text_config=None, - vision_config=None, - image_token_id=151343, - video_token_id=151344, - image_start_token_id=151339, - image_end_token_id=151340, - video_start_token_id=151341, - video_end_token_id=151342, - tie_word_embeddings=False, - **kwargs, - ): - if isinstance(vision_config, dict): - self.vision_config = self.sub_configs["vision_config"](**vision_config) - elif vision_config is None: - self.vision_config = self.sub_configs["vision_config"]() - - if isinstance(text_config, dict): - self.text_config = self.sub_configs["text_config"](**text_config) - elif text_config is None: + text_config: dict | PreTrainedConfig | None = None + vision_config: dict | PreTrainedConfig | None = None + image_token_id: int = 151343 + video_token_id: int = 151344 + image_start_token_id: int = 151339 + image_end_token_id: int = 151340 + video_start_token_id: int = 151341 + video_end_token_id: int = 151342 + 
tie_word_embeddings: bool = False + + def __post_init__(self, **kwargs): + if isinstance(self.vision_config, dict): + self.vision_config = self.sub_configs["vision_config"](**self.vision_config) + elif self.vision_config is None: + self.vision_config = self.sub_configs["vision_config"](**kwargs) + + if isinstance(self.text_config, dict): + self.text_config = self.sub_configs["text_config"](**self.text_config) + elif self.text_config is None: self.text_config = self.sub_configs["text_config"](**kwargs) - self.image_token_id = image_token_id - self.video_token_id = video_token_id - self.video_start_token_id = video_start_token_id - self.video_end_token_id = video_end_token_id - self.image_start_token_id = image_start_token_id - self.image_end_token_id = image_end_token_id - self.tie_word_embeddings = tie_word_embeddings - - super().__init__(**kwargs) + super().__post_init__(**kwargs) # Will be used for both Text and Vision modalities diff --git a/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py b/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py index c2c767b7c197..dd6ab4966282 100644 --- a/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py +++ b/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py @@ -17,12 +17,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring @auto_docstring(checkpoint="zai-org/GLM-4.5V") +@strict(accept_kwargs=True) class Glm4vMoeTextConfig(PreTrainedConfig): r""" n_group (`int`, *optional*, defaults to 1): @@ -66,74 +69,45 @@ class Glm4vMoeTextConfig(PreTrainedConfig): attribute_map = { "num_local_experts": "n_routed_experts", } + + vocab_size: int = 151424 + hidden_size: int = 4096 + intermediate_size: int = 10944 + num_hidden_layers: int = 46 + num_attention_heads: int = 96 + num_key_value_heads: int = 8 + hidden_act: str = "silu" + max_position_embeddings: int = 65536 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-5 + use_cache: bool = True + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = True + attention_dropout: float | int = 0.0 + moe_intermediate_size: int = 1408 + num_experts_per_tok: int = 8 + n_shared_experts: int = 1 + n_routed_experts: int = 128 + routed_scaling_factor: float = 1.0 + n_group: int = 1 + topk_group: int = 1 + first_k_dense_replace: int = 1 + norm_topk_prob: bool = True + bos_token_id: int | None = None + eos_token_id: int | list[int] | None = None + pad_token_id: int | None = None base_config_key = "text_config" + ignore_keys_at_rope_validation = {"mrope_section"} + router_aux_loss_coef: float = 0.0001 - def __init__( - self, - vocab_size: int | None = 151424, - hidden_size: int | None = 4096, - intermediate_size: int | None = 10944, - num_hidden_layers: int | None = 46, - num_attention_heads: int | None = 96, - num_key_value_heads: int | None = 8, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 65536, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-5, - use_cache: bool | None = True, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = True, - attention_dropout: float | None = 0.0, - 
moe_intermediate_size: int | None = 1408, - num_experts_per_tok: int | None = 8, - n_shared_experts: int | None = 1, - n_routed_experts: int | None = 128, - routed_scaling_factor: float | None = 1.0, - n_group: int | None = 1, - topk_group: int | None = 1, - first_k_dense_replace: int | None = 1, - norm_topk_prob: bool | None = True, - pad_token_id: int | None = None, - eos_token_id: int | None = None, - bos_token_id: int | None = None, - router_aux_loss_coef: float | None = 0.0001, - **kwargs, - ): - self.pad_token_id = pad_token_id - self.eos_token_id = eos_token_id - self.bos_token_id = bos_token_id - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.rope_parameters = rope_parameters + def __post_init__(self, **kwargs): kwargs.setdefault("partial_rotary_factor", 0.5) # assign default for BC - - # MoE arguments - self.moe_intermediate_size = moe_intermediate_size - self.num_experts_per_tok = num_experts_per_tok - self.n_group = n_group - self.topk_group = topk_group - self.n_shared_experts = n_shared_experts - self.n_routed_experts = n_routed_experts - self.routed_scaling_factor = routed_scaling_factor - self.first_k_dense_replace = first_k_dense_replace - self.norm_topk_prob = norm_topk_prob - self.router_aux_loss_coef = router_aux_loss_coef - super().__init__(ignore_keys_at_rope_validation={"mrope_section"}, **kwargs) + super().__post_init__(**kwargs) @auto_docstring(checkpoint="zai-org/GLM-4.1V-9B-Thinking") +@strict(accept_kwargs=True) class Glm4vMoeVisionConfig(PreTrainedConfig): r""" out_hidden_size (`int`, *optional*, defaults to 4096): @@ -157,45 +131,25 @@ class Glm4vMoeVisionConfig(PreTrainedConfig): model_type = "glm4v_moe_vision" base_config_key = "vision_config" - def __init__( - self, - depth=24, - hidden_size=1536, - hidden_act="silu", - attention_bias=False, - attention_dropout=0.0, - num_heads=12, - in_channels=3, - image_size=336, - patch_size=14, - rms_norm_eps=1e-05, - spatial_merge_size=2, - temporal_patch_size=2, - out_hidden_size=4096, - intermediate_size=13696, - initializer_range=0.02, - **kwargs, - ): - super().__init__(**kwargs) - - self.depth = depth - self.hidden_size = hidden_size - self.hidden_act = hidden_act - self.num_heads = num_heads - self.in_channels = in_channels - self.image_size = image_size - self.patch_size = patch_size - self.spatial_merge_size = spatial_merge_size - self.temporal_patch_size = temporal_patch_size - self.out_hidden_size = out_hidden_size - self.intermediate_size = intermediate_size - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout + depth: int = 24 + hidden_size: int = 1536 + hidden_act: str = "silu" + attention_bias: bool = False + attention_dropout: float | int = 0.0 + num_heads: int = 12 + in_channels: int = 3 + image_size: int | list[int] | tuple[int, int] = 336 + patch_size: int | list[int] | tuple[int, int] = 14 + rms_norm_eps: float = 1e-05 + spatial_merge_size: int = 2 + temporal_patch_size: int | list[int] | tuple[int, int] = 2 
+ out_hidden_size: int = 4096 + intermediate_size: int = 13696 + initializer_range: float = 0.02 @auto_docstring(checkpoint="zai-org/GLM-4.5V") +@strict(accept_kwargs=True) class Glm4vMoeConfig(PreTrainedConfig): r""" image_start_token_id (`int`, *optional*, defaults to 151339): @@ -224,38 +178,29 @@ class Glm4vMoeConfig(PreTrainedConfig): sub_configs = {"vision_config": Glm4vMoeVisionConfig, "text_config": Glm4vMoeTextConfig} keys_to_ignore_at_inference = ["past_key_values"] - def __init__( - self, - text_config=None, - vision_config=None, - image_token_id=151363, - video_token_id=151364, - image_start_token_id=151339, - image_end_token_id=151340, - video_start_token_id=151341, - video_end_token_id=151342, - tie_word_embeddings=False, - **kwargs, - ): - if isinstance(vision_config, dict): - self.vision_config = self.sub_configs["vision_config"](**vision_config) - elif vision_config is None: - self.vision_config = self.sub_configs["vision_config"]() - - if isinstance(text_config, dict): - self.text_config = self.sub_configs["text_config"](**text_config) - elif text_config is None: + text_config: dict | PreTrainedConfig | None = None + vision_config: dict | PreTrainedConfig | None = None + + image_token_id: int = 151363 + video_token_id: int = 151364 + image_start_token_id: int = 151339 + image_end_token_id: int = 151340 + video_start_token_id: int = 151341 + video_end_token_id: int = 151342 + tie_word_embeddings: bool = False + + def __post_init__(self, **kwargs): + if isinstance(self.vision_config, dict): + self.vision_config = self.sub_configs["vision_config"](**self.vision_config) + elif self.vision_config is None: + self.vision_config = self.sub_configs["vision_config"](**kwargs) + + if isinstance(self.text_config, dict): + self.text_config = self.sub_configs["text_config"](**self.text_config) + elif self.text_config is None: self.text_config = self.sub_configs["text_config"](**kwargs) - self.image_token_id = image_token_id - self.video_token_id = video_token_id - self.video_start_token_id = video_start_token_id - self.video_end_token_id = video_end_token_id - self.image_start_token_id = image_start_token_id - self.image_end_token_id = image_end_token_id - self.tie_word_embeddings = tie_word_embeddings - - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["Glm4vMoeConfig", "Glm4vMoeVisionConfig", "Glm4vMoeTextConfig"] diff --git a/src/transformers/models/glm4v_moe/modular_glm4v_moe.py b/src/transformers/models/glm4v_moe/modular_glm4v_moe.py index bee1dcd16a20..238082786da9 100644 --- a/src/transformers/models/glm4v_moe/modular_glm4v_moe.py +++ b/src/transformers/models/glm4v_moe/modular_glm4v_moe.py @@ -15,14 +15,13 @@ import torch import torch.nn as nn +from huggingface_hub.dataclasses import strict from ... 
import initialization as init from ...cache_utils import Cache, DynamicCache -from ...configuration_utils import PreTrainedConfig from ...masking_utils import create_causal_mask from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_outputs import MoeModelOutputWithPast -from ...modeling_rope_utils import RopeParameters from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring, logging @@ -57,6 +56,7 @@ @auto_docstring(checkpoint="zai-org/GLM-4.5V") +@strict(accept_kwargs=True) class Glm4vMoeTextConfig(Glm4MoeConfig): r""" n_group (`int`, *optional*, defaults to 1): @@ -98,73 +98,20 @@ class Glm4vMoeTextConfig(Glm4MoeConfig): "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), "norm": (["hidden_states"], ["hidden_states"]), } + ignore_keys_at_rope_validation = {"mrope_section"} - def __init__( - self, - vocab_size: int | None = 151424, - hidden_size: int | None = 4096, - intermediate_size: int | None = 10944, - num_hidden_layers: int | None = 46, - num_attention_heads: int | None = 96, - num_key_value_heads: int | None = 8, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 65536, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-5, - use_cache: bool | None = True, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = True, - attention_dropout: float | None = 0.0, - moe_intermediate_size: int | None = 1408, - num_experts_per_tok: int | None = 8, - n_shared_experts: int | None = 1, - n_routed_experts: int | None = 128, - routed_scaling_factor: float | None = 1.0, - n_group: int | None = 1, - topk_group: int | None = 1, - first_k_dense_replace: int | None = 1, - norm_topk_prob: bool | None = True, - pad_token_id: int | None = None, - eos_token_id: int | None = None, - bos_token_id: int | None = None, - router_aux_loss_coef: float | None = 0.0001, - **kwargs, - ): - self.pad_token_id = pad_token_id - self.eos_token_id = eos_token_id - self.bos_token_id = bos_token_id - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.rope_parameters = rope_parameters - kwargs.setdefault("partial_rotary_factor", 0.5) # assign default for BC - - # MoE arguments - self.moe_intermediate_size = moe_intermediate_size - self.num_experts_per_tok = num_experts_per_tok - self.n_group = n_group - self.topk_group = topk_group - self.n_shared_experts = n_shared_experts - self.n_routed_experts = n_routed_experts - self.routed_scaling_factor = routed_scaling_factor - self.first_k_dense_replace = first_k_dense_replace - self.norm_topk_prob = norm_topk_prob - self.router_aux_loss_coef = router_aux_loss_coef - PreTrainedConfig.__init__(self, ignore_keys_at_rope_validation={"mrope_section"}, **kwargs) + vocab_size: int = 151424 + max_position_embeddings: int = 65536 + attention_bias: bool = True + router_aux_loss_coef: float = 0.0001 + use_qk_norm = AttributeError() + + def __post_init__(self, **kwargs): + 
super().__post_init__(self, **kwargs) @auto_docstring(checkpoint="zai-org/GLM-4.5V") +@strict(accept_kwargs=True) class Glm4vMoeConfig(Glm4vConfig): r""" image_start_token_id (`int`, *optional*, defaults to 151339): @@ -189,20 +136,8 @@ class Glm4vMoeConfig(Glm4vConfig): >>> configuration = model.config ```""" - def __init__( - self, - text_config=None, - vision_config=None, - image_token_id=151363, - video_token_id=151364, - image_start_token_id=151339, - image_end_token_id=151340, - video_start_token_id=151341, - video_end_token_id=151342, - tie_word_embeddings=False, - **kwargs, - ): - super().__init__() + image_token_id: int = 151363 + video_token_id: int = 151364 class Glm4vMoeTextAttention(Glm4Attention): diff --git a/src/transformers/models/glm_image/configuration_glm_image.py b/src/transformers/models/glm_image/configuration_glm_image.py index 2e387660a4b9..f8a5cefcf24e 100644 --- a/src/transformers/models/glm_image/configuration_glm_image.py +++ b/src/transformers/models/glm_image/configuration_glm_image.py @@ -18,12 +18,15 @@ # See the License for the specific language governing permissions and # limitations under the License. +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring @auto_docstring(checkpoint="zai-org/GLM-Image") +@strict(accept_kwargs=True) class GlmImageVQVAEConfig(PreTrainedConfig): r""" num_embeddings (`int`, *optional*, defaults to 16384): @@ -33,24 +36,15 @@ class GlmImageVQVAEConfig(PreTrainedConfig): model_type = "glm_image_vqmodel" base_config_key = "vq_config" - def __init__( - self, - embed_dim: int = 2048, - num_embeddings: int = 16384, - latent_channels: int = 1536, - in_channels: int = 3, - initializer_range=0.02, - **kwargs, - ): - super().__init__(**kwargs) - self.embed_dim = embed_dim - self.num_embeddings = num_embeddings - self.latent_channels = latent_channels - self.in_channels = in_channels - self.initializer_range = initializer_range + embed_dim: int = 2048 + num_embeddings: int = 16384 + latent_channels: int = 1536 + in_channels: int = 3 + initializer_range: float = 0.02 @auto_docstring(checkpoint="zai-org/GLM-Image") +@strict(accept_kwargs=True) class GlmImageVisionConfig(PreTrainedConfig): r""" out_hidden_size (`int`, *optional*, defaults to 4096): @@ -74,41 +68,23 @@ class GlmImageVisionConfig(PreTrainedConfig): model_type = "glm_image_vision" base_config_key = "vision_config" - def __init__( - self, - depth=40, - hidden_size=1536, - hidden_act="gelu", - attention_bias=True, - attention_dropout=0.0, - num_heads=16, - in_channels=3, - image_size=2048, - patch_size=16, - layer_norm_eps=1e-06, - spatial_merge_size=1, - intermediate_size=6144, - initializer_range=0.02, - **kwargs, - ): - super().__init__(**kwargs) - - self.depth = depth - self.hidden_size = hidden_size - self.hidden_act = hidden_act - self.num_heads = num_heads - self.in_channels = in_channels - self.image_size = image_size - self.patch_size = patch_size - self.spatial_merge_size = spatial_merge_size - self.intermediate_size = intermediate_size - self.initializer_range = initializer_range - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.layer_norm_eps = layer_norm_eps + depth: int = 40 + hidden_size: int = 1536 + hidden_act: str = "gelu" + attention_bias: bool = True + attention_dropout: float | int = 0.0 + num_heads: int = 16 + in_channels: int = 3 + image_size: int | list[int] | tuple[int, int] = 2048 
+ patch_size: int | list[int] | tuple[int, int] = 16 + spatial_merge_size: int = 1 + intermediate_size: int = 6144 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-06 @auto_docstring(checkpoint="zai-org/GLM-Image") +@strict(accept_kwargs=True) class GlmImageTextConfig(PreTrainedConfig): r""" vision_vocab_size (`int`, *optional*, defaults to 16512): @@ -147,55 +123,35 @@ class GlmImageTextConfig(PreTrainedConfig): "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), "norm": (["hidden_states"], ["hidden_states"]), } - - def __init__( - self, - vocab_size: int = 168064, - hidden_size: int | None = 4096, - intermediate_size: int | None = 13696, - num_hidden_layers: int | None = 40, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = 2, - hidden_act: str | None = "silu", - max_position_embeddings: int = 131072, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-05, - use_cache: bool | None = True, - attention_dropout: float | None = 0.0, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - pad_token_id: int = 167841, - vision_vocab_size: int = 16512, - attention_bias: bool = True, - eos_token_id: int = 16385, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_dropout = attention_dropout - self.rope_parameters = rope_parameters - self.pad_token_id = pad_token_id - - super().__init__(ignore_keys_at_rope_validation={"mrope_section"}, **kwargs) - self.vision_vocab_size = vision_vocab_size - self.attention_bias = attention_bias - self.eos_token_id = eos_token_id + ignore_keys_at_rope_validation = {"mrope_section"} + + vocab_size: int = 168064 + hidden_size: int = 4096 + intermediate_size: int = 13696 + num_hidden_layers: int = 40 + num_attention_heads: int = 32 + num_key_value_heads: int | None = 2 + hidden_act: str = "silu" + max_position_embeddings: int = 131072 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-05 + use_cache: bool = True + attention_dropout: float | int = 0.0 + rope_parameters: RopeParameters | dict | None = None + pad_token_id: int = 167841 + vision_vocab_size: int = 16512 + attention_bias: bool = True + eos_token_id: int | list[int] | None = 16385 + + def __post_init__(self, **kwargs): + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads + + super().__post_init__(**kwargs) @auto_docstring(checkpoint="zai-org/GLM-Image") +@strict(accept_kwargs=True) class GlmImageConfig(PreTrainedConfig): r""" image_start_token_id (`int`, *optional*, defaults to 16384): @@ -224,40 +180,31 @@ class GlmImageConfig(PreTrainedConfig): } keys_to_ignore_at_inference = ["past_key_values"] - def __init__( - self, - text_config=None, - vision_config=None, - vq_config=None, - image_token_id=167855, - image_start_token_id=16384, - image_end_token_id=16385, - tie_word_embeddings: bool | None = False, - **kwargs, - ): - if isinstance(vision_config, dict): - vision_config = self.sub_configs["vision_config"](**vision_config) 
- elif vision_config is None: - vision_config = self.sub_configs["vision_config"](**kwargs) - - if isinstance(vq_config, dict): - vq_config = self.sub_configs["vq_config"](**vq_config) - elif vq_config is None: - vq_config = self.sub_configs["vq_config"](**kwargs) - - if isinstance(text_config, dict): - text_config = self.sub_configs["text_config"](**text_config) - elif text_config is None: - text_config = self.sub_configs["text_config"](**kwargs) - - self.image_token_id = image_token_id - self.image_start_token_id = image_start_token_id - self.image_end_token_id = image_end_token_id - self.text_config = text_config - self.vision_config = vision_config - self.vq_config = vq_config - self.tie_word_embeddings = tie_word_embeddings - super().__init__(**kwargs) + text_config: dict | PreTrainedConfig | None = None + vision_config: dict | PreTrainedConfig | None = None + vq_config: dict | PreTrainedConfig | None = None + image_token_id: int = 167855 + image_start_token_id: int = 16384 + image_end_token_id: int = 16385 + tie_word_embeddings: bool = False + + def __post_init__(self, **kwargs): + if isinstance(self.vision_config, dict): + self.vision_config = self.sub_configs["vision_config"](**self.vision_config) + elif self.vision_config is None: + self.vision_config = self.sub_configs["vision_config"](**kwargs) + + if isinstance(self.vq_config, dict): + self.vq_config = self.sub_configs["vq_config"](**self.vq_config) + elif self.vq_config is None: + self.vq_config = self.sub_configs["vq_config"](**kwargs) + + if isinstance(self.text_config, dict): + self.text_config = self.sub_configs["text_config"](**self.text_config) + elif self.text_config is None: + self.text_config = self.sub_configs["text_config"](**kwargs) + + super().__post_init__(**kwargs) __all__ = ["GlmImageVQVAEConfig", "GlmImageVisionConfig", "GlmImageTextConfig", "GlmImageConfig"] diff --git a/src/transformers/models/glm_image/modular_glm_image.py b/src/transformers/models/glm_image/modular_glm_image.py index bfdcbfbac514..05a5cac81493 100644 --- a/src/transformers/models/glm_image/modular_glm_image.py +++ b/src/transformers/models/glm_image/modular_glm_image.py @@ -19,6 +19,7 @@ import numpy as np import torch.nn as nn import torch.nn.functional as F +from huggingface_hub.dataclasses import strict from ...cache_utils import Cache from ...configuration_utils import PreTrainedConfig @@ -60,6 +61,7 @@ @auto_docstring(checkpoint="zai-org/GLM-Image") +@strict(accept_kwargs=True) class GlmImageVQVAEConfig(PreTrainedConfig): r""" num_embeddings (`int`, *optional*, defaults to 16384): @@ -69,53 +71,36 @@ class GlmImageVQVAEConfig(PreTrainedConfig): model_type = "glm_image_vqmodel" base_config_key = "vq_config" - def __init__( - self, - embed_dim: int = 2048, - num_embeddings: int = 16384, - latent_channels: int = 1536, - in_channels: int = 3, - initializer_range=0.02, - **kwargs, - ): - super().__init__(**kwargs) - self.embed_dim = embed_dim - self.num_embeddings = num_embeddings - self.latent_channels = latent_channels - self.in_channels = in_channels - self.initializer_range = initializer_range + embed_dim: int = 2048 + num_embeddings: int = 16384 + latent_channels: int = 1536 + in_channels: int = 3 + initializer_range: float = 0.02 @auto_docstring(checkpoint="zai-org/GLM-Image") +@strict(accept_kwargs=True) class GlmImageVisionConfig(Glm4vVisionConfig): model_type = "glm_image_vision" base_config_key = "vision_config" - def __init__( - self, - depth=40, - hidden_size=1536, - hidden_act="gelu", - attention_bias=True, - 
attention_dropout=0.0, - num_heads=16, - in_channels=3, - image_size=2048, - patch_size=16, - layer_norm_eps=1e-06, - spatial_merge_size=1, - intermediate_size=6144, - initializer_range=0.02, - **kwargs, - ): - super().__init__(**kwargs) - del self.out_hidden_size - del self.rms_norm_eps - del self.temporal_patch_size - self.layer_norm_eps = layer_norm_eps + depth: int = 40 + hidden_act: str = "gelu" + attention_bias: bool = True + num_heads: int = 16 + image_size: int | list[int] | tuple[int, int] = 2048 + patch_size: int | list[int] | tuple[int, int] = 16 + layer_norm_eps: float = 1e-06 + spatial_merge_size: int = 1 + intermediate_size: int = 6144 + + out_hidden_size = AttributeError() + rms_norm_eps = AttributeError() + temporal_patch_size = AttributeError() @auto_docstring(checkpoint="zai-org/GLM-Image") +@strict(accept_kwargs=True) class GlmImageTextConfig(Glm4vTextConfig): r""" vision_vocab_size (`int`, *optional*, defaults to 16512): @@ -137,28 +122,16 @@ class GlmImageTextConfig(Glm4vTextConfig): >>> configuration = model.config ```""" - def __init__( - self, - vocab_size: int = 168064, - max_position_embeddings: int = 131072, - vision_vocab_size: int = 16512, - attention_bias: bool = True, - pad_token_id: int = 167841, - eos_token_id: int = 16385, - **super_kwargs, - ): - super().__init__( - vocab_size=vocab_size, - max_position_embeddings=max_position_embeddings, - pad_token_id=pad_token_id, - **super_kwargs, - ) - self.vision_vocab_size = vision_vocab_size - self.attention_bias = attention_bias - self.eos_token_id = eos_token_id + vocab_size: int = 168064 + max_position_embeddings: int = 131072 + vision_vocab_size: int = 16512 + attention_bias: bool = True + pad_token_id: int = 167841 + eos_token_id: int | list[int] | None = 16385 @auto_docstring(checkpoint="zai-org/GLM-Image") +@strict(accept_kwargs=True) class GlmImageConfig(PreTrainedConfig): r""" image_start_token_id (`int`, *optional*, defaults to 16384): @@ -187,40 +160,31 @@ class GlmImageConfig(PreTrainedConfig): } keys_to_ignore_at_inference = ["past_key_values"] - def __init__( - self, - text_config=None, - vision_config=None, - vq_config=None, - image_token_id=167855, - image_start_token_id=16384, - image_end_token_id=16385, - tie_word_embeddings: bool | None = False, - **kwargs, - ): - if isinstance(vision_config, dict): - vision_config = self.sub_configs["vision_config"](**vision_config) - elif vision_config is None: - vision_config = self.sub_configs["vision_config"](**kwargs) - - if isinstance(vq_config, dict): - vq_config = self.sub_configs["vq_config"](**vq_config) - elif vq_config is None: - vq_config = self.sub_configs["vq_config"](**kwargs) - - if isinstance(text_config, dict): - text_config = self.sub_configs["text_config"](**text_config) - elif text_config is None: - text_config = self.sub_configs["text_config"](**kwargs) - - self.image_token_id = image_token_id - self.image_start_token_id = image_start_token_id - self.image_end_token_id = image_end_token_id - self.text_config = text_config - self.vision_config = vision_config - self.vq_config = vq_config - self.tie_word_embeddings = tie_word_embeddings - super().__init__(**kwargs) + text_config: dict | PreTrainedConfig | None = None + vision_config: dict | PreTrainedConfig | None = None + vq_config: dict | PreTrainedConfig | None = None + image_token_id: int = 167855 + image_start_token_id: int = 16384 + image_end_token_id: int = 16385 + tie_word_embeddings: bool = False + + def __post_init__(self, **kwargs): + if isinstance(self.vision_config, dict): + 
self.vision_config = self.sub_configs["vision_config"](**self.vision_config) + elif self.vision_config is None: + self.vision_config = self.sub_configs["vision_config"](**kwargs) + + if isinstance(self.vq_config, dict): + self.vq_config = self.sub_configs["vq_config"](**self.vq_config) + elif self.vq_config is None: + self.vq_config = self.sub_configs["vq_config"](**kwargs) + + if isinstance(self.text_config, dict): + self.text_config = self.sub_configs["text_config"](**self.text_config) + elif self.text_config is None: + self.text_config = self.sub_configs["text_config"](**kwargs) + + super().__post_init__(**kwargs) class GlmImageVisionMLP(SiglipMLP): diff --git a/src/transformers/models/glm_moe_dsa/configuration_glm_moe_dsa.py b/src/transformers/models/glm_moe_dsa/configuration_glm_moe_dsa.py index 5a8d95fe222e..88a0b3314afd 100644 --- a/src/transformers/models/glm_moe_dsa/configuration_glm_moe_dsa.py +++ b/src/transformers/models/glm_moe_dsa/configuration_glm_moe_dsa.py @@ -18,13 +18,15 @@ # See the License for the specific language governing permissions and # limitations under the License. +from huggingface_hub.dataclasses import strict -from ...configuration_utils import PreTrainedConfig, layer_type_validation +from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring @auto_docstring(checkpoint="zai-org/GLM-5") +@strict(accept_kwargs=True) class GlmMoeDsaConfig(PreTrainedConfig): r""" n_group (`int`, *optional*, defaults to 1): @@ -79,102 +81,55 @@ class GlmMoeDsaConfig(PreTrainedConfig): } attribute_map = { "num_local_experts": "n_routed_experts", + "head_dim": "qk_rope_head_dim", } - def __init__( - self, - vocab_size: int | None = 154880, - hidden_size: int | None = 6144, - intermediate_size: int | None = 12288, - moe_intermediate_size: int | None = 2048, - num_hidden_layers: int | None = 78, - num_attention_heads: int | None = 64, - num_key_value_heads: int | None = 64, - n_shared_experts: int | None = 1, - n_routed_experts: int | None = 256, - routed_scaling_factor: float | None = 2.5, - kv_lora_rank: int | None = 512, - q_lora_rank: int | None = 2048, - qk_rope_head_dim: int | None = 64, - qk_nope_head_dim: int | None = 192, - v_head_dim: int | None = 256, - n_group: int | None = 1, - topk_group: int | None = 1, - num_experts_per_tok: int | None = 8, - norm_topk_prob: bool | None = True, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 202752, - initializer_range: float | None = 0.02, - rms_norm_eps: float | None = 1e-5, - use_cache: bool | None = True, - pad_token_id: int | None = None, - bos_token_id: int | None = 0, - eos_token_id: int | None = 1, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - mlp_layer_types=None, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - index_topk: int | None = 2048, - index_head_dim: int | None = 128, - index_n_heads: int | None = 32, - **kwargs, - ): - # Model dimensions - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.moe_intermediate_size = moe_intermediate_size - self.num_hidden_layers = num_hidden_layers - self.max_position_embeddings = max_position_embeddings - - # Attention dimensions (MLA) - self.num_attention_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads - self.kv_lora_rank = kv_lora_rank - self.q_lora_rank = q_lora_rank - 
self.qk_rope_head_dim = qk_rope_head_dim - self.qk_nope_head_dim = qk_nope_head_dim - self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim - self.v_head_dim = v_head_dim - self.head_dim = qk_rope_head_dim - - # MoE parameters - self.n_shared_experts = n_shared_experts - self.n_routed_experts = n_routed_experts - self.routed_scaling_factor = routed_scaling_factor - self.n_group = n_group - self.topk_group = topk_group - self.num_experts_per_tok = num_experts_per_tok - self.norm_topk_prob = norm_topk_prob + vocab_size: int = 154880 + + hidden_size: int = 6144 + intermediate_size: int = 12288 + moe_intermediate_size: int = 2048 + num_hidden_layers: int = 78 + num_attention_heads: int = 64 + num_key_value_heads: int = 64 + n_shared_experts: int = 1 + n_routed_experts: int = 256 + routed_scaling_factor: float = 2.5 + kv_lora_rank: int = 512 + q_lora_rank: int = 2048 + qk_rope_head_dim: int = 64 + v_head_dim: int = 256 + qk_nope_head_dim: int = 192 + n_group: int = 1 + topk_group: int = 1 + num_experts_per_tok: int = 8 + norm_topk_prob: bool = True + hidden_act: str = "silu" + max_position_embeddings: int = 202752 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-5 + use_cache: bool = True + pad_token_id: int | None = None + bos_token_id: int | None = 0 + eos_token_id: int | list[int] | None = 1 + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + mlp_layer_types: list[str] | None = None + attention_bias: bool = False + attention_dropout: float | int = 0.0 + index_topk: int = 2048 + index_head_dim: int = 128 + index_n_heads: int = 32 + + def __post_init__(self, **kwargs): + self.qk_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim # MLP layer types: first 3 dense, rest sparse - self.mlp_layer_types = mlp_layer_types if self.mlp_layer_types is None: - self.mlp_layer_types = ["dense"] * min(3, num_hidden_layers) + ["sparse"] * (num_hidden_layers - 3) - layer_type_validation(self.mlp_layer_types, self.num_hidden_layers, attention=False) - - # Indexer (DSA) parameters - self.index_topk = index_topk - self.index_head_dim = index_head_dim - self.index_n_heads = index_n_heads - - # General config - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.rope_parameters = rope_parameters - - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) + self.mlp_layer_types = ["dense"] * min(3, self.num_hidden_layers) + ["sparse"] * ( + self.num_hidden_layers - 3 + ) + super().__post_init__(**kwargs) __all__ = ["GlmMoeDsaConfig"] diff --git a/src/transformers/models/glm_moe_dsa/modeling_glm_moe_dsa.py b/src/transformers/models/glm_moe_dsa/modeling_glm_moe_dsa.py index e1a2f4dd5b7f..d5ce799cd408 100644 --- a/src/transformers/models/glm_moe_dsa/modeling_glm_moe_dsa.py +++ b/src/transformers/models/glm_moe_dsa/modeling_glm_moe_dsa.py @@ -18,7 +18,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
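As a rough illustration of the recipe applied throughout this patch (the class below is hypothetical and not part of the diff): annotated class-level defaults replace the old `__init__` parameters, derived values and backward-compatible defaults move into `__post_init__`, and the method ends by delegating to `super().__post_init__(**kwargs)`. This sketch assumes a transformers build that already contains the dataclass-based `PreTrainedConfig` from this patch and a `huggingface_hub` release providing `dataclasses.strict`; `ToyMoeConfig` and its fields are invented for illustration only.

from huggingface_hub.dataclasses import strict

from transformers.configuration_utils import PreTrainedConfig


@strict(accept_kwargs=True)
class ToyMoeConfig(PreTrainedConfig):
    model_type = "toy_moe"

    # Annotated class attributes replace the old __init__ parameters and are
    # validated by the strict-dataclass machinery, mirroring the configs above.
    hidden_size: int = 64
    num_hidden_layers: int = 4
    mlp_layer_types: list[str] | None = None

    def __post_init__(self, **kwargs):
        # Derived values that used to live in __init__ are computed here,
        # then the base class finishes validation and common setup.
        if self.mlp_layer_types is None:
            self.mlp_layer_types = ["dense"] + ["sparse"] * (self.num_hidden_layers - 1)
        super().__post_init__(**kwargs)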
- from collections.abc import Callable from typing import Optional diff --git a/src/transformers/models/glm_moe_dsa/modular_glm_moe_dsa.py b/src/transformers/models/glm_moe_dsa/modular_glm_moe_dsa.py index 96ff01d0cc9e..02dd6074b869 100644 --- a/src/transformers/models/glm_moe_dsa/modular_glm_moe_dsa.py +++ b/src/transformers/models/glm_moe_dsa/modular_glm_moe_dsa.py @@ -12,17 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. - from collections.abc import Callable import torch import torch.nn as nn import torch.nn.functional as F +from huggingface_hub.dataclasses import strict from ...cache_utils import Cache -from ...configuration_utils import PreTrainedConfig, layer_type_validation +from ...configuration_utils import PreTrainedConfig from ...modeling_flash_attention_utils import FlashAttentionKwargs -from ...modeling_rope_utils import RopeParameters from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...models.llama.modeling_llama import rotate_half from ...processing_utils import Unpack @@ -34,6 +33,7 @@ Glm4MoePreTrainedModel, Glm4MoeRMSNorm, ) +from ..glm4_moe_lite.configuration_glm4_moe_lite import Glm4MoeLiteConfig from ..glm4_moe_lite.modeling_glm4_moe_lite import ( Glm4MoeLiteDecoderLayer, eager_attention_forward, @@ -75,7 +75,8 @@ def apply_rotary_pos_emb( @auto_docstring(checkpoint="zai-org/GLM-5") -class GlmMoeDsaConfig(PreTrainedConfig): +@strict(accept_kwargs=True) +class GlmMoeDsaConfig(Glm4MoeLiteConfig): r""" n_group (`int`, *optional*, defaults to 1): Number of groups for routed experts. @@ -104,9 +105,6 @@ class GlmMoeDsaConfig(PreTrainedConfig): >>> configuration = model.config ```""" - model_type = "glm_moe_dsa" - keys_to_ignore_at_inference = ["past_key_values"] - base_model_tp_plan = { "layers.*.self_attn.q_b_proj": "colwise", "layers.*.self_attn.kv_a_proj_with_mqa": "mla_kv_a_proj", @@ -122,109 +120,32 @@ class GlmMoeDsaConfig(PreTrainedConfig): "layers.*.mlp.up_proj": "colwise", "layers.*.mlp.down_proj": "rowwise", } - base_model_pp_plan = { - "embed_tokens": (["input_ids"], ["inputs_embeds"]), - "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), - "norm": (["hidden_states"], ["hidden_states"]), - } - attribute_map = { - "num_local_experts": "n_routed_experts", - } - def __init__( - self, - vocab_size: int | None = 154880, - hidden_size: int | None = 6144, - intermediate_size: int | None = 12288, - moe_intermediate_size: int | None = 2048, - num_hidden_layers: int | None = 78, - num_attention_heads: int | None = 64, - num_key_value_heads: int | None = 64, - n_shared_experts: int | None = 1, - n_routed_experts: int | None = 256, - routed_scaling_factor: float | None = 2.5, - kv_lora_rank: int | None = 512, - q_lora_rank: int | None = 2048, - qk_rope_head_dim: int | None = 64, - qk_nope_head_dim: int | None = 192, - v_head_dim: int | None = 256, - n_group: int | None = 1, - topk_group: int | None = 1, - num_experts_per_tok: int | None = 8, - norm_topk_prob: bool | None = True, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 202752, - initializer_range: float | None = 0.02, - rms_norm_eps: float | None = 1e-5, - use_cache: bool | None = True, - pad_token_id: int | None = None, - bos_token_id: int | None = 0, - eos_token_id: int | None = 1, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - mlp_layer_types=None, - attention_bias: bool | None = False, - attention_dropout: float | None = 
0.0, - index_topk: int | None = 2048, - index_head_dim: int | None = 128, - index_n_heads: int | None = 32, - **kwargs, - ): - # Model dimensions - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.moe_intermediate_size = moe_intermediate_size - self.num_hidden_layers = num_hidden_layers - self.max_position_embeddings = max_position_embeddings - - # Attention dimensions (MLA) - self.num_attention_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads - self.kv_lora_rank = kv_lora_rank - self.q_lora_rank = q_lora_rank - self.qk_rope_head_dim = qk_rope_head_dim - self.qk_nope_head_dim = qk_nope_head_dim - self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim - self.v_head_dim = v_head_dim - self.head_dim = qk_rope_head_dim - - # MoE parameters - self.n_shared_experts = n_shared_experts - self.n_routed_experts = n_routed_experts - self.routed_scaling_factor = routed_scaling_factor - self.n_group = n_group - self.topk_group = topk_group - self.num_experts_per_tok = num_experts_per_tok - self.norm_topk_prob = norm_topk_prob + hidden_size: int = 6144 + intermediate_size: int = 12288 + moe_intermediate_size: int = 2048 + num_hidden_layers: int = 78 + num_attention_heads: int = 64 + num_key_value_heads: int = 64 + n_routed_experts: int = 256 + routed_scaling_factor: float = 2.5 + q_lora_rank: int = 2048 + num_experts_per_tok: int = 8 + index_topk: int = 2048 + index_head_dim: int = 128 + index_n_heads: int = 32 + pretraining_tp = AttributeError() + rope_interleave = AttributeError() + + def __post_init__(self, **kwargs): + self.qk_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim # MLP layer types: first 3 dense, rest sparse - self.mlp_layer_types = mlp_layer_types if self.mlp_layer_types is None: - self.mlp_layer_types = ["dense"] * min(3, num_hidden_layers) + ["sparse"] * (num_hidden_layers - 3) - layer_type_validation(self.mlp_layer_types, self.num_hidden_layers, attention=False) - - # Indexer (DSA) parameters - self.index_topk = index_topk - self.index_head_dim = index_head_dim - self.index_n_heads = index_n_heads - - # General config - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.rope_parameters = rope_parameters - - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) + self.mlp_layer_types = ["dense"] * min(3, self.num_hidden_layers) + ["sparse"] * ( + self.num_hidden_layers - 3 + ) + PreTrainedConfig.__post_init__(self, **kwargs) class GlmMoeDsaRMSNorm(Glm4MoeRMSNorm): diff --git a/src/transformers/models/glm_ocr/configuration_glm_ocr.py b/src/transformers/models/glm_ocr/configuration_glm_ocr.py index 1aa30d495de5..404817a4d0e2 100644 --- a/src/transformers/models/glm_ocr/configuration_glm_ocr.py +++ b/src/transformers/models/glm_ocr/configuration_glm_ocr.py @@ -18,12 +18,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
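The GlmMoeDsa conversion above illustrates the general shape of this refactor: the keyword arguments of `__init__` become class-level annotated fields with defaults, and derived values plus backward-compatibility logic move into `__post_init__`. A minimal, stdlib-only sketch of that shape (hypothetical `ToyConfig`, not the real `PreTrainedConfig`/`@strict` machinery):

from dataclasses import dataclass

@dataclass
class ToyConfig:
    # former __init__ keyword arguments become annotated class-level fields
    hidden_size: int = 6144
    qk_nope_head_dim: int = 192
    qk_rope_head_dim: int = 64
    num_hidden_layers: int = 78
    mlp_layer_types: list[str] | None = None

    def __post_init__(self):
        # derived values that used to be computed inside __init__
        self.qk_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim
        if self.mlp_layer_types is None:
            # first 3 layers dense, the rest sparse, mirroring the diff above
            self.mlp_layer_types = ["dense"] * min(3, self.num_hidden_layers) + ["sparse"] * (
                self.num_hidden_layers - 3
            )

config = ToyConfig()
assert config.qk_head_dim == 256 and config.mlp_layer_types[:3] == ["dense"] * 3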
+from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring @auto_docstring(checkpoint="zai-org/GLM-OCR") +@strict(accept_kwargs=True) class GlmOcrVisionConfig(PreTrainedConfig): r""" out_hidden_size (`int`, *optional*, defaults to 4096): @@ -47,45 +50,25 @@ class GlmOcrVisionConfig(PreTrainedConfig): model_type = "glm_ocr_vision" base_config_key = "vision_config" - def __init__( - self, - depth=24, - hidden_size=1024, - hidden_act="silu", - attention_bias=True, - attention_dropout=0.0, - num_heads=16, - in_channels=3, - image_size=336, - patch_size=14, - rms_norm_eps=1e-05, - spatial_merge_size=2, - temporal_patch_size=2, - out_hidden_size=1536, - intermediate_size=4096, - initializer_range=0.02, - **kwargs, - ): - super().__init__(**kwargs) - - self.depth = depth - self.hidden_size = hidden_size - self.hidden_act = hidden_act - self.num_heads = num_heads - self.in_channels = in_channels - self.image_size = image_size - self.patch_size = patch_size - self.spatial_merge_size = spatial_merge_size - self.temporal_patch_size = temporal_patch_size - self.out_hidden_size = out_hidden_size - self.intermediate_size = intermediate_size - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout + depth: int = 24 + hidden_size: int = 1024 + hidden_act: str = "silu" + attention_bias: bool = True + attention_dropout: float | int = 0.0 + num_heads: int = 16 + in_channels: int = 3 + image_size: int | list[int] | tuple[int, int] = 336 + patch_size: int | list[int] | tuple[int, int] = 14 + rms_norm_eps: float = 1e-05 + spatial_merge_size: int = 2 + temporal_patch_size: int | list[int] | tuple[int, int] = 2 + out_hidden_size: int = 1536 + intermediate_size: int = 4096 + initializer_range: float = 0.02 @auto_docstring(checkpoint="zai-org/GLM-OCR") +@strict(accept_kwargs=True) class GlmOcrTextConfig(PreTrainedConfig): r""" Example: @@ -120,49 +103,32 @@ class GlmOcrTextConfig(PreTrainedConfig): "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), "norm": (["hidden_states"], ["hidden_states"]), } - - def __init__( - self, - vocab_size: int | None = 59392, - hidden_size: int | None = 1024, - intermediate_size: int | None = 4096, - num_hidden_layers: int | None = 16, - num_attention_heads: int | None = 16, - num_key_value_heads: int | None = 8, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 131072, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-05, - use_cache: bool | None = True, - attention_dropout: float | None = 0.0, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - pad_token_id: int | None = None, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_dropout = attention_dropout - self.rope_parameters = rope_parameters - self.pad_token_id = 
pad_token_id - - super().__init__(ignore_keys_at_rope_validation={"mrope_section"}, **kwargs) + ignore_keys_at_rope_validation = {"mrope_section"} + + vocab_size: int = 59392 + hidden_size: int = 1024 + intermediate_size: int = 4096 + num_hidden_layers: int = 16 + num_attention_heads: int = 16 + num_key_value_heads: int = 8 + hidden_act: str = "silu" + max_position_embeddings: int = 131072 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-05 + use_cache: bool = True + attention_dropout: float | int = 0.0 + rope_parameters: RopeParameters | dict | None = None + pad_token_id: int | None = None + + def __post_init__(self, **kwargs): + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads + + super().__post_init__(**kwargs) @auto_docstring(checkpoint="zai-org/GLM-OCR") +@strict(accept_kwargs=True) class GlmOcrConfig(PreTrainedConfig): r""" image_start_token_id (`int`, *optional*, defaults to 59256): @@ -191,38 +157,29 @@ class GlmOcrConfig(PreTrainedConfig): sub_configs = {"vision_config": GlmOcrVisionConfig, "text_config": GlmOcrTextConfig} keys_to_ignore_at_inference = ["past_key_values"] - def __init__( - self, - text_config=None, - vision_config=None, - image_token_id=59280, - video_token_id=59281, - image_start_token_id=59256, - image_end_token_id=59257, - video_start_token_id=59258, - video_end_token_id=59259, - tie_word_embeddings=False, - **kwargs, - ): - if isinstance(vision_config, dict): - self.vision_config = self.sub_configs["vision_config"](**vision_config) - elif vision_config is None: - self.vision_config = self.sub_configs["vision_config"]() - - if isinstance(text_config, dict): - self.text_config = self.sub_configs["text_config"](**text_config) - elif text_config is None: + text_config: dict | PreTrainedConfig | None = None + vision_config: dict | PreTrainedConfig | None = None + + image_token_id: int = 59280 + video_token_id: int = 59281 + image_start_token_id: int = 59256 + image_end_token_id: int = 59257 + video_start_token_id: int = 59258 + video_end_token_id: int = 59259 + tie_word_embeddings: bool = False + + def __post_init__(self, **kwargs): + if isinstance(self.vision_config, dict): + self.vision_config = self.sub_configs["vision_config"](**self.vision_config) + elif self.vision_config is None: + self.vision_config = self.sub_configs["vision_config"](**kwargs) + + if isinstance(self.text_config, dict): + self.text_config = self.sub_configs["text_config"](**self.text_config) + elif self.text_config is None: self.text_config = self.sub_configs["text_config"](**kwargs) - self.image_token_id = image_token_id - self.video_token_id = video_token_id - self.video_start_token_id = video_start_token_id - self.video_end_token_id = video_end_token_id - self.image_start_token_id = image_start_token_id - self.image_end_token_id = image_end_token_id - self.tie_word_embeddings = tie_word_embeddings - - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["GlmOcrConfig", "GlmOcrTextConfig", "GlmOcrVisionConfig"] diff --git a/src/transformers/models/glm_ocr/modular_glm_ocr.py b/src/transformers/models/glm_ocr/modular_glm_ocr.py index 21969cd129ca..30dd5fafeffd 100644 --- a/src/transformers/models/glm_ocr/modular_glm_ocr.py +++ b/src/transformers/models/glm_ocr/modular_glm_ocr.py @@ -17,6 +17,7 @@ import torch import torch.nn as nn import torch.nn.functional as F +from huggingface_hub.dataclasses import strict from ...modeling_outputs import BaseModelOutputWithPooling from ...modeling_utils import 
ALL_ATTENTION_FUNCTIONS @@ -53,23 +54,17 @@ def __init__(self, config, bias: bool = True): @auto_docstring(checkpoint="zai-org/GLM-OCR") +@strict(accept_kwargs=True) class GlmOcrVisionConfig(Glm4vVisionConfig): - def __init__( - self, - depth=24, - hidden_size=1024, - hidden_act="silu", - attention_bias=True, - num_heads=16, - image_size=336, - out_hidden_size=1536, - intermediate_size=4096, - **super_kwargs, - ): - super().__init__(**super_kwargs) + hidden_size: int = 1024 + attention_bias: bool = True + num_heads: int = 16 + out_hidden_size: int = 1536 + intermediate_size: int = 4096 @auto_docstring(checkpoint="zai-org/GLM-OCR") +@strict(accept_kwargs=True) class GlmOcrTextConfig(Glm4vTextConfig): r""" Example: @@ -87,21 +82,17 @@ class GlmOcrTextConfig(Glm4vTextConfig): >>> configuration = model.config ```""" - def __init__( - self, - vocab_size: int | None = 59392, - hidden_size: int | None = 1024, - intermediate_size: int | None = 4096, - num_hidden_layers: int | None = 16, - num_attention_heads: int | None = 16, - num_key_value_heads: int | None = 8, - max_position_embeddings: int | None = 131072, - **super_kwargs, - ): - super().__init__(**super_kwargs) + vocab_size: int = 59392 + hidden_size: int = 1024 + intermediate_size: int = 4096 + num_hidden_layers: int = 16 + num_attention_heads: int = 16 + num_key_value_heads: int = 8 + max_position_embeddings: int = 131072 @auto_docstring(checkpoint="zai-org/GLM-OCR") +@strict(accept_kwargs=True) class GlmOcrConfig(Glm4vConfig): r""" image_start_token_id (`int`, *optional*, defaults to 59256): @@ -126,20 +117,12 @@ class GlmOcrConfig(Glm4vConfig): >>> configuration = model.config ```""" - def __init__( - self, - text_config=None, - vision_config=None, - image_token_id=59280, - video_token_id=59281, - image_start_token_id=59256, - image_end_token_id=59257, - video_start_token_id=59258, - video_end_token_id=59259, - tie_word_embeddings=False, - **super_kwargs, - ): - super().__init__(**super_kwargs) + image_token_id: int = 59280 + video_token_id: int = 59281 + image_start_token_id: int = 59256 + image_end_token_id: int = 59257 + video_start_token_id: int = 59258 + video_end_token_id: int = 59259 class GlmOcrTextAttention(Glm4vTextAttention, nn.Module): diff --git a/src/transformers/models/glmasr/configuration_glmasr.py b/src/transformers/models/glmasr/configuration_glmasr.py index 5ef0f2df8340..8adb06a0d060 100644 --- a/src/transformers/models/glmasr/configuration_glmasr.py +++ b/src/transformers/models/glmasr/configuration_glmasr.py @@ -12,12 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
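Composite configs such as `GlmOcrConfig` and `GlmAsrConfig` keep their dict-to-sub-config coercion, but it now lives in `__post_init__` rather than `__init__`. A simplified, stdlib-only sketch of that coercion (hypothetical names, without the real `sub_configs`/`CONFIG_MAPPING` plumbing):

from dataclasses import dataclass

@dataclass
class ToyVisionConfig:
    hidden_size: int = 1024
    patch_size: int = 14

@dataclass
class ToyComposedConfig:
    vision_config: dict | ToyVisionConfig | None = None
    image_token_id: int = 59280

    def __post_init__(self):
        # accept a plain dict, an existing config object, or nothing at all
        if isinstance(self.vision_config, dict):
            self.vision_config = ToyVisionConfig(**self.vision_config)
        elif self.vision_config is None:
            self.vision_config = ToyVisionConfig()

cfg = ToyComposedConfig(vision_config={"hidden_size": 2048})
assert isinstance(cfg.vision_config, ToyVisionConfig) and cfg.vision_config.hidden_size == 2048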
+ +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring from ..auto import CONFIG_MAPPING, AutoConfig @auto_docstring(checkpoint="zai-org/GLM-ASR-Nano-2512") +@strict(accept_kwargs=True) class GlmAsrEncoderConfig(PreTrainedConfig): r""" Example: @@ -37,41 +41,28 @@ class GlmAsrEncoderConfig(PreTrainedConfig): model_type = "glmasr_encoder" - def __init__( - self, - hidden_size=1280, - intermediate_size=5120, - num_hidden_layers=32, - num_attention_heads=20, - num_key_value_heads=None, - hidden_act="gelu", - max_position_embeddings=1500, - initializer_range=0.02, - rope_parameters=None, - attention_dropout=0.0, - num_mel_bins=128, - **kwargs, - ): - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.head_dim = hidden_size // num_attention_heads - self.max_position_embeddings = max_position_embeddings - self.rope_parameters = rope_parameters - self.attention_dropout = attention_dropout - self.num_mel_bins = num_mel_bins + hidden_size: int = 1280 + intermediate_size: int = 5120 + num_hidden_layers: int = 32 + num_attention_heads: int = 20 + num_key_value_heads: int | None = None + hidden_act: str = "gelu" + max_position_embeddings: int = 1500 + initializer_range: float = 0.02 + rope_parameters: dict | None = None + attention_dropout: float | int = 0.0 + num_mel_bins: int = 128 + + def __post_init__(self, **kwargs): + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads kwargs.setdefault("partial_rotary_factor", 0.5) - super().__init__(**kwargs) + super().__post_init__(**kwargs) @auto_docstring(checkpoint="zai-org/GLM-ASR-Nano-2512") +@strict(accept_kwargs=True) class GlmAsrConfig(PreTrainedConfig): r""" Example: @@ -106,36 +97,27 @@ class GlmAsrConfig(PreTrainedConfig): "rope_parameters": {"rope_theta": 10000.0, "rope_type": "default"}, } - def __init__( - self, - audio_config=None, - text_config=None, - audio_token_id=59260, - projector_hidden_act="gelu", - **kwargs, - ): - if isinstance(audio_config, dict): - audio_config["model_type"] = audio_config.get("model_type", "glmasr_encoder") - audio_config = CONFIG_MAPPING[audio_config["model_type"]](**audio_config) - elif audio_config is None: - audio_config = CONFIG_MAPPING["glmasr_encoder"]() - self.audio_config = audio_config - - if isinstance(text_config, dict): - text_config["model_type"] = text_config.get("model_type", "llama") - text_config = CONFIG_MAPPING[text_config["model_type"]]( - **{**self._default_text_config_kwargs, **text_config} + audio_config: dict | PreTrainedConfig | None = None + text_config: dict | PreTrainedConfig | None = None + audio_token_id: int = 59260 + projector_hidden_act: str = "gelu" + + def __post_init__(self, **kwargs): + if isinstance(self.audio_config, dict): + self.audio_config["model_type"] = self.audio_config.get("model_type", "glmasr_encoder") + self.audio_config = CONFIG_MAPPING[self.audio_config["model_type"]](**self.audio_config) + elif self.audio_config is None: + self.audio_config = CONFIG_MAPPING["glmasr_encoder"]() + + if isinstance(self.text_config, dict): + self.text_config["model_type"] = self.text_config.get("model_type", "llama") + self.text_config = 
CONFIG_MAPPING[self.text_config["model_type"]]( + **{**self._default_text_config_kwargs, **self.text_config} ) - elif text_config is None: - text_config = CONFIG_MAPPING["llama"](**self._default_text_config_kwargs) - self.text_config = text_config - - self.vocab_size = text_config.vocab_size - self.hidden_size = text_config.hidden_size - self.audio_token_id = audio_token_id - self.projector_hidden_act = projector_hidden_act + elif self.text_config is None: + self.text_config = CONFIG_MAPPING["llama"](**self._default_text_config_kwargs) - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["GlmAsrEncoderConfig", "GlmAsrConfig"] diff --git a/src/transformers/models/glpn/configuration_glpn.py b/src/transformers/models/glpn/configuration_glpn.py index af1bc09741ec..fdd9b85b6549 100644 --- a/src/transformers/models/glpn/configuration_glpn.py +++ b/src/transformers/models/glpn/configuration_glpn.py @@ -13,14 +13,14 @@ # limitations under the License. """GLPN model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="vinvino02/glpn-kitti") +@strict(accept_kwargs=True) class GLPNConfig(PreTrainedConfig): r""" num_encoder_blocks (`int`, *optional*, defaults to 4): @@ -64,48 +64,24 @@ class GLPNConfig(PreTrainedConfig): model_type = "glpn" - def __init__( - self, - num_channels=3, - num_encoder_blocks=4, - depths=[2, 2, 2, 2], - sr_ratios=[8, 4, 2, 1], - hidden_sizes=[32, 64, 160, 256], - patch_sizes=[7, 3, 3, 3], - strides=[4, 2, 2, 2], - num_attention_heads=[1, 2, 5, 8], - mlp_ratios=[4, 4, 4, 4], - hidden_act="gelu", - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - initializer_range=0.02, - drop_path_rate=0.1, - layer_norm_eps=1e-6, - decoder_hidden_size=64, - max_depth=10, - head_in_index=-1, - **kwargs, - ): - super().__init__(**kwargs) - - self.num_channels = num_channels - self.num_encoder_blocks = num_encoder_blocks - self.depths = depths - self.sr_ratios = sr_ratios - self.hidden_sizes = hidden_sizes - self.patch_sizes = patch_sizes - self.strides = strides - self.mlp_ratios = mlp_ratios - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.initializer_range = initializer_range - self.drop_path_rate = drop_path_rate - self.layer_norm_eps = layer_norm_eps - self.decoder_hidden_size = decoder_hidden_size - self.max_depth = max_depth - self.head_in_index = head_in_index + num_channels: int = 3 + num_encoder_blocks: int = 4 + depths: list[int] | tuple[int, ...] = (2, 2, 2, 2) + sr_ratios: list[int] | tuple[int, ...] = (8, 4, 2, 1) + hidden_sizes: list[int] | tuple[int, ...] = (32, 64, 160, 256) + patch_sizes: list[int] | tuple[int, ...] = (7, 3, 3, 3) + strides: list[int] | tuple[int, ...] = (4, 2, 2, 2) + num_attention_heads: list[int] | tuple[int, ...] = (1, 2, 5, 8) + mlp_ratios: list[int] | tuple[int, ...] 
= (4, 4, 4, 4) + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.0 + attention_probs_dropout_prob: float = 0.0 + initializer_range: float = 0.02 + drop_path_rate: float = 0.1 + layer_norm_eps: float = 1e-6 + decoder_hidden_size: int = 64 + max_depth: int = 10 + head_in_index: int = -1 __all__ = ["GLPNConfig"] diff --git a/src/transformers/models/glpn/modeling_glpn.py b/src/transformers/models/glpn/modeling_glpn.py index 02efa1762e20..2150c98d7e4c 100755 --- a/src/transformers/models/glpn/modeling_glpn.py +++ b/src/transformers/models/glpn/modeling_glpn.py @@ -415,7 +415,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict encoder_outputs = self.encoder( pixel_values, @@ -637,7 +637,7 @@ def forward( >>> depth = depth.detach().cpu().numpy() >>> depth = Image.fromarray(depth.astype("uint8")) ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) diff --git a/src/transformers/models/got_ocr2/configuration_got_ocr2.py b/src/transformers/models/got_ocr2/configuration_got_ocr2.py index 48653ed932f8..aed7d8fbddb7 100644 --- a/src/transformers/models/got_ocr2/configuration_got_ocr2.py +++ b/src/transformers/models/got_ocr2/configuration_got_ocr2.py @@ -18,12 +18,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
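Note the list defaults in `GLPNConfig` (`depths=[2, 2, 2, 2]`, and so on) becoming tuples. Dataclass fields reject mutable defaults, so sequence-valued hyperparameters need either an immutable tuple default or a `default_factory`. A quick stdlib illustration of that constraint (toy class, not the real config):

from dataclasses import dataclass, field

@dataclass
class ToyStageConfig:
    depths: tuple[int, ...] = (2, 2, 2, 2)  # immutable default: accepted as-is
    hidden_sizes: list[int] = field(default_factory=lambda: [32, 64, 160, 256])

# A bare `depths: list[int] = [2, 2, 2, 2]` would raise a ValueError at
# class-creation time because mutable defaults are not allowed on dataclass fields.
assert ToyStageConfig().depths == (2, 2, 2, 2)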
+ +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring from ..auto import CONFIG_MAPPING, AutoConfig @auto_docstring(checkpoint="facebook/sam-vit-huge") +@strict(accept_kwargs=True) class GotOcr2VisionConfig(PreTrainedConfig): r""" output_channels (`int`, *optional*, defaults to 256): @@ -41,50 +45,27 @@ class GotOcr2VisionConfig(PreTrainedConfig): """ base_config_key = "vision_config" - - def __init__( - self, - hidden_size=768, - output_channels=256, - num_hidden_layers=12, - num_attention_heads=12, - num_channels=3, - image_size=1024, - patch_size=16, - hidden_act="gelu", - layer_norm_eps=1e-06, - attention_dropout=0.0, - initializer_range=1e-10, - qkv_bias=True, - use_abs_pos=True, - use_rel_pos=True, - window_size=14, - global_attn_indexes=[2, 5, 8, 11], - mlp_dim=3072, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.output_channels = output_channels - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_channels = num_channels - self.image_size = image_size - self.patch_size = patch_size - self.hidden_act = hidden_act - self.layer_norm_eps = layer_norm_eps - self.attention_dropout = attention_dropout - self.initializer_range = initializer_range - self.qkv_bias = qkv_bias - self.use_abs_pos = use_abs_pos - self.use_rel_pos = use_rel_pos - self.window_size = window_size - self.global_attn_indexes = global_attn_indexes - self.mlp_dim = mlp_dim + hidden_size: int = 768 + output_channels: int = 256 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + num_channels: int = 3 + image_size: int | list[int] | tuple[int, int] = 1024 + patch_size: int | list[int] | tuple[int, int] = 16 + hidden_act: str = "gelu" + layer_norm_eps: float = 1e-06 + attention_dropout: float | int = 0.0 + initializer_range: float = 1e-10 + qkv_bias: bool = True + use_abs_pos: bool = True + use_rel_pos: bool = True + window_size: int = 14 + global_attn_indexes: list[int] | tuple[int, ...] 
= (2, 5, 8, 11) + mlp_dim: int = 3072 @auto_docstring(checkpoint="facebook/sam-vit-huge") +@strict(accept_kwargs=True) class GotOcr2Config(PreTrainedConfig): r""" Example: @@ -108,30 +89,23 @@ class GotOcr2Config(PreTrainedConfig): } sub_configs = {"text_config": AutoConfig, "vision_config": GotOcr2VisionConfig} - def __init__( - self, - vision_config: dict | None = None, - text_config: dict | None = None, - image_token_index: int | None = 151859, - image_seq_length: int | None = 576, - tie_word_embeddings: bool | None = True, - **kwargs, - ): - self.image_token_index = image_token_index - self.image_seq_length = image_seq_length - - if vision_config is None: + vision_config: dict | PreTrainedConfig | None = None + text_config: dict | PreTrainedConfig | None = None + image_token_index: int = 151859 + image_seq_length: int = 576 + tie_word_embeddings: bool = True + + def __post_init__(self, **kwargs): + if self.vision_config is None: self.vision_config = GotOcr2VisionConfig() - elif isinstance(vision_config, dict): - self.vision_config = GotOcr2VisionConfig(**vision_config) - elif isinstance(vision_config, GotOcr2VisionConfig): - self.vision_config = vision_config - - if isinstance(text_config, dict): - text_config["model_type"] = text_config.get("model_type", "qwen2") - text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) - elif text_config is None: - text_config = CONFIG_MAPPING["qwen2"]( + elif isinstance(self.vision_config, dict): + self.vision_config = GotOcr2VisionConfig(**self.vision_config) + + if isinstance(self.text_config, dict): + self.text_config["model_type"] = self.text_config.get("model_type", "qwen2") + self.text_config = CONFIG_MAPPING[self.text_config["model_type"]](**self.text_config) + elif self.text_config is None: + self.text_config = CONFIG_MAPPING["qwen2"]( vocab_size=151860, hidden_size=1024, intermediate_size=2816, @@ -143,7 +117,7 @@ def __init__( initializer_range=0.02, rms_norm_eps=1e-6, use_cache=True, - tie_word_embeddings=tie_word_embeddings, + tie_word_embeddings=self.tie_word_embeddings, rope_theta=1000000.0, rope_parameters=None, use_sliding_window=False, @@ -152,10 +126,7 @@ def __init__( attention_dropout=0.0, ) - self.text_config = text_config - self.tie_word_embeddings = tie_word_embeddings - - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["GotOcr2VisionConfig", "GotOcr2Config"] diff --git a/src/transformers/models/got_ocr2/modeling_got_ocr2.py b/src/transformers/models/got_ocr2/modeling_got_ocr2.py index 78ac9c12dfe0..0859e0041119 100644 --- a/src/transformers/models/got_ocr2/modeling_got_ocr2.py +++ b/src/transformers/models/got_ocr2/modeling_got_ocr2.py @@ -18,6 +18,7 @@ # See the License for the specific language governing permissions and # limitations under the License. + import collections from dataclasses import dataclass diff --git a/src/transformers/models/got_ocr2/modular_got_ocr2.py b/src/transformers/models/got_ocr2/modular_got_ocr2.py index 1af1c372e799..896498b065c8 100644 --- a/src/transformers/models/got_ocr2/modular_got_ocr2.py +++ b/src/transformers/models/got_ocr2/modular_got_ocr2.py @@ -12,8 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. + import torch import torch.nn as nn +from huggingface_hub.dataclasses import strict from ... 
import initialization as init from ...cache_utils import Cache @@ -43,6 +45,7 @@ @auto_docstring(checkpoint="facebook/sam-vit-huge") +@strict(accept_kwargs=True) class GotOcr2VisionConfig(PreTrainedConfig): r""" output_channels (`int`, *optional*, defaults to 256): @@ -60,50 +63,27 @@ class GotOcr2VisionConfig(PreTrainedConfig): """ base_config_key = "vision_config" - - def __init__( - self, - hidden_size=768, - output_channels=256, - num_hidden_layers=12, - num_attention_heads=12, - num_channels=3, - image_size=1024, - patch_size=16, - hidden_act="gelu", - layer_norm_eps=1e-06, - attention_dropout=0.0, - initializer_range=1e-10, - qkv_bias=True, - use_abs_pos=True, - use_rel_pos=True, - window_size=14, - global_attn_indexes=[2, 5, 8, 11], - mlp_dim=3072, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.output_channels = output_channels - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_channels = num_channels - self.image_size = image_size - self.patch_size = patch_size - self.hidden_act = hidden_act - self.layer_norm_eps = layer_norm_eps - self.attention_dropout = attention_dropout - self.initializer_range = initializer_range - self.qkv_bias = qkv_bias - self.use_abs_pos = use_abs_pos - self.use_rel_pos = use_rel_pos - self.window_size = window_size - self.global_attn_indexes = global_attn_indexes - self.mlp_dim = mlp_dim + hidden_size: int = 768 + output_channels: int = 256 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + num_channels: int = 3 + image_size: int | list[int] | tuple[int, int] = 1024 + patch_size: int | list[int] | tuple[int, int] = 16 + hidden_act: str = "gelu" + layer_norm_eps: float = 1e-06 + attention_dropout: float | int = 0.0 + initializer_range: float = 1e-10 + qkv_bias: bool = True + use_abs_pos: bool = True + use_rel_pos: bool = True + window_size: int = 14 + global_attn_indexes: list[int] | tuple[int, ...] 
= (2, 5, 8, 11) + mlp_dim: int = 3072 @auto_docstring(checkpoint="facebook/sam-vit-huge") +@strict(accept_kwargs=True) class GotOcr2Config(PreTrainedConfig): r""" Example: @@ -127,30 +107,23 @@ class GotOcr2Config(PreTrainedConfig): } sub_configs = {"text_config": AutoConfig, "vision_config": GotOcr2VisionConfig} - def __init__( - self, - vision_config: dict | None = None, - text_config: dict | None = None, - image_token_index: int | None = 151859, - image_seq_length: int | None = 576, - tie_word_embeddings: bool | None = True, - **kwargs, - ): - self.image_token_index = image_token_index - self.image_seq_length = image_seq_length - - if vision_config is None: + vision_config: dict | PreTrainedConfig | None = None + text_config: dict | PreTrainedConfig | None = None + image_token_index: int = 151859 + image_seq_length: int = 576 + tie_word_embeddings: bool = True + + def __post_init__(self, **kwargs): + if self.vision_config is None: self.vision_config = GotOcr2VisionConfig() - elif isinstance(vision_config, dict): - self.vision_config = GotOcr2VisionConfig(**vision_config) - elif isinstance(vision_config, GotOcr2VisionConfig): - self.vision_config = vision_config - - if isinstance(text_config, dict): - text_config["model_type"] = text_config.get("model_type", "qwen2") - text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) - elif text_config is None: - text_config = CONFIG_MAPPING["qwen2"]( + elif isinstance(self.vision_config, dict): + self.vision_config = GotOcr2VisionConfig(**self.vision_config) + + if isinstance(self.text_config, dict): + self.text_config["model_type"] = self.text_config.get("model_type", "qwen2") + self.text_config = CONFIG_MAPPING[self.text_config["model_type"]](**self.text_config) + elif self.text_config is None: + self.text_config = CONFIG_MAPPING["qwen2"]( vocab_size=151860, hidden_size=1024, intermediate_size=2816, @@ -162,7 +135,7 @@ def __init__( initializer_range=0.02, rms_norm_eps=1e-6, use_cache=True, - tie_word_embeddings=tie_word_embeddings, + tie_word_embeddings=self.tie_word_embeddings, rope_theta=1000000.0, rope_parameters=None, use_sliding_window=False, @@ -171,10 +144,7 @@ def __init__( attention_dropout=0.0, ) - self.text_config = text_config - self.tie_word_embeddings = tie_word_embeddings - - super().__init__(**kwargs) + super().__post_init__(**kwargs) class GotOcr2MLPBlock(SamMLPBlock): diff --git a/src/transformers/models/gpt2/configuration_gpt2.py b/src/transformers/models/gpt2/configuration_gpt2.py index 8199c76327c9..dd81dea1f2d6 100644 --- a/src/transformers/models/gpt2/configuration_gpt2.py +++ b/src/transformers/models/gpt2/configuration_gpt2.py @@ -14,14 +14,14 @@ # limitations under the License. 
"""OpenAI GPT-2 configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="openai-community/gpt2") +@strict(accept_kwargs=True) class GPT2Config(PreTrainedConfig): r""" summary_type (`string`, *optional*, defaults to `"cls_index"`): @@ -75,65 +75,32 @@ class GPT2Config(PreTrainedConfig): "num_hidden_layers": "n_layer", } - def __init__( - self, - vocab_size=50257, - n_positions=1024, - n_embd=768, - n_layer=12, - n_head=12, - n_inner=None, - activation_function="gelu_new", - resid_pdrop=0.1, - embd_pdrop=0.1, - attn_pdrop=0.1, - layer_norm_epsilon=1e-5, - initializer_range=0.02, - summary_type="cls_index", - summary_use_proj=True, - summary_activation=None, - summary_proj_to_labels=True, - summary_first_dropout=0.1, - scale_attn_weights=True, - use_cache=True, - bos_token_id=50256, - eos_token_id=50256, - pad_token_id=None, - scale_attn_by_inverse_layer_idx=False, - reorder_and_upcast_attn=False, - add_cross_attention=False, - tie_word_embeddings=True, - **kwargs, - ): - self.add_cross_attention = add_cross_attention - self.tie_word_embeddings = tie_word_embeddings - self.vocab_size = vocab_size - self.n_positions = n_positions - self.n_embd = n_embd - self.n_layer = n_layer - self.n_head = n_head - self.n_inner = n_inner - self.activation_function = activation_function - self.resid_pdrop = resid_pdrop - self.embd_pdrop = embd_pdrop - self.attn_pdrop = attn_pdrop - self.layer_norm_epsilon = layer_norm_epsilon - self.initializer_range = initializer_range - self.summary_type = summary_type - self.summary_use_proj = summary_use_proj - self.summary_activation = summary_activation - self.summary_first_dropout = summary_first_dropout - self.summary_proj_to_labels = summary_proj_to_labels - self.scale_attn_weights = scale_attn_weights - self.use_cache = use_cache - self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx - self.reorder_and_upcast_attn = reorder_and_upcast_attn - - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - - super().__init__(**kwargs) + vocab_size: int = 50257 + n_positions: int = 1024 + n_embd: int = 768 + n_layer: int = 12 + n_head: int = 12 + n_inner: int | None = None + activation_function: str = "gelu_new" + resid_pdrop: float = 0.1 + embd_pdrop: float = 0.1 + attn_pdrop: float = 0.1 + layer_norm_epsilon: float = 1e-5 + initializer_range: float = 0.02 + summary_type: str = "cls_index" + summary_use_proj: bool = True + summary_activation: str | None = None + summary_proj_to_labels: bool = True + summary_first_dropout: float | int = 0.1 + scale_attn_weights: bool = True + use_cache: bool = True + bos_token_id: int | None = 50256 + eos_token_id: int | None = 50256 + pad_token_id: int | None = None + scale_attn_by_inverse_layer_idx: bool = False + reorder_and_upcast_attn: bool = False + add_cross_attention: bool = False + tie_word_embeddings: bool = True __all__ = ["GPT2Config"] diff --git a/src/transformers/models/gpt_bigcode/configuration_gpt_bigcode.py b/src/transformers/models/gpt_bigcode/configuration_gpt_bigcode.py index 4527dbbeca95..3b8c9b27871a 100644 --- a/src/transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +++ b/src/transformers/models/gpt_bigcode/configuration_gpt_bigcode.py @@ -13,14 +13,14 @@ # limitations under the 
License. """GPTBigCode configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="google/gpt_bigcode") +@strict(accept_kwargs=True) class GPTBigCodeConfig(PreTrainedConfig): r""" multi_query (`bool`, *optional*, defaults to `True`): @@ -58,58 +58,32 @@ class GPTBigCodeConfig(PreTrainedConfig): "num_hidden_layers": "n_layer", } - def __init__( - self, - vocab_size=50257, - n_positions=1024, - n_embd=768, - n_layer=12, - n_head=12, - n_inner=None, - activation_function="gelu_pytorch_tanh", - resid_pdrop=0.1, - embd_pdrop=0.1, - attn_pdrop=0.1, - layer_norm_epsilon=1e-5, - initializer_range=0.02, - scale_attn_weights=True, - use_cache=True, - bos_token_id=50256, - eos_token_id=50256, - pad_token_id=None, - attention_softmax_in_fp32=True, - scale_attention_softmax_in_fp32=True, - multi_query=True, - add_cross_attention=False, - tie_word_embeddings=True, - **kwargs, - ): - self.add_cross_attention = add_cross_attention - self.tie_word_embeddings = tie_word_embeddings - self.vocab_size = vocab_size - self.n_positions = n_positions - self.n_embd = n_embd - self.n_layer = n_layer - self.n_head = n_head - self.n_inner = n_inner - self.activation_function = activation_function - self.resid_pdrop = resid_pdrop - self.embd_pdrop = embd_pdrop - self.attn_pdrop = attn_pdrop - self.layer_norm_epsilon = layer_norm_epsilon - self.initializer_range = initializer_range - self.scale_attn_weights = scale_attn_weights - self.use_cache = use_cache - self.attention_softmax_in_fp32 = attention_softmax_in_fp32 - self.scale_attention_softmax_in_fp32 = scale_attention_softmax_in_fp32 - self.multi_query = multi_query - self.num_key_value_heads = 1 if multi_query else n_head - - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - - super().__init__(**kwargs) + vocab_size: int = 50257 + n_positions: int = 1024 + n_embd: int = 768 + n_layer: int = 12 + n_head: int = 12 + n_inner: int | None = None + activation_function: str = "gelu_pytorch_tanh" + resid_pdrop: float = 0.1 + embd_pdrop: float = 0.1 + attn_pdrop: float = 0.1 + layer_norm_epsilon: float = 1e-5 + initializer_range: float = 0.02 + scale_attn_weights: bool = True + use_cache: bool = True + bos_token_id: int | None = 50256 + eos_token_id: int | None = 50256 + pad_token_id: int | None = None + attention_softmax_in_fp32: bool = True + scale_attention_softmax_in_fp32: bool = True + multi_query: bool = True + add_cross_attention: bool = False + tie_word_embeddings: bool = True + + def __post_init__(self, **kwargs): + self.num_key_value_heads = 1 if self.multi_query else self.n_head + super().__post_init__(**kwargs) __all__ = ["GPTBigCodeConfig"] diff --git a/src/transformers/models/gpt_neo/configuration_gpt_neo.py b/src/transformers/models/gpt_neo/configuration_gpt_neo.py index e5831b5517ae..b06bcf5f1ab5 100644 --- a/src/transformers/models/gpt_neo/configuration_gpt_neo.py +++ b/src/transformers/models/gpt_neo/configuration_gpt_neo.py @@ -13,14 +13,14 @@ # limitations under the License. 
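`GPTBigCodeConfig` shows the same pattern for derived attributes: `num_key_value_heads` is no longer assigned inline in `__init__` but computed from `multi_query` in `__post_init__`. Roughly, as a stdlib-only sketch with a hypothetical class name:

from dataclasses import dataclass

@dataclass
class ToyBigCodeConfig:
    n_head: int = 12
    multi_query: bool = True

    def __post_init__(self):
        # multi-query attention keeps a single shared key/value head
        self.num_key_value_heads = 1 if self.multi_query else self.n_head

assert ToyBigCodeConfig().num_key_value_heads == 1
assert ToyBigCodeConfig(multi_query=False).num_key_value_heads == 12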
"""GPT Neo model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="EleutherAI/gpt-neo-1.3B") +@strict(accept_kwargs=True) class GPTNeoConfig(PreTrainedConfig): r""" window_size (`int`, *optional*, defaults to 256): @@ -49,54 +49,35 @@ class GPTNeoConfig(PreTrainedConfig): keys_to_ignore_at_inference = ["past_key_values"] attribute_map = {"num_attention_heads": "num_heads", "num_hidden_layers": "num_layers"} - def __init__( - self, - vocab_size=50257, - max_position_embeddings=2048, - hidden_size=2048, - num_layers=24, - attention_types=[[["global", "local"], 12]], - num_heads=16, - intermediate_size=None, - window_size=256, - activation_function="gelu_new", - resid_dropout=0.0, - embed_dropout=0.0, - attention_dropout=0.0, - classifier_dropout=0.1, - layer_norm_epsilon=1e-5, - initializer_range=0.02, - use_cache=True, - bos_token_id=50256, - eos_token_id=50256, - pad_token_id=None, - tie_word_embeddings=True, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.num_layers = num_layers - self.num_heads = num_heads - self.intermediate_size = intermediate_size - self.window_size = window_size - self.activation_function = activation_function - self.resid_dropout = resid_dropout - self.embed_dropout = embed_dropout - self.attention_dropout = attention_dropout - self.classifier_dropout = classifier_dropout - self.layer_norm_epsilon = layer_norm_epsilon - self.initializer_range = initializer_range - self.use_cache = use_cache - - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.tie_word_embeddings = tie_word_embeddings - - self.attention_types = attention_types - self.attention_layers = self.expand_attention_types_params(attention_types) - + vocab_size: int = 50257 + max_position_embeddings: int = 2048 + hidden_size: int = 2048 + num_layers: int = 24 + attention_types: list | tuple | None = None + num_heads: int = 16 + intermediate_size: int | None = None + window_size: int = 256 + activation_function: str = "gelu_new" + resid_dropout: float | int = 0.0 + embed_dropout: float | int = 0.0 + attention_dropout: float | int = 0.0 + classifier_dropout: float | int = 0.1 + layer_norm_epsilon: float = 1e-5 + initializer_range: float = 0.02 + use_cache: bool = True + bos_token_id: int | None = 50256 + eos_token_id: int | None = 50256 + pad_token_id: int | None = None + tie_word_embeddings: bool = True + + def __post_init__(self, **kwargs): + if self.attention_types is None: + self.attention_types = [[["global", "local"], 12]] + self.attention_layers = self.expand_attention_types_params(self.attention_types) + super().__post_init__(**kwargs) + + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" if len(self.attention_layers) != self.num_layers: raise ValueError( "Configuration for convolutional module is incorrect. " @@ -107,8 +88,6 @@ def __init__( "Please verify the value of `config.attention_types` argument." 
) - super().__init__(**kwargs) - @staticmethod def expand_attention_types_params(attention_types): attentions = [] diff --git a/src/transformers/models/gpt_neo/modeling_gpt_neo.py b/src/transformers/models/gpt_neo/modeling_gpt_neo.py index f408b646d35b..5bf0bc584d85 100755 --- a/src/transformers/models/gpt_neo/modeling_gpt_neo.py +++ b/src/transformers/models/gpt_neo/modeling_gpt_neo.py @@ -428,7 +428,7 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -561,7 +561,7 @@ def forward( `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict transformer_outputs = self.transformer( input_ids, @@ -657,7 +657,7 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict transformer_outputs = self.transformer( input_ids, @@ -779,7 +779,7 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict transformer_outputs = self.transformer( input_ids, @@ -856,7 +856,7 @@ def forward( [What are input IDs?](../glossary#input-ids) """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.transformer( input_ids, diff --git a/src/transformers/models/gpt_neox/configuration_gpt_neox.py b/src/transformers/models/gpt_neox/configuration_gpt_neox.py index f962568d10b3..56fdc82573f8 100644 --- a/src/transformers/models/gpt_neox/configuration_gpt_neox.py +++ b/src/transformers/models/gpt_neox/configuration_gpt_neox.py @@ -13,15 +13,15 @@ # limitations under the License. 
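`GPTNeoConfig` also moves its consistency check out of `__init__` into a dedicated `validate_architecture` method, described in the diff's docstring as part of the `@strict`-powered validation. A minimal sketch of that separation; here the hook is called explicitly from `__post_init__`, since how the strict machinery triggers it is an assumption on my part:

from dataclasses import dataclass

@dataclass
class ToyNeoConfig:
    num_layers: int = 24
    attention_types: list | None = None

    def __post_init__(self):
        if self.attention_types is None:
            self.attention_types = [[["global", "local"], 12]]
        # expand [[pattern, repeat], ...] into one attention type per layer
        self.attention_layers = [
            layer for pattern, repeat in self.attention_types for layer in pattern * repeat
        ]
        # called explicitly here; in the PR this presumably runs via @strict validation
        self.validate_architecture()

    def validate_architecture(self):
        if len(self.attention_layers) != self.num_layers:
            raise ValueError("Expanded `attention_types` must match `num_layers`.")

assert ToyNeoConfig().attention_layers[:2] == ["global", "local"]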
"""GPTNeoX model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters -from ...utils import auto_docstring, logging - - -logger = logging.get_logger(__name__) +from ...utils import auto_docstring @auto_docstring(checkpoint="EleutherAI/gpt-neox-20b") +@strict(accept_kwargs=True) class GPTNeoXConfig(PreTrainedConfig): r""" use_parallel_residual (`bool`, *optional*, defaults to `True`): @@ -58,60 +58,36 @@ class GPTNeoXConfig(PreTrainedConfig): "final_layer_norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size: int | None = 50432, - hidden_size: int | None = 6144, - num_hidden_layers: int | None = 44, - num_attention_heads: int | None = 64, - intermediate_size: int | None = 24576, - hidden_act: str | None = "gelu", - attention_dropout: float | None = 0.0, - hidden_dropout: float | None = 0.0, - classifier_dropout: float | None = 0.1, - max_position_embeddings: int | None = 2048, - initializer_range: float | None = 0.02, - layer_norm_eps: int | None = 1e-5, - use_cache: bool | None = True, - bos_token_id: int | None = 0, - eos_token_id: int | None = 2, - pad_token_id: int | None = None, - tie_word_embeddings: bool | None = False, - use_parallel_residual: bool | None = True, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = True, - is_decoder: bool | None = False, - **kwargs, - ): - self.is_decoder = is_decoder - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.attention_dropout = attention_dropout - self.hidden_dropout = hidden_dropout - self.classifier_dropout = classifier_dropout - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.use_cache = use_cache - self.use_parallel_residual = use_parallel_residual - self.attention_bias = attention_bias - self.rope_parameters = rope_parameters - self.tie_word_embeddings = tie_word_embeddings - + vocab_size: int = 50432 + hidden_size: int = 6144 + num_hidden_layers: int = 44 + num_attention_heads: int = 64 + intermediate_size: int = 24576 + hidden_act: str = "gelu" + attention_dropout: float | int = 0.0 + hidden_dropout: float | int = 0.0 + classifier_dropout: float | int = 0.1 + max_position_embeddings: int = 2048 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-5 + use_cache: bool = True + bos_token_id: int | None = 0 + eos_token_id: int | list[int] | None = 2 + pad_token_id: int | None = None + tie_word_embeddings: bool = False + use_parallel_residual: bool = True + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = True + is_decoder: bool = False + + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" if self.hidden_size % self.num_attention_heads != 0: raise ValueError( "The hidden size is not divisible by the number of attention heads! Make sure to update them!" 
) - super().__init__(**kwargs) - def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation=None, **kwargs): + def convert_rope_params_to_dict(self, **kwargs): rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or self.rope_parameters self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else {} @@ -121,7 +97,6 @@ def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation=None, **kwa self.rope_parameters.setdefault("rope_theta", kwargs.pop("rotary_emb_base", self.default_theta)) self.rope_parameters["partial_rotary_factor"] = kwargs.pop("rotary_pct", 0.25) self.standardize_rope_params() - self.validate_rope(ignore_keys=ignore_keys_at_rope_validation) return kwargs diff --git a/src/transformers/models/gpt_neox/modeling_gpt_neox.py b/src/transformers/models/gpt_neox/modeling_gpt_neox.py index e3af04493f61..10e4b5922add 100755 --- a/src/transformers/models/gpt_neox/modeling_gpt_neox.py +++ b/src/transformers/models/gpt_neox/modeling_gpt_neox.py @@ -166,7 +166,7 @@ def eager_attention_forward( value: torch.Tensor, attention_mask: torch.Tensor, scaling: float, - dropout: float = 0.0, + dropout: float | int = 0.0, **kwargs, ): attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling diff --git a/src/transformers/models/gpt_neox/modular_gpt_neox.py b/src/transformers/models/gpt_neox/modular_gpt_neox.py index e32f24fcced9..f778501b7b38 100644 --- a/src/transformers/models/gpt_neox/modular_gpt_neox.py +++ b/src/transformers/models/gpt_neox/modular_gpt_neox.py @@ -120,7 +120,7 @@ def eager_attention_forward( value: torch.Tensor, attention_mask: torch.Tensor, scaling: float, - dropout: float = 0.0, + dropout: float | int = 0.0, **kwargs, ): attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling diff --git a/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py b/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py index b7a12262e926..e6e7e306b9b5 100644 --- a/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +++ b/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py @@ -13,15 +13,15 @@ # limitations under the License. 
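Both GPT-NeoX configs keep `convert_rope_params_to_dict` but drop the `ignore_keys_at_rope_validation` parameter and the trailing `validate_rope` call. What remains is folding legacy kwargs (`rope_scaling`, `rotary_emb_base`, `rotary_pct`) into `rope_parameters`; roughly, as a standalone sketch with a hypothetical helper name and without the real `standardize_rope_params` step:

def convert_legacy_rope_kwargs(rope_parameters, default_theta=10000.0, **kwargs):
    # fold deprecated kwargs into the rope_parameters dict, mirroring the diff above
    rope_parameters = kwargs.pop("rope_scaling", None) or rope_parameters or {}
    rope_parameters.setdefault("rope_theta", kwargs.pop("rotary_emb_base", default_theta))
    rope_parameters["partial_rotary_factor"] = kwargs.pop("rotary_pct", 0.25)
    return rope_parameters, kwargs  # leftover kwargs are handed back to the caller

params, rest = convert_legacy_rope_kwargs(None, rotary_emb_base=20000.0, rotary_pct=0.5, foo=1)
assert params == {"rope_theta": 20000.0, "partial_rotary_factor": 0.5} and rest == {"foo": 1}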
"""GPTNeoX Japanese model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters -from ...utils import auto_docstring, logging - - -logger = logging.get_logger(__name__) +from ...utils import auto_docstring @auto_docstring(checkpoint="abeja/gpt-neox-japanese-2.7b") +@strict(accept_kwargs=True) class GPTNeoXJapaneseConfig(PreTrainedConfig): r""" intermediate_multiple_size (`int`, *optional*, defaults to 4): @@ -45,50 +45,26 @@ class GPTNeoXJapaneseConfig(PreTrainedConfig): model_type = "gpt_neox_japanese" - def __init__( - self, - vocab_size: int | None = 32000, - hidden_size: int | None = 2560, - num_hidden_layers: int | None = 32, - num_attention_heads: int | None = 32, - intermediate_multiple_size: int | None = 4, - hidden_act: str | None = "gelu", - max_position_embeddings: int | None = 2048, - initializer_range: float | None = 0.02, - layer_norm_eps: int | None = 1e-5, - use_cache: bool | None = True, - bos_token_id: int | None = 31996, - eos_token_id: int | None = 31999, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_dropout: float | None = 0.1, - hidden_dropout: float | None = 0.0, - is_decoder: bool | None = False, - pad_token_id: int | None = None, - tie_word_embeddings: bool | None = True, - **kwargs, - ): - self.is_decoder = is_decoder - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.tie_word_embeddings = tie_word_embeddings - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_multiple_size = intermediate_multiple_size - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.use_cache = use_cache - self.attention_dropout = attention_dropout - self.hidden_dropout = hidden_dropout - self.rope_parameters = rope_parameters - - super().__init__(**kwargs) - - def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation=None, **kwargs): + vocab_size: int = 32000 + hidden_size: int = 2560 + num_hidden_layers: int = 32 + num_attention_heads: int = 32 + intermediate_multiple_size: int = 4 + hidden_act: str = "gelu" + max_position_embeddings: int = 2048 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-5 + use_cache: bool = True + bos_token_id: int | None = 31996 + eos_token_id: int | list[int] | None = 31999 + rope_parameters: RopeParameters | dict | None = None + attention_dropout: float | int = 0.1 + hidden_dropout: float | int = 0.0 + is_decoder: bool = False + pad_token_id: int | None = None + tie_word_embeddings: bool = True + + def convert_rope_params_to_dict(self, **kwargs): rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or self.rope_parameters self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else {} @@ -98,7 +74,6 @@ def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation=None, **kwa self.rope_parameters.setdefault("rope_theta", kwargs.pop("rotary_emb_base", self.default_theta)) self.rope_parameters["partial_rotary_factor"] = kwargs.pop("rotary_pct", 1.0) self.standardize_rope_params() - self.validate_rope(ignore_keys=ignore_keys_at_rope_validation) return kwargs diff --git 
a/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py b/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py index fbef4ee4d406..aea4c9b391f4 100755 --- a/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +++ b/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py @@ -444,7 +444,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict use_cache = use_cache if use_cache is not None else self.config.use_cache if (input_ids is None) ^ (inputs_embeds is not None): @@ -578,7 +578,7 @@ def forward( >>> prediction_logits = outputs.logits ``` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.gpt_neox_japanese( input_ids, diff --git a/src/transformers/models/gpt_oss/configuration_gpt_oss.py b/src/transformers/models/gpt_oss/configuration_gpt_oss.py index 5a0904130455..3e1163869753 100644 --- a/src/transformers/models/gpt_oss/configuration_gpt_oss.py +++ b/src/transformers/models/gpt_oss/configuration_gpt_oss.py @@ -13,12 +13,14 @@ # limitations under the License. """openai model configuration""" -from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters +from huggingface_hub.dataclasses import strict + +from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring @auto_docstring(checkpoint="openai/gpt-oss-20b") +@strict(accept_kwargs=True) class GptOssConfig(PreTrainedConfig): model_type = "gpt_oss" default_theta = 150000.0 @@ -36,78 +38,52 @@ class GptOssConfig(PreTrainedConfig): "layers.*.mlp.experts": "moe_tp_experts", } - def __init__( - self, - num_hidden_layers: int | None = 36, - num_local_experts: int | None = 128, - vocab_size: int | None = 201088, - hidden_size: int | None = 2880, - intermediate_size: int | None = 2880, - head_dim: int | None = 64, - num_attention_heads: int | None = 64, - num_key_value_heads: int | None = 8, - sliding_window: int | None = 128, - tie_word_embeddings: bool | None = False, - hidden_act: str | None = "silu", - initializer_range: float | None = 0.02, - max_position_embeddings: int | None = 131072, - rms_norm_eps: float | None = 1e-5, - rope_parameters: RopeParameters | None = { - "rope_type": "yarn", - "factor": 32.0, - "beta_fast": 32.0, - "beta_slow": 1.0, - "truncate": False, - "original_max_position_embeddings": 4096, - }, - attention_dropout: float | None = 0.0, - num_experts_per_tok: int | None = 4, - router_aux_loss_coef: float | None = 0.9, - output_router_logits: bool | None = False, - use_cache: bool | None = True, - layer_types: list[str] | None = None, - pad_token_id: int | None = None, - bos_token_id: int | None = None, - eos_token_id: int | None = None, - **kwargs, - ): - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_local_experts = num_local_experts - self.sliding_window = sliding_window - self.num_experts_per_tok = num_experts_per_tok - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = 
num_attention_heads + num_hidden_layers: int = 36 + num_local_experts: int = 128 + vocab_size: int = 201088 + hidden_size: int = 2880 + intermediate_size: int = 2880 + head_dim: int = 64 + num_attention_heads: int = 64 + num_key_value_heads: int = 8 + sliding_window: int | None = 128 + tie_word_embeddings: bool = False + hidden_act: str = "silu" + initializer_range: float = 0.02 + max_position_embeddings: int = 131072 + rms_norm_eps: float = 1e-5 + rope_parameters: dict | None = None + attention_dropout: float | int = 0.0 + num_experts_per_tok: int = 4 + router_aux_loss_coef: float = 0.9 + output_router_logits: bool = False + use_cache: bool = True + layer_types: list[str] | None = None + pad_token_id: int | None = None + bos_token_id: int | None = None + eos_token_id: int | list[int] | None = None + attention_bias: bool = True + + def __post_init__(self, **kwargs): + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.attention_dropout = attention_dropout - self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads - self.layer_types = layer_types + self.head_dim = self.head_dim if self.head_dim is not None else self.hidden_size // self.num_attention_heads if self.layer_types is None: self.layer_types = [ "sliding_attention" if bool((i + 1) % 2) else "full_attention" for i in range(self.num_hidden_layers) ] - layer_type_validation(self.layer_types, self.num_hidden_layers) - - self.attention_bias = True - self.max_position_embeddings = max_position_embeddings - self.router_aux_loss_coef = router_aux_loss_coef - self.output_router_logits = output_router_logits - self.use_cache = use_cache - self.rope_parameters = rope_parameters - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - super().__init__(**kwargs) + if self.rope_parameters is None: + self.rope_parameters = { + "rope_type": "yarn", + "factor": 32.0, + "beta_fast": 32.0, + "beta_slow": 1.0, + "truncate": False, + "original_max_position_embeddings": 4096, + } + super().__post_init__(**kwargs) __all__ = ["GptOssConfig"] diff --git a/src/transformers/models/gpt_oss/modeling_gpt_oss.py b/src/transformers/models/gpt_oss/modeling_gpt_oss.py index 9f87e09a155a..c94ae36ea099 100644 --- a/src/transformers/models/gpt_oss/modeling_gpt_oss.py +++ b/src/transformers/models/gpt_oss/modeling_gpt_oss.py @@ -249,7 +249,7 @@ def eager_attention_forward( value: torch.Tensor, attention_mask: torch.Tensor | None, scaling: float, - dropout: float = 0.0, + dropout: float | int = 0.0, **kwargs, ): key_states = repeat_kv(key, module.num_key_value_groups) diff --git a/src/transformers/models/gpt_oss/modular_gpt_oss.py b/src/transformers/models/gpt_oss/modular_gpt_oss.py index 247bf6f14983..ee175b55fb77 100644 --- a/src/transformers/models/gpt_oss/modular_gpt_oss.py +++ b/src/transformers/models/gpt_oss/modular_gpt_oss.py @@ -186,7 +186,7 @@ def eager_attention_forward( value: torch.Tensor, attention_mask: torch.Tensor | None, scaling: float, - dropout: float = 0.0, + dropout: float | int = 0.0, **kwargs, ): key_states = repeat_kv(key, module.num_key_value_groups) diff --git a/src/transformers/models/gptj/configuration_gptj.py b/src/transformers/models/gptj/configuration_gptj.py index 96cc2c297063..b1ee0f363ebc 
100644 --- a/src/transformers/models/gptj/configuration_gptj.py +++ b/src/transformers/models/gptj/configuration_gptj.py @@ -13,14 +13,14 @@ # limitations under the License. """GPT-J model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="EleutherAI/gpt-j-6B") +@strict(accept_kwargs=True) class GPTJConfig(PreTrainedConfig): r""" rotary_dim (`int`, *optional*, defaults to 64): @@ -49,48 +49,24 @@ class GPTJConfig(PreTrainedConfig): "num_hidden_layers": "n_layer", } - def __init__( - self, - vocab_size=50400, - n_positions=2048, - n_embd=4096, - n_layer=28, - n_head=16, - rotary_dim=64, - n_inner=None, - activation_function="gelu_new", - resid_pdrop=0.0, - embd_pdrop=0.0, - attn_pdrop=0.0, - layer_norm_epsilon=1e-5, - initializer_range=0.02, - use_cache=True, - bos_token_id=50256, - eos_token_id=50256, - pad_token_id=None, - tie_word_embeddings=False, - **kwargs, - ): - self.vocab_size = vocab_size - self.n_positions = n_positions - self.n_embd = n_embd - self.n_layer = n_layer - self.n_head = n_head - self.n_inner = n_inner - self.rotary_dim = rotary_dim - self.activation_function = activation_function - self.resid_pdrop = resid_pdrop - self.embd_pdrop = embd_pdrop - self.attn_pdrop = attn_pdrop - self.layer_norm_epsilon = layer_norm_epsilon - self.initializer_range = initializer_range - self.use_cache = use_cache - - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.tie_word_embeddings = tie_word_embeddings - super().__init__(**kwargs) + vocab_size: int = 50400 + n_positions: int = 2048 + n_embd: int = 4096 + n_layer: int = 28 + n_head: int = 16 + rotary_dim: int = 64 + n_inner: int | None = None + activation_function: str = "gelu_new" + resid_pdrop: float = 0.0 + embd_pdrop: float = 0.0 + attn_pdrop: float = 0.0 + layer_norm_epsilon: float = 1e-5 + initializer_range: float = 0.02 + use_cache: bool = True + bos_token_id: int | None = 50256 + eos_token_id: int | None = 50256 + pad_token_id: int | None = None + tie_word_embeddings: bool = False __all__ = ["GPTJConfig"] diff --git a/src/transformers/models/gptj/modeling_gptj.py b/src/transformers/models/gptj/modeling_gptj.py index 007cc6fd9822..d28ba6a2afb7 100644 --- a/src/transformers/models/gptj/modeling_gptj.py +++ b/src/transformers/models/gptj/modeling_gptj.py @@ -492,7 +492,7 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -623,7 +623,7 @@ def forward( `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict transformer_outputs = self.transformer( input_ids, 
@@ -711,7 +711,7 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict transformer_outputs = self.transformer( input_ids, @@ -819,7 +819,7 @@ def forward( is useful if you want more control over how to convert *input_ids* indices into associated vectors than the model's internal embedding lookup matrix. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.transformer( input_ids, diff --git a/src/transformers/models/granite/configuration_granite.py b/src/transformers/models/granite/configuration_granite.py index 51499c695370..4e891eabca3a 100644 --- a/src/transformers/models/granite/configuration_granite.py +++ b/src/transformers/models/granite/configuration_granite.py @@ -18,15 +18,15 @@ # limitations under the License. """Granite model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters -from ...utils import auto_docstring, logging - - -logger = logging.get_logger(__name__) +from ...utils import auto_docstring @auto_docstring(checkpoint="ibm-granite/granite-3.0-8b-base") +@strict(accept_kwargs=True) class GraniteConfig(PreTrainedConfig): r""" embedding_multiplier (`float`, *optional*, defaults to 1.0): embedding multiplier @@ -65,64 +65,35 @@ class GraniteConfig(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size: int | None = 32000, - hidden_size: int | None = 4096, - intermediate_size: int | None = 11008, - num_hidden_layers: int | None = 32, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = None, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 2048, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-6, - use_cache: bool | None = True, - pad_token_id: int | None = None, - bos_token_id: int | None = 1, - eos_token_id: int | None = 2, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - mlp_bias: bool | None = False, - embedding_multiplier: float | None = 1.0, - logits_scaling: float | None = 1.0, - residual_multiplier: float | None = 1.0, - attention_multiplier: float | None = 1.0, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.mlp_bias = mlp_bias - - self.embedding_multiplier = embedding_multiplier - self.logits_scaling = logits_scaling - 
self.residual_multiplier = residual_multiplier - self.attention_multiplier = attention_multiplier - self.rope_parameters = rope_parameters - - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - super().__init__(**kwargs) + vocab_size: int = 32000 + hidden_size: int = 4096 + intermediate_size: int = 11008 + num_hidden_layers: int = 32 + num_attention_heads: int = 32 + num_key_value_heads: int | None = None + hidden_act: str = "silu" + max_position_embeddings: int = 2048 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-6 + use_cache: bool = True + pad_token_id: int | None = None + bos_token_id: int | None = 1 + eos_token_id: int | list[int] | None = 2 + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + attention_dropout: float | int = 0.0 + mlp_bias: bool = False + embedding_multiplier: float = 1.0 + logits_scaling: float = 1.0 + residual_multiplier: float = 1.0 + attention_multiplier: float = 1.0 + + def __post_init__(self, **kwargs): + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads + + super().__post_init__(**kwargs) __all__ = ["GraniteConfig"] diff --git a/src/transformers/models/granite_speech/configuration_granite_speech.py b/src/transformers/models/granite_speech/configuration_granite_speech.py index 20c51993f74c..9d136ecbf7fe 100644 --- a/src/transformers/models/granite_speech/configuration_granite_speech.py +++ b/src/transformers/models/granite_speech/configuration_granite_speech.py @@ -13,12 +13,15 @@ # limitations under the License. """Config class for Granite Speech.""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring from ..auto import CONFIG_MAPPING, AutoConfig @auto_docstring(checkpoint="ibm-granite/granite-speech-3.2-8b") +@strict(accept_kwargs=True) class GraniteSpeechEncoderConfig(PreTrainedConfig): r""" feedforward_mult (`int`, *optional*, defaults to 4): @@ -51,38 +54,22 @@ class GraniteSpeechEncoderConfig(PreTrainedConfig): model_type = "granite_speech_encoder" - def __init__( - self, - input_dim=160, - num_layers=10, - hidden_dim=1024, - feedforward_mult=4, - num_heads=8, - dim_head=128, - output_dim=42, - context_size=200, - max_pos_emb=512, - dropout=0.1, - conv_kernel_size=15, - conv_expansion_factor=2, - **kwargs, - ): - super().__init__(**kwargs) - self.input_dim = input_dim - self.num_layers = num_layers - self.hidden_dim = hidden_dim - self.feedforward_mult = feedforward_mult - self.num_heads = num_heads - self.dim_head = dim_head - self.output_dim = output_dim - self.context_size = context_size - self.dropout = dropout - self.conv_kernel_size = conv_kernel_size - self.conv_expansion_factor = conv_expansion_factor - self.max_pos_emb = max_pos_emb + input_dim: int = 160 + num_layers: int = 10 + hidden_dim: int = 1024 + feedforward_mult: int = 4 + num_heads: int = 8 + dim_head: int = 128 + output_dim: int = 42 + context_size: int = 200 + max_pos_emb: int = 512 + dropout: float | int = 0.1 + conv_kernel_size: int = 15 + conv_expansion_factor: int = 2 @auto_docstring(checkpoint="ibm-granite/granite-speech-3.2-8b") +@strict(accept_kwargs=True) class GraniteSpeechConfig(PreTrainedConfig): r""" has_lora_adapter (`bool`, *optional*, defaults to `True`): @@ -120,43 +107,33 @@ class GraniteSpeechConfig(PreTrainedConfig): "projector_config": AutoConfig, } - def 
__init__( - self, - text_config=None, - encoder_config=None, - projector_config=None, - audio_token_index=49155, - initializer_range=0.02, - has_lora_adapter=True, - downsample_rate=5, - window_size=15, - **kwargs, - ): - if isinstance(text_config, dict): - text_config["model_type"] = text_config.get("model_type", "granite") - text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) - elif text_config is None: - text_config = CONFIG_MAPPING["granite"]() - - if isinstance(projector_config, dict): - projector_config["model_type"] = projector_config.get("model_type", "blip_2_qformer") - projector_config = CONFIG_MAPPING[projector_config["model_type"]](**projector_config) - elif projector_config is None: - projector_config = CONFIG_MAPPING["blip_2_qformer"]() - - if not isinstance(encoder_config, GraniteSpeechEncoderConfig): - encoder_config = {} if encoder_config is None else encoder_config - encoder_config = GraniteSpeechEncoderConfig(**encoder_config) - - self.text_config = text_config - self.encoder_config = encoder_config - self.projector_config = projector_config - self.audio_token_index = audio_token_index - self.initializer_range = initializer_range - self.has_lora_adapter = has_lora_adapter - self.downsample_rate = downsample_rate - self.window_size = window_size - super().__init__(**kwargs) + text_config: dict | PreTrainedConfig | None = None + encoder_config: dict | PreTrainedConfig | None = None + projector_config: dict | PreTrainedConfig | None = None + audio_token_index: int = 49155 + initializer_range: float = 0.02 + has_lora_adapter: bool = True + downsample_rate: int = 5 + window_size: int = 15 + + def __post_init__(self, **kwargs): + if isinstance(self.text_config, dict): + self.text_config["model_type"] = self.text_config.get("model_type", "granite") + self.text_config = CONFIG_MAPPING[self.text_config["model_type"]](**self.text_config) + elif self.text_config is None: + self.text_config = CONFIG_MAPPING["granite"]() + + if isinstance(self.projector_config, dict): + self.projector_config["model_type"] = self.projector_config.get("model_type", "blip_2_qformer") + self.projector_config = CONFIG_MAPPING[self.projector_config["model_type"]](**self.projector_config) + elif self.projector_config is None: + self.projector_config = CONFIG_MAPPING["blip_2_qformer"]() + + if not isinstance(self.encoder_config, GraniteSpeechEncoderConfig): + self.encoder_config = {} if self.encoder_config is None else self.encoder_config + self.encoder_config = GraniteSpeechEncoderConfig(**self.encoder_config) + + super().__post_init__(**kwargs) __all__ = ["GraniteSpeechEncoderConfig", "GraniteSpeechConfig"] diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index 43359ec98b7e..d68646b9fdcc 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -409,7 +409,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") diff --git a/src/transformers/models/granitemoe/configuration_granitemoe.py 
b/src/transformers/models/granitemoe/configuration_granitemoe.py index 0c744ada7600..4e22f425606f 100644 --- a/src/transformers/models/granitemoe/configuration_granitemoe.py +++ b/src/transformers/models/granitemoe/configuration_granitemoe.py @@ -18,15 +18,15 @@ # limitations under the License. """GraniteMoe model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters -from ...utils import auto_docstring, logging - - -logger = logging.get_logger(__name__) +from ...utils import auto_docstring @auto_docstring(checkpoint="ibm-granite/granite-speech-3.2-8b") +@strict(accept_kwargs=True) class GraniteMoeConfig(PreTrainedConfig): r""" embedding_multiplier (`float`, *optional*, defaults to 1.0): embedding multiplier @@ -50,72 +50,39 @@ class GraniteMoeConfig(PreTrainedConfig): model_type = "granitemoe" keys_to_ignore_at_inference = ["past_key_values"] - def __init__( - self, - vocab_size: int | None = 32000, - hidden_size: int | None = 4096, - intermediate_size: int | None = 11008, - num_hidden_layers: int | None = 32, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = None, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 2048, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-6, - use_cache: bool | None = True, - pad_token_id: int | None = None, - bos_token_id: int | None = 1, - eos_token_id: int | None = 2, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - embedding_multiplier: float | None = 1.0, - logits_scaling: float | None = 1.0, - residual_multiplier: float | None = 1.0, - attention_multiplier: float | None = 1.0, - num_local_experts: int | None = 8, - num_experts_per_tok: int | None = 2, - output_router_logits: bool | None = False, - router_aux_loss_coef: float | None = 0.001, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - + vocab_size: int = 32000 + hidden_size: int = 4096 + intermediate_size: int = 11008 + num_hidden_layers: int = 32 + num_attention_heads: int = 32 + num_key_value_heads: int | None = None + hidden_act: str = "silu" + max_position_embeddings: int = 2048 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-6 + use_cache: bool = True + pad_token_id: int | None = None + bos_token_id: int | None = 1 + eos_token_id: int | list[int] | None = 2 + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + attention_dropout: float | int | None = 0.0 + embedding_multiplier: float | None = 1.0 + logits_scaling: float | None = 1.0 + residual_multiplier: float | None = 1.0 + attention_multiplier: float | None = 1.0 + num_local_experts: int | None = 8 + num_experts_per_tok: int | None = 2 + output_router_logits: bool | None = False + router_aux_loss_coef: float | None = 0.001 + + def __post_init__(self, **kwargs): # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - 
self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - - self.embedding_multiplier = embedding_multiplier - self.logits_scaling = logits_scaling - self.residual_multiplier = residual_multiplier - self.attention_multiplier = attention_multiplier - - self.num_local_experts = num_local_experts - self.num_experts_per_tok = num_experts_per_tok - self.output_router_logits = output_router_logits - self.router_aux_loss_coef = router_aux_loss_coef - - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.rope_parameters = rope_parameters + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["GraniteMoeConfig"] diff --git a/src/transformers/models/granitemoehybrid/configuration_granitemoehybrid.py b/src/transformers/models/granitemoehybrid/configuration_granitemoehybrid.py index 4bbe828a0079..48278a265572 100644 --- a/src/transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +++ b/src/transformers/models/granitemoehybrid/configuration_granitemoehybrid.py @@ -14,15 +14,15 @@ # limitations under the License. """GraniteMoeHybrid model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters -from ...utils import auto_docstring, logging - - -logger = logging.get_logger(__name__) +from ...utils import auto_docstring @auto_docstring(checkpoint="ibm-granite/granite-speech-3.2-8b") +@strict(accept_kwargs=True) class GraniteMoeHybridConfig(PreTrainedConfig): r""" embedding_multiplier (`float`, *optional*, defaults to 1.0): embedding multiplier. 
@@ -40,7 +40,6 @@ class GraniteMoeHybridConfig(PreTrainedConfig): >>> # Initializing a GraniteMoeHybrid config >>> configuration = GraniteMoeHybridConfig() - >>> # Accessing the model configuration >>> configuration = model.config ```""" @@ -51,120 +50,76 @@ class GraniteMoeHybridConfig(PreTrainedConfig): } keys_to_ignore_at_inference = ["past_key_values"] - def __init__( - self, - vocab_size: int | None = 32000, - hidden_size: int | None = 4096, - intermediate_size: int | None = 11008, - num_hidden_layers: int | None = 32, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = None, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 2048, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-6, - use_cache: bool | None = True, - pad_token_id: int | None = None, - bos_token_id: int | None = 1, - eos_token_id: int | None = 2, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - embedding_multiplier: float | None = 1.0, - logits_scaling: float | None = 1.0, - residual_multiplier: float | None = 1.0, - attention_multiplier: float | None = 1.0, - num_local_experts: int | None = 8, - num_experts_per_tok: int | None = 2, - output_router_logits: bool | None = False, - router_aux_loss_coef: float | None = 0.001, - shared_intermediate_size: int | None = 1024, - position_embedding_type: str | None = None, - layer_types: list[str] | None = None, - mamba_n_heads: int | None = 128, - mamba_n_groups: int | None = 1, - mamba_d_state: int | None = 256, - mamba_d_head: str | None = "auto", - mamba_d_conv: int | None = 4, - mamba_expand: int | None = 2, - mamba_chunk_size: int | None = 256, - mamba_conv_bias: bool | None = True, - mamba_proj_bias: bool | None = False, - time_step_min: float | None = 0.001, - time_step_max: float | None = 0.1, - time_step_limit: tuple[float, float] | None = (0.0, float("inf")), - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.embedding_multiplier = embedding_multiplier - self.logits_scaling = logits_scaling - self.residual_multiplier = residual_multiplier - self.attention_multiplier = attention_multiplier - self.attention_dropout = attention_dropout - self.num_local_experts = num_local_experts - self.num_experts_per_tok = num_experts_per_tok - self.output_router_logits = output_router_logits - self.router_aux_loss_coef = router_aux_loss_coef - self.shared_intermediate_size = shared_intermediate_size - self.position_embedding_type = position_embedding_type - self.rope_parameters = rope_parameters - - mamba_intermediate = mamba_expand * hidden_size - - if layer_types is not None and any(layer_type not in ["mamba", "attention"] for layer_type in layer_types): - raise ValueError("layer_types must be a list strings in [`mamba` `attention`]") - - if mamba_intermediate % mamba_n_heads != 0: + vocab_size: 
int = 32000 + hidden_size: int = 4096 + intermediate_size: int = 11008 + num_hidden_layers: int = 32 + num_attention_heads: int = 32 + num_key_value_heads: int | None = None + hidden_act: str = "silu" + max_position_embeddings: int = 2048 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-6 + use_cache: bool = True + pad_token_id: int | None = None + bos_token_id: int | None = 1 + eos_token_id: int | list[int] | None = 2 + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + attention_dropout: float | int | None = 0.0 + embedding_multiplier: int | float | None = 1.0 + logits_scaling: int | float | None = 1.0 + residual_multiplier: int | float | None = 1.0 + attention_multiplier: int | float | None = 1.0 + num_local_experts: int | None = 8 + num_experts_per_tok: int | None = 2 + output_router_logits: bool | None = False + router_aux_loss_coef: float | None = 0.001 + shared_intermediate_size: int = 1024 + position_embedding_type: str | None = None + layer_types: list[str] | None = None + mamba_n_heads: int | None = 128 + mamba_n_groups: int | None = 1 + mamba_d_state: int | None = 256 + mamba_d_head: int | str | None = "auto" + mamba_d_conv: int | None = 4 + mamba_expand: int | None = 2 + mamba_chunk_size: int | None = 256 + mamba_conv_bias: bool | None = True + mamba_proj_bias: bool | None = False + time_step_min: float | None = 0.001 + time_step_max: float | None = 0.1 + time_step_limit: list[float] | tuple[float, float] | None = (0.0, float("inf")) + + def __post_init__(self, **kwargs): + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads + + mamba_intermediate = self.mamba_expand * self.hidden_size + if self.mamba_d_head == "auto": + self.mamba_d_head = mamba_intermediate // self.mamba_n_heads + + self.time_step_limit = tuple(self.time_step_limit) if self.time_step_limit is not None else None + if self.layer_types is None: + self.layer_types = ["mamba"] * self.num_hidden_layers + + super().__post_init__(**kwargs) + + def validate_architecture(self): + """Part of `@strict`-powered validation. 
Validates the architecture of the config.""" + + mamba_intermediate = self.mamba_expand * self.hidden_size + if mamba_intermediate % self.mamba_n_heads != 0: raise ValueError("mamba_n_heads must divide mamba_expand * hidden_size") - # for the mamba_v2, must satisfy the following - if mamba_d_head == "auto": - mamba_d_head = mamba_intermediate // mamba_n_heads - - if mamba_d_head * mamba_n_heads != mamba_intermediate: + if self.mamba_d_head * self.mamba_n_heads != mamba_intermediate: raise ValueError("The dimensions for the Mamba head state do not match the model intermediate_size") - self.mamba_n_heads = mamba_n_heads - self.mamba_d_head = mamba_d_head - self.mamba_n_groups = mamba_n_groups - self.mamba_d_state = mamba_d_state - self.mamba_d_conv = mamba_d_conv - self.mamba_chunk_size = mamba_chunk_size - self.mamba_conv_bias = mamba_conv_bias - self.mamba_proj_bias = mamba_proj_bias - self.time_step_min = time_step_min - self.time_step_max = time_step_max - self.time_step_limit = tuple(time_step_limit) if time_step_limit is not None else None - self.mamba_expand = mamba_expand - self.layer_types = layer_types - - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - super().__init__(**kwargs) - # overwrite the function to use in `HybridMambaAttentionDynamicCache` @property def layers_block_type(self): - return self.layer_types if self.layer_types else ["mamba"] * self.num_hidden_layers + return self.layer_types __all__ = ["GraniteMoeHybridConfig"] diff --git a/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py b/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py index 0f3aa5c1e2ee..125aa4da0d7d 100644 --- a/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py +++ b/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py @@ -18,15 +18,15 @@ # limitations under the License. """GraniteMoeShared model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters -from ...utils import auto_docstring, logging - - -logger = logging.get_logger(__name__) +from ...utils import auto_docstring @auto_docstring(checkpoint="ibm-granite/granite-speech-3.2-8b") +@strict(accept_kwargs=True) class GraniteMoeSharedConfig(PreTrainedConfig): r""" embedding_multiplier (`float`, *optional*, defaults to 1.0): embedding multiplier @@ -34,6 +34,7 @@ class GraniteMoeSharedConfig(PreTrainedConfig): residual_multiplier (`float`, *optional*, defaults to 1.0): residual multiplier attention_multiplier (`float`, *optional*, defaults to 1.0): attention multiplier shared_intermediate_size (`int`, *optional*, defaults to 1024): intermediate size for shared experts. + position_embedding_type (`str`, *optional*): Positional embedding type to be used; defaults to None. 
Allowed options: `[None, "rope"]` ```python >>> from transformers import GraniteMoeSharedModel, GraniteMoeSharedConfig @@ -51,77 +52,39 @@ class GraniteMoeSharedConfig(PreTrainedConfig): model_type = "granitemoeshared" keys_to_ignore_at_inference = ["past_key_values"] - def __init__( - self, - vocab_size: int | None = 32000, - hidden_size: int | None = 4096, - intermediate_size: int | None = 11008, - num_hidden_layers: int | None = 32, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = None, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 2048, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-6, - use_cache: bool | None = True, - pad_token_id: int | None = None, - bos_token_id: int | None = 1, - eos_token_id: int | None = 2, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - embedding_multiplier: float | None = 1.0, - logits_scaling: float | None = 1.0, - residual_multiplier: float | None = 1.0, - attention_multiplier: float | None = 1.0, - num_local_experts: int | None = 8, - num_experts_per_tok: int | None = 2, - output_router_logits: bool | None = False, - router_aux_loss_coef: float | None = 0.001, - shared_intermediate_size: int | None = 0, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - - self.embedding_multiplier = embedding_multiplier - self.logits_scaling = logits_scaling - self.residual_multiplier = residual_multiplier - self.attention_multiplier = attention_multiplier - - self.num_local_experts = num_local_experts - self.num_experts_per_tok = num_experts_per_tok - self.output_router_logits = output_router_logits - self.router_aux_loss_coef = router_aux_loss_coef - self.shared_intermediate_size = shared_intermediate_size - - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - - # this model has rope embedding type, hardcoded for BC - self.position_embedding_type = "rope" - self.rope_parameters = rope_parameters - - super().__init__(**kwargs) + vocab_size: int = 32000 + hidden_size: int = 4096 + intermediate_size: int = 11008 + num_hidden_layers: int = 32 + num_attention_heads: int = 32 + num_key_value_heads: int | None = None + hidden_act: str = "silu" + max_position_embeddings: int = 2048 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-6 + use_cache: bool = True + pad_token_id: int | None = None + bos_token_id: int | None = 1 + eos_token_id: int | list[int] | None = 2 + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + attention_dropout: float | int | None = 0.0 + embedding_multiplier: float | None = 1.0 + logits_scaling: float | None = 1.0 + residual_multiplier: 
float | None = 1.0 + attention_multiplier: float | None = 1.0 + num_local_experts: int | None = 8 + num_experts_per_tok: int | None = 2 + output_router_logits: bool | None = False + router_aux_loss_coef: float | None = 0.001 + shared_intermediate_size: int = 0 + + def __post_init__(self, **kwargs): + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads + + super().__post_init__(**kwargs) __all__ = ["GraniteMoeSharedConfig"] diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 97be2cd8eca5..4fc452c785f3 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -13,6 +13,8 @@ # limitations under the License. """Grounding DINO model configuration""" +from huggingface_hub.dataclasses import strict + from ...backbone_utils import consolidate_backbone_kwargs_to_config from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring, logging @@ -23,6 +25,7 @@ @auto_docstring(checkpoint="IDEA-Research/grounding-dino-tiny") +@strict(accept_kwargs=True) class GroundingDinoConfig(PreTrainedConfig): r""" num_queries (`int`, *optional*, defaults to 900): @@ -84,116 +87,68 @@ class GroundingDinoConfig(PreTrainedConfig): "num_attention_heads": "encoder_attention_heads", } - def __init__( - self, - backbone_config=None, - text_config=None, - num_queries=900, - encoder_layers=6, - encoder_ffn_dim=2048, - encoder_attention_heads=8, - decoder_layers=6, - decoder_ffn_dim=2048, - decoder_attention_heads=8, - is_encoder_decoder=True, - activation_function="relu", - d_model=256, - dropout=0.1, - attention_dropout=0.0, - activation_dropout=0.0, - auxiliary_loss=False, - position_embedding_type="sine", - num_feature_levels=4, - encoder_n_points=4, - decoder_n_points=4, - two_stage=True, - class_cost=1.0, - bbox_cost=5.0, - giou_cost=2.0, - bbox_loss_coefficient=5.0, - giou_loss_coefficient=2.0, - focal_alpha=0.25, - disable_custom_kernels=False, - # other parameters - max_text_len=256, - text_enhancer_dropout=0.0, - fusion_droppath=0.1, - fusion_dropout=0.0, - embedding_init_target=True, - query_dim=4, - decoder_bbox_embed_share=True, - two_stage_bbox_embed_share=False, - positional_embedding_temperature=20, - init_std=0.02, - layer_norm_eps=1e-5, - tie_word_embeddings=True, - **kwargs, - ): - backbone_config, kwargs = consolidate_backbone_kwargs_to_config( - backbone_config=backbone_config, + backbone_config: dict | PreTrainedConfig | None = None + text_config: dict | PreTrainedConfig | None = None + num_queries: int = 900 + encoder_layers: int = 6 + encoder_ffn_dim: int = 2048 + encoder_attention_heads: int = 8 + decoder_layers: int = 6 + decoder_ffn_dim: int = 2048 + decoder_attention_heads: int = 8 + is_encoder_decoder: bool = True + activation_function: str = "relu" + d_model: int = 256 + dropout: float | int = 0.1 + attention_dropout: float | int = 0.0 + activation_dropout: float | int = 0.0 + auxiliary_loss: bool = False + position_embedding_type: str = "sine" + num_feature_levels: int = 4 + encoder_n_points: int = 4 + decoder_n_points: int = 4 + two_stage: bool = True + class_cost: float = 1.0 + bbox_cost: float = 5.0 + giou_cost: float = 2.0 + bbox_loss_coefficient: float = 5.0 + giou_loss_coefficient: float = 2.0 + focal_alpha: float = 0.25 + disable_custom_kernels: bool = False + max_text_len: int = 256 + text_enhancer_dropout: 
float | int = 0.0 + fusion_droppath: float = 0.1 + fusion_dropout: float | int = 0.0 + embedding_init_target: bool = True + query_dim: int = 4 + decoder_bbox_embed_share: bool = True + two_stage_bbox_embed_share: bool = False + positional_embedding_temperature: int = 20 + init_std: float = 0.02 + layer_norm_eps: float = 1e-5 + tie_word_embeddings: bool = True + + def __post_init__(self, **kwargs): + self.backbone_config, kwargs = consolidate_backbone_kwargs_to_config( + backbone_config=self.backbone_config, default_config_type="swin", default_config_kwargs={"out_indices": [2, 3, 4]}, **kwargs, ) - self.backbone_config = backbone_config - self.num_queries = num_queries - self.d_model = d_model - self.encoder_ffn_dim = encoder_ffn_dim - self.encoder_layers = encoder_layers - self.encoder_attention_heads = encoder_attention_heads - self.decoder_ffn_dim = decoder_ffn_dim - self.decoder_layers = decoder_layers - self.decoder_attention_heads = decoder_attention_heads - self.dropout = dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.activation_function = activation_function - self.auxiliary_loss = auxiliary_loss - self.position_embedding_type = position_embedding_type - # deformable attributes - self.num_feature_levels = num_feature_levels - self.encoder_n_points = encoder_n_points - self.decoder_n_points = decoder_n_points - self.two_stage = two_stage - # Hungarian matcher - self.class_cost = class_cost - self.bbox_cost = bbox_cost - self.giou_cost = giou_cost - # Loss coefficients - self.bbox_loss_coefficient = bbox_loss_coefficient - self.giou_loss_coefficient = giou_loss_coefficient - self.focal_alpha = focal_alpha - self.disable_custom_kernels = disable_custom_kernels - # Text backbone - if isinstance(text_config, dict): - text_config["model_type"] = text_config.get("model_type", "bert") - text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) - elif text_config is None: - text_config = CONFIG_MAPPING["bert"]() + if isinstance(self.text_config, dict): + self.text_config["model_type"] = self.text_config.get("model_type", "bert") + self.text_config = CONFIG_MAPPING[self.text_config["model_type"]](**self.text_config) + elif self.text_config is None: + self.text_config = CONFIG_MAPPING["bert"]() logger.info("text_config is None. Initializing the text config with default values (`BertConfig`).") - self.text_config = text_config - self.max_text_len = max_text_len - - # Text Enhancer - self.text_enhancer_dropout = text_enhancer_dropout - # Fusion - self.fusion_droppath = fusion_droppath - self.fusion_dropout = fusion_dropout - # Others - self.embedding_init_target = embedding_init_target - self.query_dim = query_dim - self.decoder_bbox_embed_share = decoder_bbox_embed_share - self.two_stage_bbox_embed_share = two_stage_bbox_embed_share - if two_stage_bbox_embed_share and not decoder_bbox_embed_share: - raise ValueError("If two_stage_bbox_embed_share is True, decoder_bbox_embed_share must be True.") - self.positional_embedding_temperature = positional_embedding_temperature - self.init_std = init_std - self.layer_norm_eps = layer_norm_eps - self.tie_word_embeddings = tie_word_embeddings + super().__post_init__(**kwargs) - super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + def validate_architecture(self): + """Part of `@strict`-powered validation. 
Validates the architecture of the config.""" + if self.two_stage_bbox_embed_share and not self.decoder_bbox_embed_share: + raise ValueError("If two_stage_bbox_embed_share is True, decoder_bbox_embed_share must be True.") __all__ = ["GroundingDinoConfig"] diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 26952ada4894..953a9c7b0250 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1525,7 +1525,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict reference_points = self.get_reference_points(spatial_shapes_list, valid_ratios, device=vision_features.device) @@ -1676,7 +1676,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if inputs_embeds is not None: hidden_states = inputs_embeds @@ -2074,7 +2074,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict text_self_attention_masks, position_ids = generate_masks_with_special_tokens_and_transfer_map(input_ids) @@ -2500,7 +2500,7 @@ def forward( Detected a cat with confidence 0.438 at location [12.27, 51.91, 316.86, 472.44] Detected a remote control with confidence 0.478 at location [38.57, 70.0, 176.78, 118.18] ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if attention_mask is None: attention_mask = torch.ones_like(input_ids) diff --git a/src/transformers/models/groupvit/configuration_groupvit.py b/src/transformers/models/groupvit/configuration_groupvit.py index c2516d7fa01b..262100f98663 100644 --- a/src/transformers/models/groupvit/configuration_groupvit.py +++ b/src/transformers/models/groupvit/configuration_groupvit.py @@ -13,6 +13,8 @@ # limitations under the License. 
"""GroupViT model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring, logging @@ -21,6 +23,7 @@ @auto_docstring(checkpoint="nvidia/groupvit-gcc-yfcc") +@strict(accept_kwargs=True) class GroupViTTextConfig(PreTrainedConfig): r""" Example: @@ -40,45 +43,25 @@ class GroupViTTextConfig(PreTrainedConfig): model_type = "groupvit_text_model" base_config_key = "text_config" - def __init__( - self, - vocab_size=49408, - hidden_size=256, - intermediate_size=1024, - num_hidden_layers=12, - num_attention_heads=4, - max_position_embeddings=77, - hidden_act="quick_gelu", - layer_norm_eps=1e-5, - dropout=0.0, - attention_dropout=0.0, - initializer_range=0.02, - initializer_factor=1.0, - pad_token_id=1, - bos_token_id=49406, - eos_token_id=49407, - **kwargs, - ): - super().__init__(**kwargs) - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.dropout = dropout - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.max_position_embeddings = max_position_embeddings - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.initializer_factor = initializer_factor - self.attention_dropout = attention_dropout + vocab_size: int = 49408 + hidden_size: int = 256 + intermediate_size: int = 1024 + num_hidden_layers: int = 12 + num_attention_heads: int = 4 + max_position_embeddings: int = 77 + hidden_act: str = "quick_gelu" + layer_norm_eps: float = 1e-5 + dropout: float | int = 0.0 + attention_dropout: float | int = 0.0 + initializer_range: float = 0.02 + initializer_factor: float = 1.0 + pad_token_id: int | None = 1 + bos_token_id: int | None = 49406 + eos_token_id: int | None = 49407 @auto_docstring(checkpoint="nvidia/groupvit-gcc-yfcc") +@strict(accept_kwargs=True) class GroupViTVisionConfig(PreTrainedConfig): r""" depths (`list[int]`, *optional*, defaults to [6, 3, 3]): @@ -109,87 +92,81 @@ class GroupViTVisionConfig(PreTrainedConfig): model_type = "groupvit_vision_model" base_config_key = "vision_config" - def __init__( - self, - hidden_size=384, - intermediate_size=1536, - depths=[6, 3, 3], - num_hidden_layers=12, - num_group_tokens=[64, 8, 0], - num_output_groups=[64, 8, 8], - num_attention_heads=6, - image_size=224, - patch_size=16, - num_channels=3, - hidden_act="gelu", - layer_norm_eps=1e-5, - dropout=0.0, - attention_dropout=0.0, - initializer_range=0.02, - initializer_factor=1.0, - assign_eps=1.0, - assign_mlp_ratio=[0.5, 4], - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.depths = depths - if num_hidden_layers != sum(depths): + hidden_size: int = 384 + intermediate_size: int = 1536 + num_hidden_layers: int = 12 + depths: list[int] | tuple[int, ...] = (6, 3, 3) + num_group_tokens: list[int] | tuple[int, ...] = (64, 8, 0) + num_output_groups: list[int] | tuple[int, ...] 
= (64, 8, 8) + num_attention_heads: int = 6 + image_size: int | list[int] | tuple[int, int] = 224 + patch_size: int | list[int] | tuple[int, int] = 16 + num_channels: int = 3 + hidden_act: str = "gelu" + layer_norm_eps: float = 1e-5 + dropout: float | int = 0.0 + attention_dropout: float | int = 0.0 + initializer_range: float = 0.02 + initializer_factor: float = 1.0 + assign_eps: float = 1.0 + assign_mlp_ratio: list[float | int] | tuple[float | int, ...] = (0.5, 4) + + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + if self.num_hidden_layers != sum(self.depths): logger.warning( - f"Manually setting num_hidden_layers to {num_hidden_layers}, but we expect num_hidden_layers =" - f" sum(depth) = {sum(depths)}" + f"Manually setting num_hidden_layers to {self.num_hidden_layers}, but we expect num_hidden_layers =" + f" sum(depth) = {sum(self.depths)}" ) - self.num_hidden_layers = num_hidden_layers - self.num_group_tokens = num_group_tokens - self.num_output_groups = num_output_groups - self.num_attention_heads = num_attention_heads - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.hidden_act = hidden_act - self.layer_norm_eps = layer_norm_eps - self.dropout = dropout - self.attention_dropout = attention_dropout - self.initializer_range = initializer_range - self.initializer_factor = initializer_factor - self.assign_eps = assign_eps - self.assign_mlp_ratio = assign_mlp_ratio @auto_docstring(checkpoint="nvidia/groupvit-gcc-yfcc") +@strict(accept_kwargs=True) class GroupViTConfig(PreTrainedConfig): r""" projection_intermediate_dim (`int`, *optional*, defaults to 4096): Dimensionality of intermediate layer of text and vision projection layers. + output_segmentation (`bool`, *optional*, defaults to False): + Whether or not to return the segmentation logits. """ model_type = "groupvit" sub_configs = {"text_config": GroupViTTextConfig, "vision_config": GroupViTVisionConfig} - def __init__( - self, - text_config=None, - vision_config=None, - projection_dim=256, - projection_intermediate_dim=4096, - logit_scale_init_value=2.6592, - **kwargs, - ): - # If `_config_dict` exist, we use them for the backward compatibility. - # We pop out these 2 attributes before calling `super().__init__` to avoid them being saved (which causes a lot - # of confusion!). - text_config_dict = kwargs.pop("text_config_dict", None) - vision_config_dict = kwargs.pop("vision_config_dict", None) + text_config: dict | PreTrainedConfig | None = None + vision_config: dict | PreTrainedConfig | None = None + projection_dim: int = 256 + projection_intermediate_dim: int = 4096 + logit_scale_init_value: float = 2.6592 + initializer_range: float = 0.02 + initializer_factor: float = 1.0 + output_segmentation: bool = False + + def __post_init__(self, **kwargs): + if self.text_config is None: + text_config = {} + logger.info("`text_config` is `None`. Initializing the `GroupViTTextConfig` with default values.") + elif isinstance(self.text_config, GroupViTTextConfig): + text_config = self.text_config.to_dict() + else: + text_config = self.text_config + + if self.vision_config is None: + vision_config = {} + logger.info("`vision_config` is `None`. 
initializing the `GroupViTVisionConfig` with default values.") + elif isinstance(self.vision_config, GroupViTVisionConfig): + vision_config = self.vision_config.to_dict() + else: + vision_config = self.vision_config + # For backward compatibility check keyword args # Instead of simply assigning `[text|vision]_config_dict` to `[text|vision]_config`, we use the values in # `[text|vision]_config_dict` to update the values in `[text|vision]_config`. The values should be same in most # cases, but we don't want to break anything regarding `_config_dict` that existed before commit `8827e1b2`. - if text_config_dict is not None: - if text_config is None: - text_config = {} + text_config_dict = kwargs.pop("text_config_dict", None) + vision_config_dict = kwargs.pop("vision_config_dict", None) + if text_config_dict is not None: # This is the complete result when using `text_config_dict`. _text_config_dict = GroupViTTextConfig(**text_config_dict).to_dict() @@ -205,8 +182,8 @@ def __init__( # If inferred from default argument values (just to be super careful) else: message = ( - f"`text_config_dict` is provided which will be used to initialize `GroupViTTextConfig`. " - f'The value `text_config["{key}"]` will be overridden.' + f"`text_config_dict` is provided which will be used to initialize `GroupViTTextConfig`. The " + f'value `text_config["{key}"]` will be overridden.' ) logger.info(message) @@ -214,9 +191,6 @@ def __init__( text_config.update(_text_config_dict) if vision_config_dict is not None: - if vision_config is None: - vision_config = {} - # This is the complete result when using `vision_config_dict`. _vision_config_dict = GroupViTVisionConfig(**vision_config_dict).to_dict() # convert keys to string instead of integer @@ -237,36 +211,19 @@ def __init__( # If inferred from default argument values (just to be super careful) else: message = ( - f"`vision_config_dict` is provided which will be used to initialize `GroupViTVisionConfig`." - f' The value `vision_config["{key}"]` will be overridden.' + f"`vision_config_dict` is provided which will be used to initialize `GroupViTVisionConfig`. " + f'The value `vision_config["{key}"]` will be overridden.' ) logger.info(message) # Update all values in `vision_config` with the ones in `_vision_config_dict`. vision_config.update(_vision_config_dict) - if text_config is None: - text_config = GroupViTTextConfig() - logger.info("`text_config` is `None`. initializing the `GroupViTTextConfig` with default values.") - elif isinstance(text_config, dict): - text_config = GroupViTTextConfig(**text_config) + # Finally we can convert back our unified text/vision configs to `PretrainedConfig` + self.text_config = GroupViTTextConfig(**text_config) + self.vision_config = GroupViTVisionConfig(**vision_config) - if vision_config is None: - vision_config = GroupViTVisionConfig() - logger.info("`vision_config` is `None`. 
initializing the `GroupViTVisionConfig` with default values.") - elif isinstance(vision_config, dict): - vision_config = GroupViTVisionConfig(**vision_config) - - self.text_config = text_config - self.vision_config = vision_config - - self.projection_dim = projection_dim - self.projection_intermediate_dim = projection_intermediate_dim - self.logit_scale_init_value = logit_scale_init_value - self.initializer_range = 0.02 - self.initializer_factor = 1.0 - self.output_segmentation = False - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["GroupViTConfig", "GroupViTTextConfig", "GroupViTVisionConfig"] diff --git a/src/transformers/models/groupvit/modeling_groupvit.py b/src/transformers/models/groupvit/modeling_groupvit.py index c4ce1b6de525..f765c7bbef7c 100644 --- a/src/transformers/models/groupvit/modeling_groupvit.py +++ b/src/transformers/models/groupvit/modeling_groupvit.py @@ -317,7 +317,7 @@ class GroupViTPatchEmbeddings(nn.Module): def __init__( self, - image_size: int = 224, + image_size: int | list[int] | tuple[int, int] = 224, patch_size: int | tuple[int, int] = 16, num_channels: int = 3, embed_dim: int = 768, @@ -790,7 +790,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict all_hidden_states = () if output_hidden_states else None all_groupings = () if output_attentions else None @@ -1014,7 +1014,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if pixel_values is None: raise ValueError("You have to specify pixel_values") diff --git a/src/transformers/models/helium/configuration_helium.py b/src/transformers/models/helium/configuration_helium.py index c544b11e8797..2ec4370271ec 100644 --- a/src/transformers/models/helium/configuration_helium.py +++ b/src/transformers/models/helium/configuration_helium.py @@ -14,12 +14,15 @@ # limitations under the License. 
+from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring @auto_docstring(checkpoint="kyutai/helium-1-preview") +@strict(accept_kwargs=True) class HeliumConfig(PreTrainedConfig): r""" Example: @@ -52,52 +55,26 @@ class HeliumConfig(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size: int | None = 48000, - hidden_size: int | None = 2560, - intermediate_size: int | None = 7040, - num_hidden_layers: int | None = 24, - num_attention_heads: int | None = 20, - num_key_value_heads: int | None = 20, - head_dim: int | None = 128, - hidden_act: str | None = "silu", - attention_dropout: float | None = 0.0, - max_position_embeddings: int | None = 4096, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-8, - use_cache: bool | None = True, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - pad_token_id: int | None = 3, - eos_token_id: int | None = 2, - bos_token_id: int | None = 1, - attention_bias: bool | None = False, - mlp_bias: bool | None = False, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads - self.head_dim = head_dim - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.mlp_bias = mlp_bias - self.rope_parameters = rope_parameters - - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - super().__init__(**kwargs) + vocab_size: int = 48000 + hidden_size: int = 2560 + intermediate_size: int = 7040 + num_hidden_layers: int = 24 + num_attention_heads: int = 20 + num_key_value_heads: int = 20 + head_dim: int = 128 + hidden_act: str = "silu" + attention_dropout: float | int = 0.0 + max_position_embeddings: int = 4096 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-8 + use_cache: bool = True + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + pad_token_id: int | None = 3 + eos_token_id: int | list[int] | None = 2 + bos_token_id: int | None = 1 + attention_bias: bool = False + mlp_bias: bool = False __all__ = ["HeliumConfig"] diff --git a/src/transformers/models/hgnet_v2/configuration_hgnet_v2.py b/src/transformers/models/hgnet_v2/configuration_hgnet_v2.py index 458029813a0b..e9bd3e348c7c 100644 --- a/src/transformers/models/hgnet_v2/configuration_hgnet_v2.py +++ b/src/transformers/models/hgnet_v2/configuration_hgnet_v2.py @@ -19,6 +19,8 @@ # limitations under the License. 
+from huggingface_hub.dataclasses import strict + from ...backbone_utils import BackboneConfigMixin from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring @@ -27,6 +29,7 @@ # TODO: Modular conversion for resnet must be fixed as # it provides incorrect import for configuration like resnet_resnet @auto_docstring(checkpoint="ustc-community/dfine_x_coco") +@strict(accept_kwargs=True) class HGNetV2Config(BackboneConfigMixin, PreTrainedConfig): """ stem_channels (`list[int]`, *optional*, defaults to `[3, 32, 48]`): @@ -63,57 +66,44 @@ class HGNetV2Config(BackboneConfigMixin, PreTrainedConfig): model_type = "hgnet_v2" - def __init__( - self, - num_channels=3, - embedding_size=64, - depths=[3, 4, 6, 3], - hidden_sizes=[256, 512, 1024, 2048], - hidden_act="relu", - out_features=None, - out_indices=None, - stem_channels=[3, 32, 48], - stage_in_channels=[48, 128, 512, 1024], - stage_mid_channels=[48, 96, 192, 384], - stage_out_channels=[128, 512, 1024, 2048], - stage_num_blocks=[1, 1, 3, 1], - stage_downsample=[False, True, True, True], - stage_light_block=[False, False, True, True], - stage_kernel_size=[3, 3, 5, 5], - stage_numb_of_layers=[6, 6, 6, 6], - use_learnable_affine_block=False, - initializer_range=0.02, - **kwargs, - ): - super().__init__(**kwargs) - self.num_channels = num_channels - self.embedding_size = embedding_size - self.depths = depths - self.hidden_sizes = hidden_sizes - self.hidden_act = hidden_act - self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)] - self.set_output_features_output_indices(out_indices=out_indices, out_features=out_features) - self.stem_channels = stem_channels - self.stage_in_channels = stage_in_channels - self.stage_mid_channels = stage_mid_channels - self.stage_out_channels = stage_out_channels - self.stage_num_blocks = stage_num_blocks - self.stage_downsample = stage_downsample - self.stage_light_block = stage_light_block - self.stage_kernel_size = stage_kernel_size - self.stage_numb_of_layers = stage_numb_of_layers - self.use_learnable_affine_block = use_learnable_affine_block - self.initializer_range = initializer_range + num_channels: int = 3 + embedding_size: int = 64 + depths: list[int] | tuple[int, ...] = (3, 4, 6, 3) + hidden_sizes: list[int] | tuple[int, ...] = (256, 512, 1024, 2048) + hidden_act: str = "relu" + _out_features: list[str] | None = None + _out_indices: list[int] | None = None + stem_channels: list[int] | tuple[int, ...] = (3, 32, 48) + stage_in_channels: list[int] | tuple[int, ...] = (48, 128, 512, 1024) + stage_mid_channels: list[int] | tuple[int, ...] = (48, 96, 192, 384) + stage_out_channels: list[int] | tuple[int, ...] = (128, 512, 1024, 2048) + stage_num_blocks: list[int] | tuple[int, ...] = (1, 1, 3, 1) + stage_downsample: list[bool] | tuple[bool, ...] = (False, True, True, True) + stage_light_block: list[bool] | tuple[bool, ...] = (False, False, True, True) + stage_kernel_size: list[int] | tuple[int, ...] = (3, 3, 5, 5) + stage_numb_of_layers: list[int] | tuple[int, ...] 
= (6, 6, 6, 6) + use_learnable_affine_block: bool = False + initializer_range: float = 0.02 + + def __post_init__(self, **kwargs): + self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(self.depths) + 1)] + self.set_output_features_output_indices( + out_indices=kwargs.pop("out_indices", None), out_features=kwargs.pop("out_features", None) + ) + self.hidden_sizes = list(self.hidden_sizes) + super().__post_init__(**kwargs) + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" if not ( - len(stage_in_channels) - == len(stage_mid_channels) - == len(stage_out_channels) - == len(stage_num_blocks) - == len(stage_downsample) - == len(stage_light_block) - == len(stage_kernel_size) - == len(stage_numb_of_layers) + len(self.stage_in_channels) + == len(self.stage_mid_channels) + == len(self.stage_out_channels) + == len(self.stage_num_blocks) + == len(self.stage_downsample) + == len(self.stage_light_block) + == len(self.stage_kernel_size) + == len(self.stage_numb_of_layers) ): raise ValueError("All stage configuration lists must have the same length.") diff --git a/src/transformers/models/hgnet_v2/modeling_hgnet_v2.py b/src/transformers/models/hgnet_v2/modeling_hgnet_v2.py index c517eea90e64..4d91bd0031fa 100644 --- a/src/transformers/models/hgnet_v2/modeling_hgnet_v2.py +++ b/src/transformers/models/hgnet_v2/modeling_hgnet_v2.py @@ -381,7 +381,7 @@ def forward( >>> list(feature_maps[-1].shape) [1, 2048, 7, 7] ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) @@ -468,7 +468,7 @@ def forward( >>> outputs.logits.shape torch.Size([1, 2]) ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) diff --git a/src/transformers/models/hgnet_v2/modular_hgnet_v2.py b/src/transformers/models/hgnet_v2/modular_hgnet_v2.py index 9a97c0ed9e50..3fad2b531241 100644 --- a/src/transformers/models/hgnet_v2/modular_hgnet_v2.py +++ b/src/transformers/models/hgnet_v2/modular_hgnet_v2.py @@ -15,6 +15,7 @@ import torch import torch.nn.functional as F +from huggingface_hub.dataclasses import strict from torch import Tensor, nn from ... 
import initialization as init @@ -36,6 +37,7 @@ # TODO: Modular conversion for resnet must be fixed as # it provides incorrect import for configuration like resnet_resnet @auto_docstring(checkpoint="ustc-community/dfine_x_coco") +@strict(accept_kwargs=True) class HGNetV2Config(BackboneConfigMixin, PreTrainedConfig): """ stem_channels (`list[int]`, *optional*, defaults to `[3, 32, 48]`): @@ -72,57 +74,44 @@ class HGNetV2Config(BackboneConfigMixin, PreTrainedConfig): model_type = "hgnet_v2" - def __init__( - self, - num_channels=3, - embedding_size=64, - depths=[3, 4, 6, 3], - hidden_sizes=[256, 512, 1024, 2048], - hidden_act="relu", - out_features=None, - out_indices=None, - stem_channels=[3, 32, 48], - stage_in_channels=[48, 128, 512, 1024], - stage_mid_channels=[48, 96, 192, 384], - stage_out_channels=[128, 512, 1024, 2048], - stage_num_blocks=[1, 1, 3, 1], - stage_downsample=[False, True, True, True], - stage_light_block=[False, False, True, True], - stage_kernel_size=[3, 3, 5, 5], - stage_numb_of_layers=[6, 6, 6, 6], - use_learnable_affine_block=False, - initializer_range=0.02, - **kwargs, - ): - super().__init__(**kwargs) - self.num_channels = num_channels - self.embedding_size = embedding_size - self.depths = depths - self.hidden_sizes = hidden_sizes - self.hidden_act = hidden_act - self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)] - self.set_output_features_output_indices(out_indices=out_indices, out_features=out_features) - self.stem_channels = stem_channels - self.stage_in_channels = stage_in_channels - self.stage_mid_channels = stage_mid_channels - self.stage_out_channels = stage_out_channels - self.stage_num_blocks = stage_num_blocks - self.stage_downsample = stage_downsample - self.stage_light_block = stage_light_block - self.stage_kernel_size = stage_kernel_size - self.stage_numb_of_layers = stage_numb_of_layers - self.use_learnable_affine_block = use_learnable_affine_block - self.initializer_range = initializer_range + num_channels: int = 3 + embedding_size: int = 64 + depths: list[int] | tuple[int, ...] = (3, 4, 6, 3) + hidden_sizes: list[int] | tuple[int, ...] = (256, 512, 1024, 2048) + hidden_act: str = "relu" + _out_features: list[str] | None = None + _out_indices: list[int] | None = None + stem_channels: list[int] | tuple[int, ...] = (3, 32, 48) + stage_in_channels: list[int] | tuple[int, ...] = (48, 128, 512, 1024) + stage_mid_channels: list[int] | tuple[int, ...] = (48, 96, 192, 384) + stage_out_channels: list[int] | tuple[int, ...] = (128, 512, 1024, 2048) + stage_num_blocks: list[int] | tuple[int, ...] = (1, 1, 3, 1) + stage_downsample: list[bool] | tuple[bool, ...] = (False, True, True, True) + stage_light_block: list[bool] | tuple[bool, ...] = (False, False, True, True) + stage_kernel_size: list[int] | tuple[int, ...] = (3, 3, 5, 5) + stage_numb_of_layers: list[int] | tuple[int, ...] = (6, 6, 6, 6) + use_learnable_affine_block: bool = False + initializer_range: float = 0.02 + + def __post_init__(self, **kwargs): + self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(self.depths) + 1)] + self.set_output_features_output_indices( + out_indices=kwargs.pop("out_indices", None), out_features=kwargs.pop("out_features", None) + ) + self.hidden_sizes = list(self.hidden_sizes) + super().__post_init__(**kwargs) + def validate_architecture(self): + """Part of `@strict`-powered validation. 
Validates the architecture of the config.""" if not ( - len(stage_in_channels) - == len(stage_mid_channels) - == len(stage_out_channels) - == len(stage_num_blocks) - == len(stage_downsample) - == len(stage_light_block) - == len(stage_kernel_size) - == len(stage_numb_of_layers) + len(self.stage_in_channels) + == len(self.stage_mid_channels) + == len(self.stage_out_channels) + == len(self.stage_num_blocks) + == len(self.stage_downsample) + == len(self.stage_light_block) + == len(self.stage_kernel_size) + == len(self.stage_numb_of_layers) ): raise ValueError("All stage configuration lists must have the same length.") @@ -473,7 +462,7 @@ def forward( >>> list(feature_maps[-1].shape) [1, 2048, 7, 7] ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) @@ -560,7 +549,7 @@ def forward( >>> outputs.logits.shape torch.Size([1, 2]) ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) diff --git a/src/transformers/models/hiera/configuration_hiera.py b/src/transformers/models/hiera/configuration_hiera.py index 0d60ad75d942..0409d5a8e19f 100644 --- a/src/transformers/models/hiera/configuration_hiera.py +++ b/src/transformers/models/hiera/configuration_hiera.py @@ -13,15 +13,15 @@ # limitations under the License. """Hiera model configuration""" +from huggingface_hub.dataclasses import strict + from ...backbone_utils import BackboneConfigMixin from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - - -logger = logging.get_logger(__name__) +from ...utils import auto_docstring @auto_docstring(checkpoint="facebook/hiera-base-224") +@strict(accept_kwargs=True) class HieraConfig(BackboneConfigMixin, PreTrainedConfig): r""" layer_norm_init (`float`, *optional*, defaults to 1.0): @@ -68,78 +68,55 @@ class HieraConfig(BackboneConfigMixin, PreTrainedConfig): attribute_map = {"num_hidden_layers": "num_layers"} - def __init__( - self, - embed_dim=96, - image_size=[224, 224], - patch_size=[7, 7], - patch_stride=[4, 4], - patch_padding=[3, 3], - mlp_ratio=4.0, - depths=[2, 3, 16, 3], - num_heads=[1, 2, 4, 8], - embed_dim_multiplier=2.0, - num_query_pool=3, - query_stride=[2, 2], - masked_unit_size=[8, 8], - masked_unit_attention=[True, True, False, False], - drop_path_rate=0.0, - num_channels=3, - hidden_act="gelu", - initializer_range=0.02, - layer_norm_init=1.0, - layer_norm_eps=1e-6, - decoder_hidden_size=None, - decoder_depth=None, - decoder_num_heads=None, - normalize_pixel_loss=True, - mask_ratio=0.6, - out_features=None, - out_indices=None, - **kwargs, - ): - super().__init__(**kwargs) - if masked_unit_size[0] % query_stride[0] ** (len(depths) - 1) != 0: + embed_dim: int = 96 + image_size: list[int] | tuple[int, ...] = (224, 224) + patch_size: list[int] | tuple[int, ...] = (7, 7) + patch_stride: list[int] | tuple[int, ...] = (4, 4) + patch_padding: list[int] | tuple[int, ...] = (3, 3) + mlp_ratio: float = 4.0 + depths: list[int] | tuple[int, ...] = (2, 3, 16, 3) + num_heads: list[int] | tuple[int, ...] 
= (1, 2, 4, 8) + embed_dim_multiplier: float = 2.0 + num_query_pool: int = 3 + query_stride: list[int] | tuple[int, ...] = (2, 2) + masked_unit_size: list[int] | tuple[int, ...] = (8, 8) + masked_unit_attention: list[bool] | tuple[bool, ...] = (True, True, False, False) + drop_path_rate: float = 0.0 + num_channels: int = 3 + hidden_act: str = "gelu" + initializer_range: float = 0.02 + layer_norm_init: float = 1.0 + layer_norm_eps: float = 1e-6 + decoder_hidden_size: int | None = None + decoder_depth: int | None = None + decoder_num_heads: int | None = None + normalize_pixel_loss: bool | None = True + mask_ratio: float = 0.6 + _out_features: list[str] | None = None + _out_indices: list[int] | None = None + + def __post_init__(self, **kwargs): + # we set the hidden_size attribute in order to make Hiera work with VisionEncoderDecoderModel + # this indicates the channel dimension after the last stage of the model + self.hidden_size = int(self.embed_dim * self.embed_dim_multiplier ** (len(self.depths) - 1)) + self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(self.depths) + 1)] + self.set_output_features_output_indices( + out_indices=kwargs.pop("out_indices", None), out_features=kwargs.pop("out_features", None) + ) + super().__post_init__(**kwargs) + + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + if self.masked_unit_size[0] % self.query_stride[0] ** (len(self.depths) - 1) != 0: raise ValueError( - f"masked_unit_size[0] ({masked_unit_size[0]}) must be divisible by query_stride[0] ({query_stride[0]}) " - f"raised to the power of the number of layers ({len(depths) - 1})" + f"masked_unit_size[0] ({self.masked_unit_size[0]}) must be divisible by query_stride[0] ({self.query_stride[0]}) " + f"raised to the power of the number of layers ({len(self.depths) - 1})" ) - if num_query_pool >= len(depths): + if self.num_query_pool >= len(self.depths): raise ValueError( - f"num_query_pool ({num_query_pool}) must be less than the number of layers ({len(depths)})" + f"num_query_pool ({self.num_query_pool}) must be less than the number of layers ({len(self.depths)})" ) - self.embed_dim = embed_dim - self.image_size = image_size - self.patch_size = patch_size - self.patch_stride = patch_stride - self.patch_padding = patch_padding - self.mlp_ratio = mlp_ratio - self.depths = depths - self.num_heads = num_heads - self.num_layers = len(depths) - self.embed_dim_multiplier = embed_dim_multiplier - self.num_query_pool = num_query_pool - self.query_stride = query_stride - self.masked_unit_size = masked_unit_size - self.masked_unit_attention = masked_unit_attention - self.drop_path_rate = drop_path_rate - self.num_channels = num_channels - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.layer_norm_init = layer_norm_init - self.layer_norm_eps = layer_norm_eps - self.decoder_hidden_size = decoder_hidden_size - self.decoder_depth = decoder_depth - self.decoder_num_heads = decoder_num_heads - self.normalize_pixel_loss = normalize_pixel_loss - self.mask_ratio = mask_ratio - # we set the hidden_size attribute in order to make Hiera work with VisionEncoderDecoderModel - # this indicates the channel dimension after the last stage of the model - self.hidden_size = int(embed_dim * embed_dim_multiplier ** (len(depths) - 1)) - self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)] - self.set_output_features_output_indices(out_indices=out_indices, out_features=out_features) - 
__all__ = ["HieraConfig"] diff --git a/src/transformers/models/hiera/modeling_hiera.py b/src/transformers/models/hiera/modeling_hiera.py index c962d096d847..59386c69b211 100644 --- a/src/transformers/models/hiera/modeling_hiera.py +++ b/src/transformers/models/hiera/modeling_hiera.py @@ -857,7 +857,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -1161,7 +1161,7 @@ def forward( >>> print(list(logits.shape)) [1, 196, 768] ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -1260,7 +1260,7 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -1359,7 +1359,7 @@ def forward( >>> list(feature_maps[-1].shape) [1, 768, 7, 7] ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) diff --git a/src/transformers/models/higgs_audio_v2/configuration_higgs_audio_v2.py b/src/transformers/models/higgs_audio_v2/configuration_higgs_audio_v2.py index 369095719298..4cc4984a20aa 100644 --- a/src/transformers/models/higgs_audio_v2/configuration_higgs_audio_v2.py +++ b/src/transformers/models/higgs_audio_v2/configuration_higgs_audio_v2.py @@ -19,11 +19,16 @@ # limitations under the License. 
+from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig +from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring +from ...utils.type_validators import interval @auto_docstring(checkpoint="bosonai/higgs-audio-v2-generation-3B-base") +@strict(accept_kwargs=True) class HiggsAudioV2Config(PreTrainedConfig): r""" audio_bos_token_id (`int`, *optional*, defaults to 128013): @@ -68,80 +73,59 @@ class HiggsAudioV2Config(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size=128256, - hidden_size=3072, - intermediate_size=8192, - num_hidden_layers=28, - num_attention_heads=24, - num_key_value_heads=8, - hidden_act="silu", - max_position_embeddings=2048, - initializer_range=0.02, - rms_norm_eps=1e-05, - use_cache=True, - pad_token_id=128001, - bos_token_id=1, - eos_token_id=128009, - pretraining_tp=1, - tie_word_embeddings=False, - rope_parameters={ - "factor": 32.0, - "rope_theta": 500000.0, - "high_freq_factor": 0.5, - "low_freq_factor": 0.125, - "original_max_position_embeddings": 1024, - "rope_type": "llama3", - }, - attention_bias=False, - attention_dropout=0.0, - mlp_bias=False, - head_dim=128, - num_codebooks=8, - codebook_size=1024, - audio_token_id=128016, - audio_bos_token_id=128013, - audio_delay_token_id=128014, - audio_stream_bos_id=1024, - audio_stream_eos_id=1025, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.pretraining_tp = pretraining_tp - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.mlp_bias = mlp_bias - self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads - self.rope_parameters = rope_parameters - - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - super().__init__(**kwargs) - self.num_codebooks = num_codebooks - self.codebook_size = codebook_size - self.audio_token_id = audio_token_id - self.audio_bos_token_id = audio_bos_token_id - self.audio_delay_token_id = audio_delay_token_id - self.audio_stream_bos_id = audio_stream_bos_id - self.audio_stream_eos_id = audio_stream_eos_id + vocab_size: int = 128256 + hidden_size: int = 3072 + intermediate_size: int = 8192 + num_hidden_layers: int = 28 + num_attention_heads: int = 24 + num_key_value_heads: int = 8 + hidden_act: str = "silu" + max_position_embeddings: int = 2048 + initializer_range: float = interval(min=0.0, max=1.0)(default=0.02) + rms_norm_eps: float = 1e-5 + use_cache: bool = True + pad_token_id: int | None = 128001 + bos_token_id: int | None = 1 + eos_token_id: int | None = 128009 + pretraining_tp: int | None = 1 + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + attention_dropout: int | float | None = 0.0 + mlp_bias: bool = False + head_dim: int | None = 128 + num_codebooks: int = 8 + codebook_size: int = 
1024 + audio_token_id: int = 128016 + audio_bos_token_id: int = 128013 + audio_delay_token_id: int = 128014 + audio_stream_bos_id: int = 1024 + audio_stream_eos_id: int = 1025 + + def __post_init__(self, **kwargs): + if self.rope_parameters is None: + self.rope_parameters = { + "factor": 32.0, + "rope_theta": 500000.0, + "high_freq_factor": 0.5, + "low_freq_factor": 0.125, + "original_max_position_embeddings": 1024, + "rope_type": "llama3", + } + if self.head_dim is None: + self.head_dim = self.hidden_size // self.num_attention_heads + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads + + super().__post_init__(**kwargs) + + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + if self.hidden_size % self.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({self.hidden_size}) is not a multiple of the number of attention " + f"heads ({self.num_attention_heads})." + ) __all__ = ["HiggsAudioV2Config"] diff --git a/src/transformers/models/higgs_audio_v2/modular_higgs_audio_v2.py b/src/transformers/models/higgs_audio_v2/modular_higgs_audio_v2.py index 22e09f6bc985..f7430e850dc0 100644 --- a/src/transformers/models/higgs_audio_v2/modular_higgs_audio_v2.py +++ b/src/transformers/models/higgs_audio_v2/modular_higgs_audio_v2.py @@ -15,6 +15,7 @@ import torch import torch.nn as nn +from huggingface_hub.dataclasses import strict from ... import initialization as init from ...cache_utils import Cache, DynamicCache @@ -39,6 +40,7 @@ @auto_docstring(checkpoint="bosonai/higgs-audio-v2-generation-3B-base") +@strict(accept_kwargs=True) class HiggsAudioV2Config(LlamaConfig): r""" audio_bos_token_id (`int`, *optional*, defaults to 128013): @@ -65,76 +67,35 @@ class HiggsAudioV2Config(LlamaConfig): >>> configuration = model.config ```""" - def __init__( - self, - vocab_size=128256, - hidden_size=3072, - intermediate_size=8192, - num_hidden_layers=28, - num_attention_heads=24, - num_key_value_heads=8, - hidden_act="silu", - max_position_embeddings=2048, - initializer_range=0.02, - rms_norm_eps=1e-05, - use_cache=True, - pad_token_id=128001, - bos_token_id=1, - eos_token_id=128009, - pretraining_tp=1, - tie_word_embeddings=False, - rope_parameters={ - "factor": 32.0, - "rope_theta": 500000.0, - "high_freq_factor": 0.5, - "low_freq_factor": 0.125, - "original_max_position_embeddings": 1024, - "rope_type": "llama3", - }, - attention_bias=False, - attention_dropout=0.0, - mlp_bias=False, - head_dim=128, - num_codebooks=8, - codebook_size=1024, - audio_token_id=128016, - audio_bos_token_id=128013, - audio_delay_token_id=128014, - audio_stream_bos_id=1024, - audio_stream_eos_id=1025, - **kwargs, - ): - super().__init__( - vocab_size=vocab_size, - hidden_size=hidden_size, - intermediate_size=intermediate_size, - num_hidden_layers=num_hidden_layers, - num_attention_heads=num_attention_heads, - num_key_value_heads=num_key_value_heads, - hidden_act=hidden_act, - max_position_embeddings=max_position_embeddings, - initializer_range=initializer_range, - rms_norm_eps=rms_norm_eps, - use_cache=use_cache, - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - pretraining_tp=pretraining_tp, - tie_word_embeddings=tie_word_embeddings, - rope_parameters=rope_parameters, - attention_bias=attention_bias, - attention_dropout=attention_dropout, - mlp_bias=mlp_bias, - head_dim=head_dim, - **kwargs, - ) - self.num_codebooks = num_codebooks - self.codebook_size = codebook_size - 
self.audio_token_id = audio_token_id - self.audio_bos_token_id = audio_bos_token_id - self.audio_delay_token_id = audio_delay_token_id - self.audio_stream_bos_id = audio_stream_bos_id - self.audio_stream_eos_id = audio_stream_eos_id + vocab_size: int = 128256 + rms_norm_eps: float = 1e-5 + hidden_size: int = 3072 + intermediate_size: int = 8192 + num_hidden_layers: int = 28 + num_attention_heads: int = 24 + num_key_value_heads: int = 8 + pad_token_id: int | None = 128001 + eos_token_id: int | None = 128009 + head_dim: int | None = 128 + num_codebooks: int = 8 + codebook_size: int = 1024 + audio_token_id: int = 128016 + audio_bos_token_id: int = 128013 + audio_delay_token_id: int = 128014 + audio_stream_bos_id: int = 1024 + audio_stream_eos_id: int = 1025 + + def __post_init__(self, **kwargs): + if self.rope_parameters is None: + self.rope_parameters = { + "factor": 32.0, + "rope_theta": 500000.0, + "high_freq_factor": 0.5, + "low_freq_factor": 0.125, + "original_max_position_embeddings": 1024, + "rope_type": "llama3", + } + super().__post_init__(**kwargs) class HiggsAudioV2MLP(LlamaMLP): diff --git a/src/transformers/models/higgs_audio_v2_tokenizer/configuration_higgs_audio_v2_tokenizer.py b/src/transformers/models/higgs_audio_v2_tokenizer/configuration_higgs_audio_v2_tokenizer.py index 423241c83306..511aae74aa90 100644 --- a/src/transformers/models/higgs_audio_v2_tokenizer/configuration_higgs_audio_v2_tokenizer.py +++ b/src/transformers/models/higgs_audio_v2_tokenizer/configuration_higgs_audio_v2_tokenizer.py @@ -22,6 +22,7 @@ import math import numpy as np +from huggingface_hub.dataclasses import strict from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring @@ -29,6 +30,7 @@ @auto_docstring(checkpoint="bosonai/higgs-audio-v2-tokenizer") +@strict(accept_kwargs=True) class HiggsAudioV2TokenizerConfig(PreTrainedConfig): r""" target_bandwidths (`List[float]`, *optional*, defaults to `[0.5, 1, 1.5, 2]`): @@ -88,62 +90,50 @@ class HiggsAudioV2TokenizerConfig(PreTrainedConfig): "mask_time_prob": 0.0, } - def __init__( - self, - target_bandwidths=[0.5, 1, 1.5, 2], - sample_rate=24000, - kernel_size=3, - channel_ratios=[1, 1], - strides=[1, 1], - block_dilations=[1, 1], - unit_kernel_size=3, - codebook_size=1024, - codebook_dim=64, - initializer_range=0.02, - acoustic_model_config=None, - semantic_model_config=None, - semantic_sample_rate=16000, - downsample_factor=320, - **kwargs, - ): - if isinstance(acoustic_model_config, dict): - acoustic_model_config["model_type"] = acoustic_model_config.get("model_type", "dac") - acoustic_model_config = CONFIG_MAPPING[acoustic_model_config["model_type"]]( - **{**self._default_acoustic_model_config_kwargs, **acoustic_model_config} + target_bandwidths: list[int | float] | tuple[int | float, ...] = (0.5, 1, 1.5, 2, 4) + sample_rate: int = 24000 + kernel_size: int = 3 + channel_ratios: list[int] | tuple[int, ...] = (1, 1) + strides: list[int] | tuple[int, ...] = (1, 1) + block_dilations: list[int] | tuple[int, ...] 
= (1, 1) + unit_kernel_size: int = 3 + codebook_size: int = 1024 + codebook_dim: int = 64 + initializer_range: float = 0.02 + acoustic_model_config: dict | PreTrainedConfig | None = None + semantic_model_config: dict | PreTrainedConfig | None = None + semantic_sample_rate: int = 16000 + downsample_factor: int = 320 + + def __post_init__(self, **kwargs): + if self.acoustic_model_config is None: + self.acoustic_model_config = CONFIG_MAPPING["dac"]( + encoder_hidden_size=64, + # NOTE: original DAC uses [2, 4, 8, 8] `downsampling ratios`, namely reverse of `upsampling_ratios` + # (not sure if intentional by HiggsAudioV2Tokenizer but we keep it) + downsampling_ratios=[8, 5, 4, 2], + decoder_hidden_size=1024, + upsampling_ratios=[8, 5, 4, 2], + hidden_size=256, ) - elif acoustic_model_config is None: - acoustic_model_config = CONFIG_MAPPING["dac"](**self._default_acoustic_model_config_kwargs) - self.acoustic_model_config = acoustic_model_config - - if isinstance(semantic_model_config, dict): - semantic_model_config["model_type"] = semantic_model_config.get("model_type", "hubert") - semantic_model_config = CONFIG_MAPPING[semantic_model_config["model_type"]]( - **{**self._default_semantic_model_config_kwargs, **semantic_model_config} + elif isinstance(self.acoustic_model_config, dict): + self.acoustic_model_config["model_type"] = self.acoustic_model_config.get("model_type", "dac") + self.acoustic_model_config = CONFIG_MAPPING[self.acoustic_model_config["model_type"]]( + **{**self._default_acoustic_model_config_kwargs, **self.acoustic_model_config} ) - elif semantic_model_config is None: - semantic_model_config = CONFIG_MAPPING["hubert"](**self._default_semantic_model_config_kwargs) - self.semantic_model_config = semantic_model_config - - if target_bandwidths is None: - target_bandwidths = [0.5, 1, 1.5, 2, 4] - - self.target_bandwidths = target_bandwidths - self.sample_rate = sample_rate - self.kernel_size = kernel_size - self.channel_ratios = channel_ratios - self.strides = strides - self.block_dilations = block_dilations - self.unit_kernel_size = unit_kernel_size - self.codebook_size = codebook_size - self.initializer_range = initializer_range - if codebook_dim is None: - codebook_dim = self.acoustic_model_config.hidden_size + self.semantic_model_config.hidden_size - self.codebook_dim = codebook_dim - - super().__init__(**kwargs) - - self.semantic_sample_rate = semantic_sample_rate - self.downsample_factor = downsample_factor + + if self.semantic_model_config is None: + self.semantic_model_config = CONFIG_MAPPING["hubert"]() + elif isinstance(self.semantic_model_config, dict): + self.semantic_model_config["model_type"] = self.semantic_model_config.get("model_type", "hubert") + self.semantic_model_config = CONFIG_MAPPING[self.semantic_model_config["model_type"]]( + **{**self._default_semantic_model_config_kwargs, **self.semantic_model_config} + ) + + if self.codebook_dim is None: + self.codebook_dim = self.acoustic_model_config.hidden_size + self.semantic_model_config.hidden_size + + super().__post_init__(**kwargs) @property def frame_rate(self) -> int: diff --git a/src/transformers/models/higgs_audio_v2_tokenizer/modeling_higgs_audio_v2_tokenizer.py b/src/transformers/models/higgs_audio_v2_tokenizer/modeling_higgs_audio_v2_tokenizer.py index 445967939690..fd93e72ca013 100644 --- a/src/transformers/models/higgs_audio_v2_tokenizer/modeling_higgs_audio_v2_tokenizer.py +++ b/src/transformers/models/higgs_audio_v2_tokenizer/modeling_higgs_audio_v2_tokenizer.py @@ -18,6 +18,7 @@ # See the License for 
the specific language governing permissions and # limitations under the License. + import math from dataclasses import dataclass from functools import lru_cache diff --git a/src/transformers/models/higgs_audio_v2_tokenizer/modular_higgs_audio_v2_tokenizer.py b/src/transformers/models/higgs_audio_v2_tokenizer/modular_higgs_audio_v2_tokenizer.py index ba30e116e0ba..2d21b906c3f9 100644 --- a/src/transformers/models/higgs_audio_v2_tokenizer/modular_higgs_audio_v2_tokenizer.py +++ b/src/transformers/models/higgs_audio_v2_tokenizer/modular_higgs_audio_v2_tokenizer.py @@ -12,10 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. + import torch import torch.nn as nn import torch.nn.functional as F import torchaudio +from huggingface_hub.dataclasses import strict from ...utils import auto_docstring from ...utils.import_utils import requires @@ -24,6 +26,7 @@ @auto_docstring(checkpoint="bosonai/higgs-audio-v2-tokenizer") +@strict(accept_kwargs=True) class HiggsAudioV2TokenizerConfig(XcodecConfig): r""" target_bandwidths (`List[float]`, *optional*, defaults to `[0.5, 1, 1.5, 2]`): @@ -66,42 +69,11 @@ class HiggsAudioV2TokenizerConfig(XcodecConfig): "mask_time_prob": 0.0, } - def __init__( - self, - target_bandwidths=[0.5, 1, 1.5, 2], - sample_rate=24000, - kernel_size=3, - channel_ratios=[1, 1], - strides=[1, 1], - block_dilations=[1, 1], - unit_kernel_size=3, - codebook_size=1024, - codebook_dim=64, - initializer_range=0.02, - acoustic_model_config=None, - semantic_model_config=None, - semantic_sample_rate=16000, - downsample_factor=320, - **kwargs, - ): - super().__init__( - target_bandwidths=target_bandwidths, - sample_rate=sample_rate, - kernel_size=kernel_size, - channel_ratios=channel_ratios, - strides=strides, - block_dilations=block_dilations, - unit_kernel_size=unit_kernel_size, - codebook_size=codebook_size, - codebook_dim=codebook_dim, - initializer_range=initializer_range, - acoustic_model_config=acoustic_model_config, - semantic_model_config=semantic_model_config, - **kwargs, - ) - - self.semantic_sample_rate = semantic_sample_rate - self.downsample_factor = downsample_factor + target_bandwidths: list[int | float] | tuple[int | float, ...] 
= (0.5, 1, 1.5, 2, 4) + sample_rate: int = 24000 + codebook_dim: int = 64 + semantic_sample_rate: int = 16000 + downsample_factor: int = 320 @property def semantic_downsample_factor(self): diff --git a/src/transformers/models/hubert/configuration_hubert.py b/src/transformers/models/hubert/configuration_hubert.py index f14b464a39af..016b698ae6a8 100644 --- a/src/transformers/models/hubert/configuration_hubert.py +++ b/src/transformers/models/hubert/configuration_hubert.py @@ -16,14 +16,14 @@ import functools import operator -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="facebook/hubert-base-ls960") +@strict(accept_kwargs=True) class HubertConfig(PreTrainedConfig): r""" final_dropout (`float`, *optional*, defaults to 0.1): @@ -121,82 +121,52 @@ class HubertConfig(PreTrainedConfig): model_type = "hubert" - def __init__( - self, - vocab_size=32, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout=0.1, - activation_dropout=0.1, - attention_dropout=0.1, - feat_proj_layer_norm=True, - feat_proj_dropout=0.0, - final_dropout=0.1, - layerdrop=0.1, - initializer_range=0.02, - layer_norm_eps=1e-5, - feat_extract_norm="group", - feat_extract_activation="gelu", - conv_dim=(512, 512, 512, 512, 512, 512, 512), - conv_stride=(5, 2, 2, 2, 2, 2, 2), - conv_kernel=(10, 3, 3, 3, 3, 2, 2), - conv_bias=False, - num_conv_pos_embeddings=128, - num_conv_pos_embedding_groups=16, - conv_pos_batch_norm=False, - do_stable_layer_norm=False, - apply_spec_augment=True, - mask_time_prob=0.05, - mask_time_length=10, - mask_time_min_masks=2, - mask_feature_prob=0.0, - mask_feature_length=10, - mask_feature_min_masks=0, - ctc_loss_reduction="sum", - ctc_zero_infinity=False, - use_weighted_layer_sum=False, - classifier_proj_size=256, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - **kwargs, - ): - super().__init__(**kwargs) - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.hidden_size = hidden_size - self.feat_extract_norm = feat_extract_norm - self.feat_extract_activation = feat_extract_activation - self.conv_dim = list(conv_dim) - self.conv_stride = list(conv_stride) - self.conv_kernel = list(conv_kernel) - self.conv_bias = conv_bias - self.num_conv_pos_embeddings = num_conv_pos_embeddings - self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups - self.conv_pos_batch_norm = conv_pos_batch_norm + vocab_size: int = 32 + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout: float | int = 0.1 + activation_dropout: float | int = 0.1 + attention_dropout: float | int = 0.1 + feat_proj_layer_norm: bool = True + feat_proj_dropout: float | int = 0.0 + final_dropout: float | int = 0.1 + layerdrop: float | int = 0.1 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-5 + feat_extract_norm: str = "group" + feat_extract_activation: str = "gelu" + conv_dim: list[int] | tuple[int, ...] = (512, 512, 512, 512, 512, 512, 512) + conv_stride: list[int] | tuple[int, ...] = (5, 2, 2, 2, 2, 2, 2) + conv_kernel: list[int] | tuple[int, ...] 
= (10, 3, 3, 3, 3, 2, 2) + conv_bias: bool = False + num_conv_pos_embeddings: int = 128 + num_conv_pos_embedding_groups: int = 16 + conv_pos_batch_norm: bool = False + do_stable_layer_norm: bool = False + apply_spec_augment: bool = True + mask_time_prob: float = 0.05 + mask_time_length: int = 10 + mask_time_min_masks: int = 2 + mask_feature_prob: float = 0.0 + mask_feature_length: int = 10 + mask_feature_min_masks: int = 0 + ctc_loss_reduction: str = "sum" + ctc_zero_infinity: bool = False + use_weighted_layer_sum: bool = False + classifier_proj_size: int = 256 + pad_token_id: int | None = 0 + bos_token_id: int | None = 1 + eos_token_id: int | None = 2 + + def __post_init__(self, **kwargs): self.num_feat_extract_layers = len(self.conv_dim) - self.num_hidden_layers = num_hidden_layers - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.num_attention_heads = num_attention_heads - self.hidden_dropout = hidden_dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.feat_proj_layer_norm = feat_proj_layer_norm - self.feat_proj_dropout = feat_proj_dropout - self.final_dropout = final_dropout - self.layerdrop = layerdrop - self.layer_norm_eps = layer_norm_eps - self.initializer_range = initializer_range - self.vocab_size = vocab_size - self.do_stable_layer_norm = do_stable_layer_norm - self.use_weighted_layer_sum = use_weighted_layer_sum - self.classifier_proj_size = classifier_proj_size + super().__post_init__(**kwargs) + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" if ( (len(self.conv_stride) != self.num_feat_extract_layers) or (len(self.conv_kernel) != self.num_feat_extract_layers) @@ -209,19 +179,6 @@ def __init__( f" `len(config.conv_kernel) = {len(self.conv_kernel)}`." ) - # fine-tuning config parameters for SpecAugment: https://huggingface.co/papers/1904.08779 - self.apply_spec_augment = apply_spec_augment - self.mask_time_prob = mask_time_prob - self.mask_time_length = mask_time_length - self.mask_time_min_masks = mask_time_min_masks - self.mask_feature_prob = mask_feature_prob - self.mask_feature_length = mask_feature_length - self.mask_feature_min_masks = mask_feature_min_masks - - # ctc loss - self.ctc_loss_reduction = ctc_loss_reduction - self.ctc_zero_infinity = ctc_zero_infinity - @property def inputs_to_logits_ratio(self): return functools.reduce(operator.mul, self.conv_stride, 1) diff --git a/src/transformers/models/hubert/modeling_hubert.py b/src/transformers/models/hubert/modeling_hubert.py index e4c72a104b64..c7cc5917702c 100755 --- a/src/transformers/models/hubert/modeling_hubert.py +++ b/src/transformers/models/hubert/modeling_hubert.py @@ -926,7 +926,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict extract_features = self.feature_extractor(input_values) extract_features = extract_features.transpose(1, 2) @@ -1053,7 +1053,7 @@ def forward( All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size - 1]`. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if labels is not None and labels.max() >= self.config.vocab_size: raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") @@ -1171,7 +1171,7 @@ def forward( `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states outputs = self.hubert( diff --git a/src/transformers/models/hubert/modular_hubert.py b/src/transformers/models/hubert/modular_hubert.py index 9a80467f03e6..59a72d3269cb 100644 --- a/src/transformers/models/hubert/modular_hubert.py +++ b/src/transformers/models/hubert/modular_hubert.py @@ -260,7 +260,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict extract_features = self.feature_extractor(input_values) extract_features = extract_features.transpose(1, 2) diff --git a/src/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py b/src/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py index 0bf33f59a109..c5dd8fc76058 100644 --- a/src/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +++ b/src/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py @@ -13,15 +13,15 @@ # limitations under the License. 
"""HunYuanDenseV1 model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters -from ...utils import auto_docstring, logging - - -logger = logging.get_logger(__name__) +from ...utils import auto_docstring @auto_docstring(checkpoint="tencent/Hunyuan-7B-Instruct") +@strict(accept_kwargs=True) class HunYuanDenseV1Config(PreTrainedConfig): r""" eod_token_id (int, *optional*, defaults to 3): @@ -32,57 +32,32 @@ class HunYuanDenseV1Config(PreTrainedConfig): model_type = "hunyuan_v1_dense" keys_to_ignore_at_inference = ["past_key_values"] - def __init__( - self, - vocab_size: int | None = 290943, - hidden_size: int | None = 4096, - intermediate_size: int | None = 11008, - num_hidden_layers: int | None = 32, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = None, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 2048, - initializer_range: float | None = 0.02, - rms_norm_eps: float | None = 1e-5, - use_cache: bool | None = True, - pad_token_id: int | None = 0, - bos_token_id: int | None = 1, - eos_token_id: int | None = 2, - eod_token_id: int | None = 3, - pretraining_tp: int | None = 1, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - head_dim: int | None = None, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.head_dim = head_dim - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.pretraining_tp = pretraining_tp - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.rope_parameters = rope_parameters - - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - super().__init__(**kwargs) + vocab_size: int = 290943 + hidden_size: int = 4096 + intermediate_size: int = 11008 + num_hidden_layers: int = 32 + num_attention_heads: int = 32 + num_key_value_heads: int | None = None + hidden_act: str = "silu" + max_position_embeddings: int = 2048 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-5 + use_cache: bool = True + pad_token_id: int | None = 0 + bos_token_id: int | None = 1 + eos_token_id: int | list[int] | None = 2 + eod_token_id: int | None = 3 + pretraining_tp: int = 1 + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + attention_dropout: float | int = 0.0 + head_dim: int | None = None + + def __post_init__(self, **kwargs): + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads + super().__post_init__(**kwargs) __all__ = ["HunYuanDenseV1Config"] diff --git a/src/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py b/src/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py index b000e68b54fe..ade1f7265412 100644 --- 
a/src/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +++ b/src/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py @@ -13,15 +13,15 @@ # limitations under the License. """HunYuanMoEV1 model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters -from ...utils import auto_docstring, logging - - -logger = logging.get_logger(__name__) +from ...utils import auto_docstring @auto_docstring(checkpoint="tencent/Hunyuan-A13B-Instruct") +@strict(accept_kwargs=True) class HunYuanMoEV1Config(PreTrainedConfig): r""" eod_token_id (int, *optional*, defaults to 3): @@ -38,64 +38,35 @@ class HunYuanMoEV1Config(PreTrainedConfig): "num_local_experts": "num_experts", } - def __init__( - self, - vocab_size: int | None = 290943, - hidden_size: int | None = 4096, - intermediate_size: int | None = 11008, - num_hidden_layers: int | None = 32, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = None, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 2048, - initializer_range: float | None = 0.02, - rms_norm_eps: float | None = 1e-5, - use_cache: bool | None = True, - pad_token_id: int | None = 0, - bos_token_id: int | None = 1, - eos_token_id: int | None = 2, - eod_token_id: int | None = 3, - sep_token_id: int | None = 4, - pretraining_tp: int | None = 1, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - num_experts: int | list = 1, - moe_topk: int | list = 1, - head_dim: int | None = None, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_experts = num_experts - self.moe_topk = moe_topk - - self.head_dim = head_dim - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.pretraining_tp = pretraining_tp - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.rope_parameters = rope_parameters + vocab_size: int = 290943 + hidden_size: int = 4096 + intermediate_size: int = 11008 + num_hidden_layers: int = 32 + num_attention_heads: int = 32 + num_key_value_heads: int | None = None + hidden_act: str = "silu" + max_position_embeddings: int = 2048 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-5 + use_cache: bool = True + pad_token_id: int | None = 0 + bos_token_id: int | None = 1 + eos_token_id: int | list[int] | None = 2 + eod_token_id: int | None = 3 + sep_token_id: int | None = 4 + pretraining_tp: int = 1 + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + attention_dropout: float | int = 0.0 + num_experts: int = 1 + moe_topk: int = 1 + head_dim: int | None = None - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.sep_token_id = sep_token_id - self.tie_word_embeddings = tie_word_embeddings - 
super().__init__(**kwargs) + def __post_init__(self, **kwargs): + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads + super().__post_init__(**kwargs) def _rope_parameters_validation(self): """ diff --git a/src/transformers/models/ibert/configuration_ibert.py b/src/transformers/models/ibert/configuration_ibert.py index 80f3b111c789..ec0674b31247 100644 --- a/src/transformers/models/ibert/configuration_ibert.py +++ b/src/transformers/models/ibert/configuration_ibert.py @@ -15,14 +15,14 @@ # limitations under the License. """I-BERT configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="kssteven/ibert-roberta-base") +@strict(accept_kwargs=True) class IBertConfig(PreTrainedConfig): r""" type_vocab_size (`int`, *optional*, defaults to 2): @@ -39,46 +39,23 @@ class IBertConfig(PreTrainedConfig): model_type = "ibert" - def __init__( - self, - vocab_size=30522, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=2, - initializer_range=0.02, - layer_norm_eps=1e-12, - pad_token_id=1, - bos_token_id=0, - eos_token_id=2, - quant_mode=False, - force_dequant="none", - **kwargs, - ): - super().__init__(**kwargs) - - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.quant_mode = quant_mode - self.force_dequant = force_dequant + vocab_size: int = 30522 + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.1 + attention_probs_dropout_prob: float = 0.1 + max_position_embeddings: int = 512 + type_vocab_size: int = 2 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + pad_token_id: int | None = 1 + bos_token_id: int | None = 0 + eos_token_id: int | None = 2 + quant_mode: bool = False + force_dequant: str = "none" __all__ = ["IBertConfig"] diff --git a/src/transformers/models/ibert/modeling_ibert.py b/src/transformers/models/ibert/modeling_ibert.py index bf66f619b499..58c0dd37d3c1 100644 --- a/src/transformers/models/ibert/modeling_ibert.py +++ b/src/transformers/models/ibert/modeling_ibert.py @@ -673,7 +673,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both 
input_ids and inputs_embeds at the same time") @@ -769,7 +769,7 @@ def forward( config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.ibert( input_ids, @@ -860,7 +860,7 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.ibert( input_ids, @@ -965,7 +965,7 @@ def forward( is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None @@ -1042,7 +1042,7 @@ def forward( labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.ibert( input_ids, @@ -1123,7 +1123,7 @@ def forward( return_dict: bool | None = None, **kwargs, ) -> QuestionAnsweringModelOutput | tuple[torch.FloatTensor]: - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.ibert( input_ids, diff --git a/src/transformers/models/idefics/configuration_idefics.py b/src/transformers/models/idefics/configuration_idefics.py index c793e54191b4..4375025c6560 100644 --- a/src/transformers/models/idefics/configuration_idefics.py +++ b/src/transformers/models/idefics/configuration_idefics.py @@ -18,53 +18,34 @@ # limitations under the License. 
"""Idefics model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="HuggingFaceM4/idefics-9b") +@strict(accept_kwargs=True) class IdeficsVisionConfig(PreTrainedConfig): model_type = "idefics_vision" - attribute_map = { - "hidden_size": "embed_dim", - } - - def __init__( - self, - embed_dim=768, - image_size=224, - intermediate_size=5120, - patch_size=14, - num_hidden_layers=32, - num_attention_heads=16, - num_channels=3, - hidden_act="gelu", - layer_norm_eps=1e-5, - attention_dropout=0.0, - initializer_range=0.02, - initializer_factor=1.0, - **kwargs, - ): - self.embed_dim = embed_dim - self.image_size = image_size - self.intermediate_size = intermediate_size - self.patch_size = patch_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_channels = num_channels - self.layer_norm_eps = layer_norm_eps - self.attention_dropout = attention_dropout - self.initializer_range = initializer_range - self.initializer_factor = initializer_factor - self.hidden_act = hidden_act - - super().__init__(**kwargs) + attribute_map = {"hidden_size": "embed_dim"} + + embed_dim: int = 768 + image_size: int | list[int] | tuple[int, int] = 224 + intermediate_size: int = 5120 + patch_size: int | list[int] | tuple[int, int] = 14 + num_hidden_layers: int = 32 + num_attention_heads: int = 16 + num_channels: int = 3 + hidden_act: str = "gelu" + layer_norm_eps: float = 1e-5 + attention_dropout: float | int = 0.0 + initializer_range: float = 0.02 + initializer_factor: float = 1.0 @auto_docstring(checkpoint="HuggingFaceM4/idefics-9b") +@strict(accept_kwargs=True) class IdeficsPerceiverConfig(PreTrainedConfig): r""" use_resampler (`bool`, *optional*, defaults to `False`): @@ -83,27 +64,16 @@ class IdeficsPerceiverConfig(PreTrainedConfig): model_type = "idefics_perciever" - def __init__( - self, - use_resampler=False, - resampler_n_latents=64, - resampler_depth=6, - resampler_n_heads=16, - resampler_head_dim=96, - qk_layer_norms_perceiver=False, - **kwargs, - ): - self.use_resampler = use_resampler - self.resampler_n_latents = resampler_n_latents - self.resampler_depth = resampler_depth - self.resampler_n_heads = resampler_n_heads - self.resampler_head_dim = resampler_head_dim - self.qk_layer_norms_perceiver = qk_layer_norms_perceiver - - super().__init__(**kwargs) + use_resampler: bool = False + resampler_n_latents: int = 64 + resampler_depth: int = 6 + resampler_n_heads: int = 16 + resampler_head_dim: int = 96 + qk_layer_norms_perceiver: bool = False @auto_docstring(checkpoint="HuggingFaceM4/idefics-9b") +@strict(accept_kwargs=True) class IdeficsConfig(PreTrainedConfig): r""" alpha_initializer (`str`, *optional*, defaults to `"zeros"`): @@ -147,89 +117,47 @@ class IdeficsConfig(PreTrainedConfig): model_type = "idefics" sub_configs = {"perceiver_config": IdeficsPerceiverConfig, "vision_config": IdeficsVisionConfig} - def __init__( - self, - vocab_size=32000, - additional_vocab_size=0, - hidden_size=4096, - intermediate_size=11008, - num_hidden_layers=32, - num_attention_heads=32, - dropout=0.0, - hidden_act="silu", - initializer_range=0.02, - alpha_initializer="zeros", - alphas_initializer_range=0.0, - alpha_type="float", - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=0, - bos_token_id=1, - 
eos_token_id=2, - tie_word_embeddings=False, - cross_layer_interval=1, - qk_layer_norms=False, - freeze_text_layers=True, - freeze_text_module_exceptions=[], - freeze_lm_head=False, - freeze_vision_layers=True, - freeze_vision_module_exceptions=[], - use_resampler=False, - vision_config=None, - perceiver_config=None, - **kwargs, - ): - self.vocab_size = vocab_size - self.additional_vocab_size = additional_vocab_size - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.dropout = dropout - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.alpha_initializer = alpha_initializer - self.alphas_initializer_range = alphas_initializer_range - self.alpha_type = alpha_type - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - - self.cross_layer_interval = cross_layer_interval - self.qk_layer_norms = qk_layer_norms - self.freeze_vision_layers = freeze_vision_layers - - self.freeze_text_layers = freeze_text_layers - self.freeze_text_module_exceptions = freeze_text_module_exceptions - self.freeze_vision_module_exceptions = freeze_vision_module_exceptions - self.freeze_lm_head = freeze_lm_head - - self.use_resampler = use_resampler - - if perceiver_config is None: + vocab_size: int = 32000 + additional_vocab_size: int = 0 + hidden_size: int = 4096 + intermediate_size: int = 11008 + num_hidden_layers: int = 32 + num_attention_heads: int = 32 + dropout: float | int = 0.0 + hidden_act: str = "silu" + initializer_range: float = 0.02 + alpha_initializer: str = "zeros" + alphas_initializer_range: float = 0.0 + alpha_type: str = "float" + rms_norm_eps: float = 1e-6 + use_cache: bool = True + pad_token_id: int | None = 0 + bos_token_id: int | None = 1 + eos_token_id: int | None = 2 + tie_word_embeddings: bool = False + cross_layer_interval: int = 1 + qk_layer_norms: bool = False + freeze_text_layers: bool = True + freeze_text_module_exceptions: list | tuple = () + freeze_lm_head: bool = False + freeze_vision_layers: bool = True + freeze_vision_module_exceptions: list | tuple = () + use_resampler: bool = False + vision_config: dict | PreTrainedConfig | None = None + perceiver_config: dict | PreTrainedConfig | None = None + + def __post_init__(self, **kwargs): + if self.perceiver_config is None: self.perceiver_config = IdeficsPerceiverConfig() - elif isinstance(perceiver_config, dict): - self.perceiver_config = IdeficsPerceiverConfig(**perceiver_config) - elif isinstance(perceiver_config, IdeficsPerceiverConfig): - self.perceiver_config = perceiver_config + elif isinstance(self.perceiver_config, dict): + self.perceiver_config = IdeficsPerceiverConfig(**self.perceiver_config) - if vision_config is None: + if self.vision_config is None: self.vision_config = IdeficsVisionConfig() - elif isinstance(vision_config, dict): - self.vision_config = IdeficsVisionConfig(**vision_config) - elif isinstance(vision_config, IdeficsVisionConfig): - self.vision_config = vision_config - - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - super().__init__(**kwargs) - - # IMPORTANT: Do not do any __init__ args-based checks in the constructor, since - # PreTrainedConfig.from_dict first instantiates the class with the config dict and only then - # updates the config object with `kwargs` from from_pretrained, so during the instantiation - # of this object many attributes 
have default values and haven't yet been overridden. - # Do any required checks inside `from_pretrained` once the superclass' `from_pretrained` was run. + elif isinstance(self.vision_config, dict): + self.vision_config = IdeficsVisionConfig(**self.vision_config) + + super().__post_init__(**kwargs) __all__ = ["IdeficsConfig"] diff --git a/src/transformers/models/idefics2/configuration_idefics2.py b/src/transformers/models/idefics2/configuration_idefics2.py index 966dd618dc6c..54e9b1815fca 100644 --- a/src/transformers/models/idefics2/configuration_idefics2.py +++ b/src/transformers/models/idefics2/configuration_idefics2.py @@ -12,6 +12,8 @@ # limitations under the License. """Idefics2 model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring, logging from ..auto import CONFIG_MAPPING, AutoConfig @@ -21,6 +23,7 @@ @auto_docstring(checkpoint="HuggingFaceM4/idefics2-8b") +@strict(accept_kwargs=True) class Idefics2VisionConfig(PreTrainedConfig): r""" Example: @@ -42,37 +45,21 @@ class Idefics2VisionConfig(PreTrainedConfig): model_type = "idefics2_vision" base_config_key = "vision_config" - def __init__( - self, - hidden_size=768, - intermediate_size=3072, - num_hidden_layers=12, - num_attention_heads=12, - num_channels=3, - image_size=224, - patch_size=32, - hidden_act="gelu_pytorch_tanh", - layer_norm_eps=1e-6, - attention_dropout=0.0, - initializer_range=0.02, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_channels = num_channels - self.patch_size = patch_size - self.image_size = image_size - self.attention_dropout = attention_dropout - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act - self.initializer_range = initializer_range + hidden_size: int = 768 + intermediate_size: int = 3072 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + num_channels: int = 3 + image_size: int | list[int] | tuple[int, int] = 224 + patch_size: int | list[int] | tuple[int, int] = 32 + hidden_act: str = "gelu_pytorch_tanh" + layer_norm_eps: float = 1e-6 + attention_dropout: float | int = 0.0 + initializer_range: float = 0.02 @auto_docstring(checkpoint="HuggingFaceM4/idefics2-8b") +@strict(accept_kwargs=True) class Idefics2PerceiverConfig(PreTrainedConfig): r""" resampler_n_latents (`int`, *optional*, defaults to 64): @@ -87,39 +74,28 @@ class Idefics2PerceiverConfig(PreTrainedConfig): model_type = "idefics2_perceiver" - def __init__( - self, - hidden_act="silu", - hidden_size=4096, - rms_norm_eps=1e-06, - resampler_n_latents=64, - resampler_depth=3, - resampler_n_heads=16, - resampler_head_dim=96, - num_key_value_heads=4, - attention_dropout=0.0, - initializer_range=0.02, - **kwargs, - ): - self.hidden_act = hidden_act - self.hidden_size = hidden_size - self.rms_norm_eps = rms_norm_eps - self.resampler_n_latents = resampler_n_latents - self.resampler_depth = resampler_depth - self.resampler_n_heads = resampler_n_heads - self.num_key_value_heads = num_key_value_heads - self.resampler_head_dim = resampler_head_dim - self.attention_dropout = attention_dropout - self.initializer_range = initializer_range + hidden_act: str = "silu" + hidden_size: int = 4096 + rms_norm_eps: float = 1e-06 + resampler_n_latents: int = 64 + resampler_depth: int = 3 + resampler_n_heads: int = 16 + resampler_head_dim: int = 96 + 
num_key_value_heads: int = 4 + attention_dropout: float | int = 0.0 + initializer_range: float = 0.02 + + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" if self.num_key_value_heads > self.resampler_n_heads: raise ValueError( f"num_key_value_heads={self.num_key_value_heads} must be less than or equal to" f" resampler_n_heads={self.resampler_n_heads}" ) - super().__init__(**kwargs) @auto_docstring(checkpoint="HuggingFaceM4/idefics2-8b") +@strict(accept_kwargs=True) class Idefics2Config(PreTrainedConfig): r""" perceiver_config (`IdeficsPerceiverConfig` or `dict`, *optional*): @@ -143,49 +119,38 @@ class Idefics2Config(PreTrainedConfig): "vision_config": Idefics2VisionConfig, } - def __init__( - self, - use_cache=True, - image_token_id=32_001, - tie_word_embeddings=False, - vision_config=None, - perceiver_config=None, - text_config=None, - **kwargs, - ): - self.image_token_id = image_token_id - self.use_cache = use_cache - self.tie_word_embeddings = tie_word_embeddings - - if perceiver_config is None: + use_cache: bool = True + image_token_id: int = 32_001 + tie_word_embeddings: bool = False + vision_config: dict | PreTrainedConfig | None = None + perceiver_config: dict | PreTrainedConfig | None = None + text_config: dict | PreTrainedConfig | None = None + + def __post_init__(self, **kwargs): + if self.perceiver_config is None: self.perceiver_config = Idefics2PerceiverConfig() logger.info("perciver_config is None, using default perceiver config") - elif isinstance(perceiver_config, dict): - self.perceiver_config = Idefics2PerceiverConfig(**perceiver_config) - elif isinstance(perceiver_config, Idefics2PerceiverConfig): - self.perceiver_config = perceiver_config + elif isinstance(self.perceiver_config, dict): + self.perceiver_config = Idefics2PerceiverConfig(**self.perceiver_config) - if vision_config is None: + if self.vision_config is None: self.vision_config = Idefics2VisionConfig() logger.info("vision_config is None, using default vision config") - elif isinstance(vision_config, dict): - self.vision_config = Idefics2VisionConfig(**vision_config) - elif isinstance(vision_config, Idefics2VisionConfig): - self.vision_config = vision_config - - if isinstance(text_config, dict): - text_config["model_type"] = text_config.get("model_type", "mistral") - text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) - elif text_config is None: + elif isinstance(self.vision_config, dict): + self.vision_config = Idefics2VisionConfig(**self.vision_config) + + if isinstance(self.text_config, dict): + self.text_config["model_type"] = self.text_config.get("model_type", "mistral") + self.text_config = CONFIG_MAPPING[self.text_config["model_type"]](**self.text_config) + elif self.text_config is None: logger.info("text_config is None, using default text config") - text_config = CONFIG_MAPPING["mistral"]( + self.text_config = CONFIG_MAPPING["mistral"]( max_position_embeddings=4096 * 8, rms_norm_eps=1e-5, # None in the original configuration_mistral, we set it to the unk_token_id pad_token_id=0, ) - self.text_config = text_config if self.text_config.hidden_size != self.perceiver_config.hidden_size: self.perceiver_config.hidden_size = self.text_config.hidden_size self.perceiver_config.rms_norm_eps = self.text_config.rms_norm_eps @@ -194,7 +159,7 @@ def __init__( "In your model's config on the hub, add `hidden_size` and `rms_norm_eps` keys under the `perceiver_config` dict. 
" ) - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["Idefics2Config"] diff --git a/src/transformers/models/idefics3/configuration_idefics3.py b/src/transformers/models/idefics3/configuration_idefics3.py index 1a0c09d66977..db76a3158233 100644 --- a/src/transformers/models/idefics3/configuration_idefics3.py +++ b/src/transformers/models/idefics3/configuration_idefics3.py @@ -12,6 +12,8 @@ # limitations under the License. """Idefics3 model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring, logging from ..auto import CONFIG_MAPPING, AutoConfig @@ -21,6 +23,7 @@ @auto_docstring(checkpoint="HuggingFaceM4/Idefics3-8B-Llama3") +@strict(accept_kwargs=True) class Idefics3VisionConfig(PreTrainedConfig): r""" Example: @@ -42,37 +45,21 @@ class Idefics3VisionConfig(PreTrainedConfig): model_type = "idefics3_vision" base_config_key = "vision_config" - def __init__( - self, - hidden_size=1152, - intermediate_size=3072, - num_hidden_layers=12, - num_attention_heads=16, - num_channels=3, - image_size=224, - patch_size=32, - hidden_act="gelu_pytorch_tanh", - layer_norm_eps=1e-6, - attention_dropout=0.0, - initializer_range=0.02, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_channels = num_channels - self.patch_size = patch_size - self.image_size = image_size - self.attention_dropout = attention_dropout - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act - self.initializer_range = initializer_range + hidden_size: int = 1152 + intermediate_size: int = 3072 + num_hidden_layers: int = 12 + num_attention_heads: int = 16 + num_channels: int = 3 + image_size: int | list[int] | tuple[int, int] = 224 + patch_size: int | list[int] | tuple[int, int] = 32 + hidden_act: str = "gelu_pytorch_tanh" + layer_norm_eps: float = 1e-6 + attention_dropout: float | int = 0.0 + initializer_range: float = 0.02 @auto_docstring(checkpoint="HuggingFaceM4/Idefics3-8B-Llama3") +@strict(accept_kwargs=True) class Idefics3Config(PreTrainedConfig): r""" scale_factor (`int`, *optional*, defaults to 2): @@ -92,42 +79,32 @@ class Idefics3Config(PreTrainedConfig): model_type = "idefics3" sub_configs = {"text_config": AutoConfig, "vision_config": Idefics3VisionConfig} - def __init__( - self, - use_cache=True, - image_token_id=128257, - tie_word_embeddings=False, - vision_config=None, - text_config=None, - scale_factor=2, - pad_token_id=128_002, - **kwargs, - ): - self.image_token_id = image_token_id - self.use_cache = use_cache - self.tie_word_embeddings = tie_word_embeddings - - if vision_config is None: + use_cache: bool = True + image_token_id: int = 128257 + tie_word_embeddings: bool = False + vision_config: dict | PreTrainedConfig | None = None + text_config: dict | PreTrainedConfig | None = None + scale_factor: int = 2 + pad_token_id: int | None = 128_002 + + def __post_init__(self, **kwargs): + if self.vision_config is None: self.vision_config = Idefics3VisionConfig() logger.info("vision_config is None, using default vision config") - elif isinstance(vision_config, dict): - self.vision_config = Idefics3VisionConfig(**vision_config) - elif isinstance(vision_config, Idefics3VisionConfig): - self.vision_config = vision_config - - if isinstance(text_config, dict): - text_config["model_type"] = 
text_config.get("model_type", "llama") - text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) - elif text_config is None: - logger.info("text_config is None, using default text config") - text_config = CONFIG_MAPPING["llama"]( + elif isinstance(self.vision_config, dict): + self.vision_config = Idefics3VisionConfig(**self.vision_config) + + if isinstance(self.text_config, dict): + self.text_config["model_type"] = self.text_config.get("model_type", "llama") + self.text_config = CONFIG_MAPPING[self.text_config["model_type"]](**self.text_config) + elif self.text_config is None: + logger.info("text_config is None, using default Llama text config") + self.text_config = CONFIG_MAPPING["llama"]( rms_norm_eps=1e-5, - pad_token_id=pad_token_id, + pad_token_id=self.pad_token_id, ) - self.text_config = text_config - self.scale_factor = scale_factor - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["Idefics3Config", "Idefics3VisionConfig"] diff --git a/src/transformers/models/ijepa/configuration_ijepa.py b/src/transformers/models/ijepa/configuration_ijepa.py index 82646468f031..cd54a1579dfa 100644 --- a/src/transformers/models/ijepa/configuration_ijepa.py +++ b/src/transformers/models/ijepa/configuration_ijepa.py @@ -13,11 +13,14 @@ # limitations under the License. """I-JEPA model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring @auto_docstring(checkpoint="facebook/ijepa_vith14_1k") +@strict(accept_kwargs=True) class IJepaConfig(PreTrainedConfig): r""" pooler_output_size (`int`, *optional*): @@ -42,42 +45,25 @@ class IJepaConfig(PreTrainedConfig): model_type = "ijepa" - def __init__( - self, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - initializer_range=0.02, - layer_norm_eps=1e-12, - image_size=224, - patch_size=16, - num_channels=3, - qkv_bias=True, - pooler_output_size=None, - pooler_act="tanh", - **kwargs, - ): - super().__init__(**kwargs) + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.0 + attention_probs_dropout_prob: float = 0.0 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + image_size: int | list[int] | tuple[int, int] = 224 + patch_size: int | list[int] | tuple[int, int] = 16 + num_channels: int = 3 + qkv_bias: bool = True + pooler_output_size: int | None = None + pooler_act: str = "tanh" - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.qkv_bias = qkv_bias - self.pooler_output_size = pooler_output_size if pooler_output_size else hidden_size - self.pooler_act = pooler_act + def __post_init__(self, **kwargs): + self.pooler_output_size = self.pooler_output_size if self.pooler_output_size else self.hidden_size + super().__post_init__(**kwargs) __all__ = ["IJepaConfig"] diff --git 
a/src/transformers/models/imagegpt/configuration_imagegpt.py b/src/transformers/models/imagegpt/configuration_imagegpt.py index 7726b08a6a0e..a7dc8e7a04c3 100644 --- a/src/transformers/models/imagegpt/configuration_imagegpt.py +++ b/src/transformers/models/imagegpt/configuration_imagegpt.py @@ -13,14 +13,14 @@ # limitations under the License. """OpenAI ImageGPT configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="openai/imagegpt-small") +@strict(accept_kwargs=True) class ImageGPTConfig(PreTrainedConfig): r""" scale_attn_by_inverse_layer_idx (`bool`, *optional*, defaults to `False`): @@ -53,54 +53,27 @@ class ImageGPTConfig(PreTrainedConfig): "num_hidden_layers": "n_layer", } - def __init__( - self, - vocab_size=512 + 1, # add one for start of sentence (sos) token - n_positions=32 * 32, - n_embd=512, - n_layer=24, - n_head=8, - n_inner=None, - activation_function="quick_gelu", - resid_pdrop=0.1, - embd_pdrop=0.1, - attn_pdrop=0.1, - layer_norm_epsilon=1e-5, - initializer_range=0.02, - scale_attn_weights=True, - use_cache=True, - tie_word_embeddings=False, - scale_attn_by_inverse_layer_idx=False, - reorder_and_upcast_attn=False, - add_cross_attention=False, - pad_token_id=None, - bos_token_id=None, - eos_token_id=None, - **kwargs, - ): - self.add_cross_attention = add_cross_attention - self.vocab_size = vocab_size - self.n_positions = n_positions - self.n_embd = n_embd - self.n_layer = n_layer - self.n_head = n_head - self.n_inner = n_inner - self.activation_function = activation_function - self.resid_pdrop = resid_pdrop - self.embd_pdrop = embd_pdrop - self.attn_pdrop = attn_pdrop - self.layer_norm_epsilon = layer_norm_epsilon - self.initializer_range = initializer_range - self.scale_attn_weights = scale_attn_weights - self.use_cache = use_cache - self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx - self.reorder_and_upcast_attn = reorder_and_upcast_attn - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - - super().__init__(**kwargs) + vocab_size: int = 512 + 1 # add one for start of sentence (sos) token + n_positions: int = 32 * 32 + n_embd: int = 512 + n_layer: int = 24 + n_head: int = 8 + n_inner: int | None = None + activation_function: str = "quick_gelu" + resid_pdrop: float = 0.1 + embd_pdrop: float = 0.1 + attn_pdrop: float = 0.1 + layer_norm_epsilon: float = 1e-5 + initializer_range: float = 0.02 + scale_attn_weights: bool = True + use_cache: bool = True + tie_word_embeddings: bool = False + scale_attn_by_inverse_layer_idx: bool = False + reorder_and_upcast_attn: bool = False + add_cross_attention: bool = False + pad_token_id: int | None = None + bos_token_id: int | None = None + eos_token_id: int | list[int] | None = None __all__ = ["ImageGPTConfig"] diff --git a/src/transformers/models/imagegpt/modeling_imagegpt.py b/src/transformers/models/imagegpt/modeling_imagegpt.py index 373d5554f8d4..47c881ced8f0 100755 --- a/src/transformers/models/imagegpt/modeling_imagegpt.py +++ b/src/transformers/models/imagegpt/modeling_imagegpt.py @@ -470,7 +470,7 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) use_cache = use_cache if use_cache is not 
None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -684,7 +684,7 @@ def forward( ... ax.imshow(img) ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict transformer_outputs = self.transformer( input_ids, @@ -795,7 +795,7 @@ def forward( >>> logits = outputs.logits ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict transformer_outputs = self.transformer( input_ids, diff --git a/src/transformers/models/informer/configuration_informer.py b/src/transformers/models/informer/configuration_informer.py index 555ea8ad2b06..584338e5d11e 100644 --- a/src/transformers/models/informer/configuration_informer.py +++ b/src/transformers/models/informer/configuration_informer.py @@ -13,14 +13,14 @@ # limitations under the License. """Informer model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="huggingface/informer-tourism-monthly") +@strict(accept_kwargs=True) class InformerConfig(PreTrainedConfig): r""" prediction_length (`int`): @@ -95,106 +95,72 @@ class InformerConfig(PreTrainedConfig): "initializer_range": "init_std", } - def __init__( - self, - prediction_length: int | None = None, - context_length: int | None = None, - distribution_output: str = "student_t", - loss: str = "nll", - input_size: int = 1, - lags_sequence: list[int] | None = None, - scaling: str | bool | None = "mean", - num_dynamic_real_features: int = 0, - num_static_real_features: int = 0, - num_static_categorical_features: int = 0, - num_time_features: int = 0, - cardinality: list[int] | None = None, - embedding_dimension: list[int] | None = None, - d_model: int = 64, - encoder_ffn_dim: int = 32, - decoder_ffn_dim: int = 32, - encoder_attention_heads: int = 2, - decoder_attention_heads: int = 2, - encoder_layers: int = 2, - decoder_layers: int = 2, - is_encoder_decoder: bool = True, - activation_function: str = "gelu", - dropout: float = 0.05, - encoder_layerdrop: float = 0.1, - decoder_layerdrop: float = 0.1, - attention_dropout: float = 0.1, - activation_dropout: float = 0.1, - num_parallel_samples: int = 100, - init_std: float = 0.02, - use_cache=True, - # Informer arguments - attention_type: str = "prob", - sampling_factor: int = 5, - distil: bool = True, - **kwargs, - ): - # time series specific configuration - self.prediction_length = prediction_length - self.context_length = context_length or prediction_length - self.distribution_output = distribution_output - self.loss = loss - self.input_size = input_size - self.num_time_features = num_time_features - self.lags_sequence = lags_sequence if lags_sequence is not None else [1, 2, 3, 4, 5, 6, 7] - self.scaling = scaling - self.num_dynamic_real_features = num_dynamic_real_features - self.num_static_real_features = num_static_real_features - 
self.num_static_categorical_features = num_static_categorical_features - - # set cardinality - if cardinality and num_static_categorical_features > 0: - if len(cardinality) != num_static_categorical_features: - raise ValueError( - "The cardinality should be a list of the same length as `num_static_categorical_features`" - ) - self.cardinality = cardinality - else: + prediction_length: int | None = None + context_length: int | None = None + distribution_output: str = "student_t" + loss: str = "nll" + input_size: int = 1 + lags_sequence: list[int] | None = None + scaling: str | bool | None = "mean" + num_dynamic_real_features: int = 0 + num_static_real_features: int = 0 + num_static_categorical_features: int = 0 + num_time_features: int = 0 + cardinality: list[int] | None = None + embedding_dimension: list[int] | None = None + d_model: int = 64 + encoder_ffn_dim: int = 32 + decoder_ffn_dim: int = 32 + encoder_attention_heads: int = 2 + decoder_attention_heads: int = 2 + encoder_layers: int = 2 + decoder_layers: int = 2 + is_encoder_decoder: bool = True + activation_function: str = "gelu" + dropout: float | int = 0.05 + encoder_layerdrop: float | int = 0.1 + decoder_layerdrop: float | int = 0.1 + attention_dropout: float | int = 0.1 + activation_dropout: float | int = 0.1 + num_parallel_samples: int = 100 + init_std: float = 0.02 + use_cache: bool = True + attention_type: str = "prob" + sampling_factor: int = 5 + distil: bool = True + + def __post_init__(self, **kwargs): + self.context_length = self.context_length or self.prediction_length + self.lags_sequence = self.lags_sequence if self.lags_sequence is not None else [1, 2, 3, 4, 5, 6, 7] + + if not (self.cardinality and self.num_static_categorical_features > 0): self.cardinality = [0] - # set embedding_dimension - if embedding_dimension and num_static_categorical_features > 0: - if len(embedding_dimension) != num_static_categorical_features: - raise ValueError( - "The embedding dimension should be a list of the same length as `num_static_categorical_features`" - ) - self.embedding_dimension = embedding_dimension - else: + if not (self.embedding_dimension and self.num_static_categorical_features > 0): self.embedding_dimension = [min(50, (cat + 1) // 2) for cat in self.cardinality] - self.num_parallel_samples = num_parallel_samples - - # Transformer architecture configuration - self.feature_size = input_size * len(self.lags_sequence) + self._number_of_features - self.d_model = d_model - self.encoder_attention_heads = encoder_attention_heads - self.decoder_attention_heads = decoder_attention_heads - self.encoder_ffn_dim = encoder_ffn_dim - self.decoder_ffn_dim = decoder_ffn_dim - self.encoder_layers = encoder_layers - self.decoder_layers = decoder_layers - - self.dropout = dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.encoder_layerdrop = encoder_layerdrop - self.decoder_layerdrop = decoder_layerdrop - - self.activation_function = activation_function - self.init_std = init_std - - self.use_cache = use_cache - - # Informer - self.attention_type = attention_type - self.sampling_factor = sampling_factor - self.distil = distil - - super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + self.feature_size = self.input_size * len(self.lags_sequence) + self._number_of_features + super().__post_init__(**kwargs) + + def validate_architecture(self): + """Part of `@strict`-powered validation. 
Validates the architecture of the config.""" + if ( + self.cardinality + and self.num_static_categorical_features > 0 + and len(self.cardinality) != self.num_static_categorical_features + ): + raise ValueError( + "The cardinality should be a list of the same length as `num_static_categorical_features`" + ) + + if ( + self.embedding_dimension + and self.num_static_categorical_features > 0 + and len(self.embedding_dimension) != self.num_static_categorical_features + ): + raise ValueError( + "The embedding dimension should be a list of the same length as `num_static_categorical_features`" + ) @property def _number_of_features(self) -> int: diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py index 40e0407c4a64..ffb8beee17c3 100644 --- a/src/transformers/models/informer/modeling_informer.py +++ b/src/transformers/models/informer/modeling_informer.py @@ -412,7 +412,7 @@ def __init__( self, embed_dim: int, num_heads: int, - dropout: float = 0.0, + dropout: float | int = 0.0, is_decoder: bool = False, sampling_factor: int = 5, bias: bool = True, diff --git a/src/transformers/models/informer/modular_informer.py b/src/transformers/models/informer/modular_informer.py index 171462ec2a75..6a9d92c035b0 100644 --- a/src/transformers/models/informer/modular_informer.py +++ b/src/transformers/models/informer/modular_informer.py @@ -107,7 +107,7 @@ def __init__( self, embed_dim: int, num_heads: int, - dropout: float = 0.0, + dropout: float | int = 0.0, is_decoder: bool = False, sampling_factor: int = 5, bias: bool = True, diff --git a/src/transformers/models/instructblip/configuration_instructblip.py b/src/transformers/models/instructblip/configuration_instructblip.py index e3e79b43df80..24de60bf462f 100644 --- a/src/transformers/models/instructblip/configuration_instructblip.py +++ b/src/transformers/models/instructblip/configuration_instructblip.py @@ -13,6 +13,8 @@ # limitations under the License. 
"""InstructBLIP model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES from ...utils import auto_docstring, logging @@ -23,6 +25,7 @@ @auto_docstring(checkpoint="Salesforce/instructblip-flan-t5-xl") +@strict(accept_kwargs=True) class InstructBlipVisionConfig(PreTrainedConfig): r""" Example: @@ -43,37 +46,21 @@ class InstructBlipVisionConfig(PreTrainedConfig): model_type = "instructblip_vision_model" base_config_key = "vision_config" - def __init__( - self, - hidden_size=1408, - intermediate_size=6144, - num_hidden_layers=39, - num_attention_heads=16, - image_size=224, - patch_size=14, - hidden_act="gelu", - layer_norm_eps=1e-6, - attention_dropout=0.0, - initializer_range=1e-10, - qkv_bias=True, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.patch_size = patch_size - self.image_size = image_size - self.initializer_range = initializer_range - self.attention_dropout = attention_dropout - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act - self.qkv_bias = qkv_bias + hidden_size: int = 1408 + intermediate_size: int = 6144 + num_hidden_layers: int = 39 + num_attention_heads: int = 16 + image_size: int | list[int] | tuple[int, int] = 224 + patch_size: int | list[int] | tuple[int, int] = 14 + hidden_act: str = "gelu" + layer_norm_eps: float = 1e-6 + attention_dropout: float | int = 0.0 + initializer_range: float = 1e-10 + qkv_bias: bool = True @auto_docstring(checkpoint="Salesforce/instructblip-flan-t5-xl") +@strict(accept_kwargs=True) class InstructBlipQFormerConfig(PreTrainedConfig): r""" cross_attention_frequency (`int`, *optional*, defaults to 2): @@ -98,43 +85,24 @@ class InstructBlipQFormerConfig(PreTrainedConfig): model_type = "instructblip_qformer" base_config_key = "qformer_config" - def __init__( - self, - vocab_size=30522, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - initializer_range=0.02, - layer_norm_eps=1e-12, - pad_token_id=0, - cross_attention_frequency=2, - encoder_hidden_size=1408, - **kwargs, - ): - super().__init__(**kwargs) - self.pad_token_id = pad_token_id - - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.cross_attention_frequency = cross_attention_frequency - self.encoder_hidden_size = encoder_hidden_size + vocab_size: int = 30522 + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.1 + attention_probs_dropout_prob: float = 0.1 + max_position_embeddings: int = 512 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + pad_token_id: int | None = 0 + cross_attention_frequency: int = 2 + 
encoder_hidden_size: int = 1408 @auto_docstring(checkpoint="Salesforce/instructblip-flan-t5-xl") +@strict(accept_kwargs=True) class InstructBlipConfig(PreTrainedConfig): r""" qformer_config (`dict`, *optional*): @@ -182,45 +150,37 @@ class InstructBlipConfig(PreTrainedConfig): "vision_config": InstructBlipVisionConfig, } - def __init__( - self, - vision_config=None, - qformer_config=None, - text_config=None, - num_query_tokens=32, - image_token_index=None, - **kwargs, - ): - if text_config is None: - text_config = CONFIG_MAPPING["opt"]() + vision_config: dict | PreTrainedConfig | None = None + qformer_config: dict | PreTrainedConfig | None = None + text_config: dict | PreTrainedConfig | None = None + num_query_tokens: int = 32 + image_token_index: int | None = None + initializer_factor: float = 1.0 + initializer_range: float = 0.02 + + def __post_init__(self, **kwargs): + if self.text_config is None: + self.text_config = CONFIG_MAPPING["opt"]() logger.info("text_config is None. Initializing the text config with default values (`OPTConfig`).") - elif isinstance(text_config, dict): - text_model_type = text_config.get("model_type", "opt") - text_config = CONFIG_MAPPING[text_model_type](**text_config) + elif isinstance(self.text_config, dict): + text_model_type = self.text_config.get("model_type", "opt") + self.text_config = CONFIG_MAPPING[text_model_type](**self.text_config) - if qformer_config is None: - qformer_config = InstructBlipQFormerConfig() + if self.qformer_config is None: + self.qformer_config = InstructBlipQFormerConfig() logger.info("qformer_config is None. Initializing the InstructBlipQFormerConfig with default values.") - elif isinstance(qformer_config, dict): - qformer_config = InstructBlipQFormerConfig(**qformer_config) + elif isinstance(self.qformer_config, dict): + self.qformer_config = InstructBlipQFormerConfig(**self.qformer_config) - if vision_config is None: - vision_config = InstructBlipVisionConfig() + if self.vision_config is None: + self.vision_config = InstructBlipVisionConfig() logger.info("`vision_config` is `None`. initializing the `InstructBlipVisionConfig` with default values.") - elif isinstance(vision_config, dict): - vision_config = InstructBlipVisionConfig(**vision_config) - - self.text_config = text_config - self.vision_config = vision_config - self.qformer_config = qformer_config + elif isinstance(self.vision_config, dict): + self.vision_config = InstructBlipVisionConfig(**self.vision_config) - self.num_query_tokens = num_query_tokens - self.image_token_index = image_token_index self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size self.use_decoder_only_language_model = self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES - self.initializer_factor = 1.0 - self.initializer_range = 0.02 - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["InstructBlipConfig", "InstructBlipQFormerConfig", "InstructBlipVisionConfig"] diff --git a/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py b/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py index 1e8ec6756900..bac056d6aff1 100644 --- a/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py @@ -19,6 +19,8 @@ # limitations under the License. 
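For composite configs such as `InstructBlipConfig`, the sub-config handling that used to run in `__init__` now runs in `__post_init__`, so passing nothing, a plain dict, or an already-built sub-config should keep working. A usage sketch, assuming the converted classes behave as the hunks above describe:

```python
from transformers import InstructBlipConfig, InstructBlipVisionConfig

# No sub-configs given: defaults are built in `__post_init__`.
config = InstructBlipConfig()
assert isinstance(config.vision_config, InstructBlipVisionConfig)

# A plain dict is coerced into the matching sub-config class.
config = InstructBlipConfig(vision_config={"hidden_size": 512})
assert config.vision_config.hidden_size == 512

# The Q-Former encoder width is still tied to the vision tower in `__post_init__`.
assert config.qformer_config.encoder_hidden_size == config.vision_config.hidden_size
```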
+from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES from ...utils import auto_docstring, logging @@ -29,6 +31,7 @@ @auto_docstring(checkpoint="Salesforce/instructblip-flan-t5-xl") +@strict(accept_kwargs=True) class InstructBlipVideoVisionConfig(PreTrainedConfig): r""" Example: @@ -49,37 +52,21 @@ class InstructBlipVideoVisionConfig(PreTrainedConfig): model_type = "instructblipvideo_vision_model" base_config_key = "vision_config" - def __init__( - self, - hidden_size=1408, - intermediate_size=6144, - num_hidden_layers=39, - num_attention_heads=16, - image_size=224, - patch_size=14, - hidden_act="gelu", - layer_norm_eps=1e-6, - attention_dropout=0.0, - initializer_range=1e-10, - qkv_bias=True, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.patch_size = patch_size - self.image_size = image_size - self.initializer_range = initializer_range - self.attention_dropout = attention_dropout - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act - self.qkv_bias = qkv_bias + hidden_size: int = 1408 + intermediate_size: int = 6144 + num_hidden_layers: int = 39 + num_attention_heads: int = 16 + image_size: int | list[int] | tuple[int, int] = 224 + patch_size: int | list[int] | tuple[int, int] = 14 + hidden_act: str = "gelu" + layer_norm_eps: float = 1e-6 + attention_dropout: float | int = 0.0 + initializer_range: float = 1e-10 + qkv_bias: bool = True @auto_docstring(checkpoint="Salesforce/instructblip-flan-t5-xl") +@strict(accept_kwargs=True) class InstructBlipVideoQFormerConfig(PreTrainedConfig): r""" cross_attention_frequency (`int`, *optional*, defaults to 2): @@ -104,43 +91,24 @@ class InstructBlipVideoQFormerConfig(PreTrainedConfig): model_type = "instructblipvideo_qformer" base_config_key = "qformer_config" - def __init__( - self, - vocab_size=30522, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - initializer_range=0.02, - layer_norm_eps=1e-12, - pad_token_id=0, - cross_attention_frequency=2, - encoder_hidden_size=1408, - **kwargs, - ): - super().__init__(**kwargs) - self.pad_token_id = pad_token_id - - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.cross_attention_frequency = cross_attention_frequency - self.encoder_hidden_size = encoder_hidden_size + vocab_size: int = 30522 + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.1 + attention_probs_dropout_prob: float = 0.1 + max_position_embeddings: int = 512 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + pad_token_id: int | None = 0 + cross_attention_frequency: int = 2 + 
encoder_hidden_size: int = 1408 @auto_docstring(checkpoint="Salesforce/instructblip-flan-t5-xl") +@strict(accept_kwargs=True) class InstructBlipVideoConfig(PreTrainedConfig): r""" qformer_config (`dict`, *optional*): @@ -179,56 +147,47 @@ class InstructBlipVideoConfig(PreTrainedConfig): ```""" model_type = "instructblipvideo" - attribute_map = { - "video_token_id": "video_token_index", - } + + attribute_map = {"video_token_id": "video_token_index"} sub_configs = { "text_config": AutoConfig, "qformer_config": InstructBlipVideoQFormerConfig, "vision_config": InstructBlipVideoVisionConfig, } - def __init__( - self, - vision_config=None, - qformer_config=None, - text_config=None, - num_query_tokens=32, - video_token_index=None, - **kwargs, - ): - if text_config is None: - text_config = CONFIG_MAPPING["opt"]() + vision_config: dict | PreTrainedConfig | None = None + qformer_config: dict | PreTrainedConfig | None = None + text_config: dict | PreTrainedConfig | None = None + num_query_tokens: int = 32 + initializer_factor: float = 1.0 + initializer_range: float = 0.02 + video_token_index: int | None = None + + def __post_init__(self, **kwargs): + if self.text_config is None: + self.text_config = CONFIG_MAPPING["opt"]() logger.info("text_config is None. Initializing the text config with default values (`OPTConfig`).") - elif isinstance(text_config, dict): - text_model_type = text_config.get("model_type", "opt") - text_config = CONFIG_MAPPING[text_model_type](**text_config) + elif isinstance(self.text_config, dict): + text_model_type = self.text_config.get("model_type", "opt") + self.text_config = CONFIG_MAPPING[text_model_type](**self.text_config) - if qformer_config is None: - qformer_config = InstructBlipVideoQFormerConfig() + if self.qformer_config is None: + self.qformer_config = InstructBlipVideoQFormerConfig() logger.info("qformer_config is None. Initializing the InstructBlipVideoQFormerConfig with default values.") - elif isinstance(qformer_config, dict): - qformer_config = InstructBlipVideoQFormerConfig(**qformer_config) + elif isinstance(self.qformer_config, dict): + self.qformer_config = InstructBlipVideoQFormerConfig(**self.qformer_config) - if vision_config is None: - vision_config = InstructBlipVideoVisionConfig() + if self.vision_config is None: + self.vision_config = InstructBlipVideoVisionConfig() logger.info( "`vision_config` is `None`. initializing the `InstructBlipVideoVisionConfig` with default values." 
) - elif isinstance(vision_config, dict): - vision_config = InstructBlipVideoVisionConfig(**vision_config) - - self.text_config = text_config - self.vision_config = vision_config - self.qformer_config = qformer_config + elif isinstance(self.vision_config, dict): + self.vision_config = InstructBlipVideoVisionConfig(**self.vision_config) - self.num_query_tokens = num_query_tokens - self.video_token_index = video_token_index self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size self.use_decoder_only_language_model = self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES - self.initializer_factor = 1.0 - self.initializer_range = 0.02 - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["InstructBlipVideoConfig", "InstructBlipVideoQFormerConfig", "InstructBlipVideoVisionConfig"] diff --git a/src/transformers/models/instructblipvideo/modular_instructblipvideo.py b/src/transformers/models/instructblipvideo/modular_instructblipvideo.py index 84e18dbe3545..2938cd3f45eb 100644 --- a/src/transformers/models/instructblipvideo/modular_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/modular_instructblipvideo.py @@ -14,8 +14,10 @@ import torch +from huggingface_hub.dataclasses import strict from transformers.models.instructblip.configuration_instructblip import ( + InstructBlipConfig, InstructBlipQFormerConfig, InstructBlipVisionConfig, ) @@ -30,18 +32,13 @@ TransformersKwargs, ) -from ...configuration_utils import PreTrainedConfig from ...modeling_outputs import BaseModelOutputWithPooling -from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES from ...processing_utils import Unpack -from ...utils import auto_docstring, can_return_tuple, logging -from ..auto import CONFIG_MAPPING, AutoConfig - - -logger = logging.get_logger(__name__) +from ...utils import auto_docstring, can_return_tuple @auto_docstring(checkpoint="Salesforce/instructblip-flan-t5-xl") +@strict(accept_kwargs=True) class InstructBlipVideoVisionConfig(InstructBlipVisionConfig): r""" Example: @@ -61,6 +58,7 @@ class InstructBlipVideoVisionConfig(InstructBlipVisionConfig): @auto_docstring(checkpoint="Salesforce/instructblip-flan-t5-xl") +@strict(accept_kwargs=True) class InstructBlipVideoQFormerConfig(InstructBlipQFormerConfig): r""" cross_attention_frequency (`int`, *optional*, defaults to 2): @@ -84,7 +82,8 @@ class InstructBlipVideoQFormerConfig(InstructBlipQFormerConfig): @auto_docstring(checkpoint="Salesforce/instructblip-flan-t5-xl") -class InstructBlipVideoConfig(PreTrainedConfig): +@strict(accept_kwargs=True) +class InstructBlipVideoConfig(InstructBlipConfig): r""" qformer_config (`dict`, *optional*): Dictionary of configuration options used to initialize [`InstructBlipVideoQFormerConfig`]. @@ -121,57 +120,9 @@ class InstructBlipVideoConfig(PreTrainedConfig): >>> config = InstructBlipVideoConfig(vision_config=vision_config, qformer_config=qformer_config, text_config=text_config) ```""" - model_type = "instructblipvideo" - attribute_map = { - "video_token_id": "video_token_index", - } - sub_configs = { - "text_config": AutoConfig, - "qformer_config": InstructBlipVideoQFormerConfig, - "vision_config": InstructBlipVideoVisionConfig, - } - - def __init__( - self, - vision_config=None, - qformer_config=None, - text_config=None, - num_query_tokens=32, - video_token_index=None, - **kwargs, - ): - if text_config is None: - text_config = CONFIG_MAPPING["opt"]() - logger.info("text_config is None. 
Initializing the text config with default values (`OPTConfig`).") - elif isinstance(text_config, dict): - text_model_type = text_config.get("model_type", "opt") - text_config = CONFIG_MAPPING[text_model_type](**text_config) - - if qformer_config is None: - qformer_config = InstructBlipVideoQFormerConfig() - logger.info("qformer_config is None. Initializing the InstructBlipVideoQFormerConfig with default values.") - elif isinstance(qformer_config, dict): - qformer_config = InstructBlipVideoQFormerConfig(**qformer_config) - - if vision_config is None: - vision_config = InstructBlipVideoVisionConfig() - logger.info( - "`vision_config` is `None`. initializing the `InstructBlipVideoVisionConfig` with default values." - ) - elif isinstance(vision_config, dict): - vision_config = InstructBlipVideoVisionConfig(**vision_config) - - self.text_config = text_config - self.vision_config = vision_config - self.qformer_config = qformer_config - - self.num_query_tokens = num_query_tokens - self.video_token_index = video_token_index - self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size - self.use_decoder_only_language_model = self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES - self.initializer_factor = 1.0 - self.initializer_range = 0.02 - super().__init__(**kwargs) + attribute_map = {"video_token_id": "video_token_index"} + video_token_index: int | None = None + image_token_index = AttributeError() class InstructBlipVideoPreTrainedModel(InstructBlipPreTrainedModel): diff --git a/src/transformers/models/internvl/configuration_internvl.py b/src/transformers/models/internvl/configuration_internvl.py index 3fdc16b22fb5..1dec856206a1 100644 --- a/src/transformers/models/internvl/configuration_internvl.py +++ b/src/transformers/models/internvl/configuration_internvl.py @@ -13,12 +13,15 @@ # limitations under the License. 
+from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring from ..auto import CONFIG_MAPPING, AutoConfig @auto_docstring(checkpoint="OpenGVLab/InternVL3-1B-hf") +@strict(accept_kwargs=True) class InternVLVisionConfig(PreTrainedConfig): r""" projection_dropout (`float`, *optional*, defaults to 0.0): @@ -49,59 +52,39 @@ class InternVLVisionConfig(PreTrainedConfig): model_type = "internvl_vision" base_config_key = "vision_config" - def __init__( - self, - hidden_size=1024, - num_hidden_layers=24, - num_attention_heads=16, - attention_bias=False, - use_qk_norm=False, - intermediate_size=4096, - hidden_act="gelu", - hidden_dropout_prob=0.0, - attention_dropout=0.0, - projection_dropout=0.0, - initializer_range=0.02, - norm_type="layer_norm", - layer_norm_eps=1e-06, - image_size=[448, 448], - patch_size=[14, 14], - num_channels=3, - use_mask_token=False, - use_absolute_position_embeddings=True, - layer_scale_init_value=0.1, - use_mean_pooling=True, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.attention_bias = attention_bias - self.use_qk_norm = use_qk_norm - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_dropout = attention_dropout - self.projection_dropout = projection_dropout - self.initializer_range = initializer_range - self.norm_type = norm_type - self.layer_norm_eps = layer_norm_eps - - image_size = image_size if isinstance(image_size, (list, tuple)) else (image_size, image_size) - patch_size = patch_size if isinstance(patch_size, (list, tuple)) else (patch_size, patch_size) - self.image_size = image_size - self.patch_size = patch_size - - self.num_channels = num_channels - self.use_mask_token = use_mask_token - self.use_absolute_position_embeddings = use_absolute_position_embeddings - self.layer_scale_init_value = layer_scale_init_value - self.use_mean_pooling = use_mean_pooling + hidden_size: int = 1024 + num_hidden_layers: int = 24 + num_attention_heads: int = 16 + attention_bias: bool = False + use_qk_norm: bool = False + intermediate_size: int = 4096 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.0 + attention_dropout: float | int = 0.0 + projection_dropout: float | int = 0.0 + initializer_range: float = 0.02 + norm_type: str = "layer_norm" + layer_norm_eps: float = 1e-06 + image_size: int | list[int] | tuple[int, ...] = (448, 448) + patch_size: int | list[int] | tuple[int, ...] 
= (14, 14) + num_channels: int = 3 + use_mask_token: bool = False + use_absolute_position_embeddings: bool = True + layer_scale_init_value: float = 0.1 + use_mean_pooling: bool = True + + def __post_init__(self, **kwargs): + self.image_size = ( + self.image_size if isinstance(self.image_size, (list, tuple)) else (self.image_size, self.image_size) + ) + self.patch_size = ( + self.patch_size if isinstance(self.patch_size, (list, tuple)) else (self.patch_size, self.patch_size) + ) + super().__post_init__(**kwargs) @auto_docstring(checkpoint="OpenGVLab/InternVL3-1B-hf") +@strict(accept_kwargs=True) class InternVLConfig(PreTrainedConfig): r""" downsample_ratio (`float`, *optional*, defaults to 0.5): @@ -125,43 +108,29 @@ class InternVLConfig(PreTrainedConfig): model_type = "internvl" sub_configs = {"text_config": AutoConfig, "vision_config": InternVLVisionConfig} - def __init__( - self, - vision_config=None, - text_config=None, - image_token_id=151667, - image_seq_length=256, - downsample_ratio=0.5, - projector_hidden_act="gelu", - vision_feature_layer=-1, - vision_feature_select_strategy="default", - tie_word_embeddings=True, - **kwargs, - ): - self.image_token_id = image_token_id - self.image_seq_length = image_seq_length - self.downsample_ratio = downsample_ratio - self.projector_hidden_act = projector_hidden_act - self.vision_feature_layer = vision_feature_layer - self.vision_feature_select_strategy = vision_feature_select_strategy - - if isinstance(vision_config, dict): - self.vision_config = InternVLVisionConfig(**vision_config) - elif isinstance(vision_config, InternVLVisionConfig): - self.vision_config = vision_config - elif vision_config is None: + vision_config: dict | PreTrainedConfig | None = None + text_config: dict | PreTrainedConfig | None = None + image_token_id: int = 151667 + image_seq_length: int = 256 + downsample_ratio: float = 0.5 + projector_hidden_act: str = "gelu" + vision_feature_layer: int | list[int] = -1 + vision_feature_select_strategy: str = "default" + tie_word_embeddings: bool = True + + def __post_init__(self, **kwargs): + if isinstance(self.vision_config, dict): + self.vision_config = InternVLVisionConfig(**self.vision_config) + elif self.vision_config is None: self.vision_config = InternVLVisionConfig() - if isinstance(text_config, dict): - text_config["model_type"] = text_config.get("model_type", "qwen2") - text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) - elif text_config is None: - text_config = CONFIG_MAPPING["qwen2"]() - - self.text_config = text_config - self.tie_word_embeddings = tie_word_embeddings + if isinstance(self.text_config, dict): + self.text_config["model_type"] = self.text_config.get("model_type", "qwen2") + self.text_config = CONFIG_MAPPING[self.text_config["model_type"]](**self.text_config) + elif self.text_config is None: + self.text_config = CONFIG_MAPPING["qwen2"]() - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["InternVLVisionConfig", "InternVLConfig"] diff --git a/src/transformers/models/internvl/modeling_internvl.py b/src/transformers/models/internvl/modeling_internvl.py index 68a49f548793..f360df67cb1c 100644 --- a/src/transformers/models/internvl/modeling_internvl.py +++ b/src/transformers/models/internvl/modeling_internvl.py @@ -70,7 +70,7 @@ def eager_attention_forward( value: torch.Tensor, attention_mask: torch.Tensor | None, scaling: float, - dropout: float = 0.0, + dropout: float | int = 0.0, **kwargs, ): key_states = key @@ -553,7 +553,7 @@ def set_input_embeddings(self, 
value): def get_image_features( self, pixel_values: torch.FloatTensor, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | list[int] | None = None, vision_feature_select_strategy: str | None = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple | BaseModelOutputWithPooling: @@ -630,7 +630,7 @@ def forward( position_ids: torch.LongTensor | None = None, past_key_values: Cache | None = None, inputs_embeds: torch.FloatTensor | None = None, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | list[int] | None = None, vision_feature_select_strategy: str | None = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple | InternVLModelOutputWithPast: @@ -768,7 +768,7 @@ def get_output_embeddings(self) -> nn.Module: def get_image_features( self, pixel_values: torch.FloatTensor, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | list[int] | None = None, vision_feature_select_strategy: str | None = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple | BaseModelOutputWithPooling: @@ -789,7 +789,7 @@ def forward( position_ids: torch.LongTensor | None = None, past_key_values: Cache | None = None, inputs_embeds: torch.FloatTensor | None = None, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | list[int] | None = None, vision_feature_select_strategy: str | None = None, labels: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, diff --git a/src/transformers/models/internvl/modular_internvl.py b/src/transformers/models/internvl/modular_internvl.py index 80deb0381f67..ab95fbfff39e 100644 --- a/src/transformers/models/internvl/modular_internvl.py +++ b/src/transformers/models/internvl/modular_internvl.py @@ -50,7 +50,7 @@ def eager_attention_forward( value: torch.Tensor, attention_mask: torch.Tensor | None, scaling: float, - dropout: float = 0.0, + dropout: float | int = 0.0, **kwargs, ): key_states = key @@ -493,7 +493,7 @@ def pixel_shuffle(self, vision_features: torch.Tensor, scale_factor: float = 0.5 def get_image_features( self, pixel_values: torch.FloatTensor, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | list[int] | None = None, vision_feature_select_strategy: str | None = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple | BaseModelOutputWithPooling: @@ -546,7 +546,7 @@ def forward( position_ids: torch.LongTensor | None = None, past_key_values: Cache | None = None, inputs_embeds: torch.FloatTensor | None = None, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | list[int] | None = None, vision_feature_select_strategy: str | None = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple | InternVLModelOutputWithPast: diff --git a/src/transformers/models/jais2/configuration_jais2.py b/src/transformers/models/jais2/configuration_jais2.py index 3ae40618b6de..3d69078d491d 100644 --- a/src/transformers/models/jais2/configuration_jais2.py +++ b/src/transformers/models/jais2/configuration_jais2.py @@ -19,12 +19,16 @@ # limitations under the License. 
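Cross-field checks that previously raised inside `__init__` now live in a `validate_architecture` hook (see `Idefics2PerceiverConfig` and `InformerConfig` above), which the `@strict`-powered validation is expected to run once the fields are populated. A sketch of the resulting behaviour, assuming a transformers build containing these converted classes:

```python
from transformers.models.idefics2.configuration_idefics2 import Idefics2PerceiverConfig

# Valid: 4 key/value heads is <= 16 resampler heads, so construction succeeds.
config = Idefics2PerceiverConfig(num_key_value_heads=4, resampler_n_heads=16)

# Invalid: the `validate_architecture` hook shown above is expected to raise at construction.
try:
    Idefics2PerceiverConfig(num_key_value_heads=32, resampler_n_heads=16)
except ValueError as err:
    print(err)  # num_key_value_heads=32 must be less than or equal to resampler_n_heads=16
```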
+from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring +from ...utils.type_validators import interval @auto_docstring(checkpoint="inceptionai/Jais-2-8B-Chat") +@strict(accept_kwargs=True) class Jais2Config(PreTrainedConfig): r""" ```python @@ -42,7 +46,6 @@ class Jais2Config(PreTrainedConfig): model_type = "jais2" keys_to_ignore_at_inference = ["past_key_values"] - base_model_tp_plan = { "layers.*.self_attn.q_proj": "colwise", "layers.*.self_attn.k_proj": "colwise", @@ -57,57 +60,42 @@ class Jais2Config(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size: int | None = 150272, - hidden_size: int | None = 3328, - intermediate_size: int | None = 26624, - num_hidden_layers: int | None = 32, - num_attention_heads: int | None = 26, - num_key_value_heads: int | None = None, - hidden_act: str | None = "relu2", - max_position_embeddings: int | None = 8192, - initializer_range: float | None = 0.02, - layer_norm_eps: float | None = 1e-5, - use_cache: bool | None = True, - pad_token_id: int | None = None, - bos_token_id: int | None = 0, - eos_token_id: int | None = 150024, - tie_word_embeddings: bool | None = False, - attention_bias: bool | None = True, - attention_dropout: float | None = 0.0, - mlp_bias: bool | None = True, - head_dim: int | None = None, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads + vocab_size: int = 150272 + hidden_size: int = 3328 + intermediate_size: int = 26624 + num_hidden_layers: int = 32 + num_attention_heads: int = 26 + num_key_value_heads: int | None = None + hidden_act: str = "relu2" + max_position_embeddings: int = 8192 + initializer_range: float = interval(min=0.0, max=1.0)(default=0.02) + use_cache: bool = True + pad_token_id: int | None = None + bos_token_id: int | None = 0 + eos_token_id: int | list[int] | None = 150024 + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = True + attention_dropout: int | float | None = 0.0 + mlp_bias: bool = True + head_dim: int | None = None + layer_norm_eps: float = 1e-5 - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads + def __post_init__(self, **kwargs): + if self.head_dim is None: + self.head_dim = self.hidden_size // self.num_attention_heads + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.mlp_bias = mlp_bias - self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads - self.rope_parameters = rope_parameters + super().__post_init__(**kwargs) - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - super().__init__(**kwargs) - self.layer_norm_eps = layer_norm_eps + def validate_architecture(self): + 
"""Part of `@strict`-powered validation. Validates the architecture of the config.""" + if self.hidden_size % self.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({self.hidden_size}) is not a multiple of the number of attention " + f"heads ({self.num_attention_heads})." + ) __all__ = ["Jais2Config"] diff --git a/src/transformers/models/jais2/modular_jais2.py b/src/transformers/models/jais2/modular_jais2.py index c99b718a544f..6d65550fbab9 100644 --- a/src/transformers/models/jais2/modular_jais2.py +++ b/src/transformers/models/jais2/modular_jais2.py @@ -14,8 +14,8 @@ import torch.nn as nn +from huggingface_hub.dataclasses import strict -from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring, can_return_tuple from ..llama.configuration_llama import LlamaConfig from ..llama.modeling_llama import ( @@ -28,9 +28,8 @@ @auto_docstring(checkpoint="inceptionai/Jais-2-8B-Chat") +@strict(accept_kwargs=True) class Jais2Config(LlamaConfig): - model_type = "jais2" - base_model_tp_plan = { "layers.*.self_attn.q_proj": "colwise", "layers.*.self_attn.k_proj": "colwise", @@ -40,55 +39,19 @@ class Jais2Config(LlamaConfig): "layers.*.mlp.down_proj": "rowwise", } - def __init__( - self, - vocab_size: int | None = 150272, - hidden_size: int | None = 3328, - intermediate_size: int | None = 26624, - num_hidden_layers: int | None = 32, - num_attention_heads: int | None = 26, - num_key_value_heads: int | None = None, - hidden_act: str | None = "relu2", - max_position_embeddings: int | None = 8192, - initializer_range: float | None = 0.02, - layer_norm_eps: float | None = 1e-5, - use_cache: bool | None = True, - pad_token_id: int | None = None, - bos_token_id: int | None = 0, - eos_token_id: int | None = 150024, - tie_word_embeddings: bool | None = False, - attention_bias: bool | None = True, - attention_dropout: float | None = 0.0, - mlp_bias: bool | None = True, - head_dim: int | None = None, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - **kwargs, - ): - super().__init__( - vocab_size=vocab_size, - hidden_size=hidden_size, - intermediate_size=intermediate_size, - num_hidden_layers=num_hidden_layers, - num_attention_heads=num_attention_heads, - num_key_value_heads=num_key_value_heads, - hidden_act=hidden_act, - max_position_embeddings=max_position_embeddings, - initializer_range=initializer_range, - use_cache=use_cache, - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - attention_bias=attention_bias, - attention_dropout=attention_dropout, - mlp_bias=mlp_bias, - head_dim=head_dim, - rope_parameters=rope_parameters, - **kwargs, - ) - self.layer_norm_eps = layer_norm_eps - del self.rms_norm_eps - del self.pretraining_tp + vocab_size: int = 150272 + hidden_size: int = 3328 + intermediate_size: int = 26624 + num_attention_heads: int = 26 + hidden_act: str = "relu2" + max_position_embeddings: int = 8192 + layer_norm_eps: float = 1e-5 + bos_token_id: int | None = 0 + eos_token_id: int | list[int] | None = 150024 + attention_bias: bool = True + mlp_bias: bool = True + rms_norm_eps = AttributeError() + pretraining_tp = AttributeError() class Jais2MLP(NemotronMLP): diff --git a/src/transformers/models/jamba/configuration_jamba.py b/src/transformers/models/jamba/configuration_jamba.py index 2ea28bfed57a..a5dfb6fcd03c 100644 --- a/src/transformers/models/jamba/configuration_jamba.py +++ b/src/transformers/models/jamba/configuration_jamba.py @@ -15,14 +15,14 @@ 
import math -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="ai21labs/Jamba-v0.1") +@strict(accept_kwargs=True) class JambaConfig(PreTrainedConfig): r""" expert_layer_period (`int`, *optional*, defaults to 2): @@ -47,86 +47,44 @@ class JambaConfig(PreTrainedConfig): "num_local_experts": "num_experts", } - def __init__( - self, - vocab_size=65536, - tie_word_embeddings=False, - hidden_size=4096, - intermediate_size=14336, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=8, - hidden_act="silu", - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - output_router_logits=False, - router_aux_loss_coef=0.001, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - max_position_embeddings=262144, - attention_dropout=0.0, - num_experts_per_tok=2, - num_experts=16, - expert_layer_period=2, - expert_layer_offset=1, - attn_layer_period=8, - attn_layer_offset=4, - use_mamba_kernels=True, - mamba_d_state=16, - mamba_d_conv=4, - mamba_expand=2, - mamba_dt_rank="auto", - mamba_conv_bias=True, - mamba_proj_bias=False, - **kwargs, - ): - self.vocab_size = vocab_size - self.tie_word_embeddings = tie_word_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.max_position_embeddings = max_position_embeddings - self.attention_dropout = attention_dropout - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - - self.use_cache = use_cache - self.output_router_logits = output_router_logits - self.router_aux_loss_coef = router_aux_loss_coef - - self.num_experts_per_tok = num_experts_per_tok - self.num_experts = num_experts - self.expert_layer_period = expert_layer_period - self.expert_layer_offset = expert_layer_offset - self.attn_layer_period = attn_layer_period - self.attn_layer_offset = attn_layer_offset - - self._check_supported_offset("attention", self.attn_layer_period, self.attn_layer_offset) - self._check_supported_offset("expert", self.expert_layer_period, self.expert_layer_offset) - - self.use_mamba_kernels = use_mamba_kernels - self.mamba_d_state = mamba_d_state - self.mamba_d_conv = mamba_d_conv - self.mamba_expand = mamba_expand - self.mamba_dt_rank = math.ceil(self.hidden_size / 16) if mamba_dt_rank == "auto" else mamba_dt_rank - self.mamba_conv_bias = mamba_conv_bias - self.mamba_proj_bias = mamba_proj_bias - - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - super().__init__(**kwargs) + vocab_size: int = 65536 + tie_word_embeddings: bool = False + hidden_size: int = 4096 + intermediate_size: int = 14336 + num_hidden_layers: int = 32 + num_attention_heads: int = 32 + num_key_value_heads: int = 8 + hidden_act: str = "silu" + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-6 + use_cache: bool = True + output_router_logits: bool = False + router_aux_loss_coef: float = 0.001 + pad_token_id: int | None = 0 + bos_token_id: int | None = 1 + eos_token_id: int 
| None = 2 + max_position_embeddings: int = 262144 + attention_dropout: float | int = 0.0 + num_experts_per_tok: int = 2 + num_experts: int = 16 + expert_layer_period: int = 2 + expert_layer_offset: int = 1 + attn_layer_period: int = 8 + attn_layer_offset: int = 4 + use_mamba_kernels: bool = True + mamba_d_state: int = 16 + mamba_d_conv: int = 4 + mamba_expand: int = 2 + mamba_dt_rank: int | str = "auto" + mamba_conv_bias: bool = True + mamba_proj_bias: bool = False + + def __post_init__(self, **kwargs): + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads + + self.mamba_dt_rank = math.ceil(self.hidden_size / 16) if self.mamba_dt_rank == "auto" else self.mamba_dt_rank + super().__post_init__(**kwargs) @property def layers_block_type(self): @@ -142,10 +100,16 @@ def layers_num_experts(self): for i in range(self.num_hidden_layers) ] - def _check_supported_offset(self, property_: str, period: int, offset: int): - if offset >= period: + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + if self.attn_layer_offset >= self.attn_layer_period: + raise ValueError( + f"attention layer offset ({self.attn_layer_offset}) must be smaller than attention layer period ({self.attn_layer_period})" + ) + + if self.expert_layer_offset >= self.expert_layer_period: raise ValueError( - f"{property_} layer offset ({offset}) must be smaller than {property_} layer period ({period})" + f"expert layer offset ({self.expert_layer_offset}) must be smaller than expert layer period ({self.expert_layer_period})" ) diff --git a/src/transformers/models/janus/configuration_janus.py b/src/transformers/models/janus/configuration_janus.py index 64e4ad4d796d..dc5883f45790 100644 --- a/src/transformers/models/janus/configuration_janus.py +++ b/src/transformers/models/janus/configuration_janus.py @@ -18,6 +18,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring, logging from ..auto import CONFIG_MAPPING, AutoConfig @@ -27,6 +29,7 @@ @auto_docstring(checkpoint="deepseek-community/Janus-Pro-1B") +@strict(accept_kwargs=True) class JanusVisionConfig(PreTrainedConfig): r""" num_image_tokens (`int`, *optional*, defaults to 576): @@ -38,52 +41,28 @@ class JanusVisionConfig(PreTrainedConfig): model_type = "janus_vision_model" base_config_key = "vision_config" - def __init__( - self, - hidden_size=1024, - num_hidden_layers=24, - num_attention_heads=16, - num_channels=3, - patch_size=16, - image_size=384, - attention_dropout=0.0, - layer_norm_eps=1e-6, - hidden_act="gelu", - mlp_ratio=4.0, - attention_bias=True, - hidden_dropout_rate=0.0, - projection_dim=2048, - projection_dropout=0.0, - use_qk_norm=False, - initializer_range=0.02, - depth=2, - num_image_tokens=576, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_channels = num_channels - self.patch_size = patch_size - self.image_size = image_size - self.attention_dropout = attention_dropout - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act - - self.mlp_ratio = mlp_ratio - self.attention_bias = attention_bias - self.hidden_dropout_rate = hidden_dropout_rate - self.projection_dim = projection_dim - self.projection_dropout = projection_dropout - self.use_qk_norm = use_qk_norm - self.initializer_range = initializer_range - self.depth = depth - self.num_image_tokens = num_image_tokens + hidden_size: int = 1024 + num_hidden_layers: int = 24 + num_attention_heads: int = 16 + num_channels: int = 3 + image_size: int | list[int] | tuple[int, int] = 384 + patch_size: int | list[int] | tuple[int, int] = 16 + hidden_act: str = "gelu" + layer_norm_eps: float = 1e-6 + attention_dropout: float | int = 0.0 + mlp_ratio: float | int = 4.0 + attention_bias: bool = True + hidden_dropout_rate: float = 0.0 + projection_dim: int = 2048 + projection_dropout: float | int = 0.0 + use_qk_norm: bool = False + initializer_range: float = 0.02 + depth: int = 2 + num_image_tokens: int = 576 @auto_docstring(checkpoint="deepseek-community/Janus-Pro-1B") +@strict(accept_kwargs=True) class JanusVQVAEConfig(PreTrainedConfig): r""" image_token_embed_dim (`int`, *optional*, defaults to 2048): @@ -103,46 +82,26 @@ class JanusVQVAEConfig(PreTrainedConfig): model_type = "janus_vqgan" base_config_key = "vq_config" - def __init__( - self, - embed_dim: int = 8, - num_embeddings: int = 16384, - double_latent: bool = False, - latent_channels: int = 256, - num_patches: int = 32, - in_channels: int = 3, - out_channels: int = 3, - base_channels: int = 128, - channel_multiplier: list[int] = [1, 1, 2, 2, 4], - num_res_blocks: int = 2, - dropout: float = 0.0, - initializer_range=0.02, - projection_dim=2048, - num_hidden_layers=2, - hidden_act="gelu", - image_token_embed_dim=2048, - **kwargs, - ): - super().__init__(**kwargs) - self.embed_dim = embed_dim - self.num_embeddings = num_embeddings - self.double_latent = double_latent - self.latent_channels = latent_channels - self.in_channels = in_channels - self.base_channels = base_channels - self.channel_multiplier = channel_multiplier - self.num_res_blocks = num_res_blocks - self.dropout = dropout - self.initializer_range = initializer_range - self.num_patches = num_patches - self.out_channels = out_channels - 
self.projection_dim = projection_dim - self.num_hidden_layers = num_hidden_layers - self.hidden_act = hidden_act - self.image_token_embed_dim = image_token_embed_dim + embed_dim: int = 8 + num_embeddings: int = 16384 + double_latent: bool = False + latent_channels: int = 256 + in_channels: int = 3 + base_channels: int = 128 + channel_multiplier: list[int] | tuple[int, ...] = (1, 1, 2, 2, 4) + num_res_blocks: int = 2 + dropout: float | int = 0.0 + initializer_range: float = 0.02 + num_patches: int = 32 + out_channels: int = 3 + projection_dim: int = 2048 + num_hidden_layers: int = 2 + hidden_act: str = "gelu" + image_token_embed_dim: int = 2048 @auto_docstring(checkpoint="deepseek-community/Janus-Pro-1B") +@strict(accept_kwargs=True) class JanusConfig(PreTrainedConfig): r""" Example: @@ -176,61 +135,34 @@ "vq_config": JanusVQVAEConfig, } - def __init__( - self, - text_config=None, - vision_config=None, - vq_config=None, - image_token_id=100581, - **kwargs, - ): - if isinstance(text_config, dict): - text_config["model_type"] = text_config.get("model_type", "llama") - self.text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) - - elif text_config is None: + text_config: dict | PreTrainedConfig | None = None + vision_config: dict | PreTrainedConfig | None = None + vq_config: dict | PreTrainedConfig | None = None + image_token_id: int = 100581 + + def __post_init__(self, **kwargs): + if isinstance(self.text_config, dict): + self.text_config["model_type"] = self.text_config.get("model_type", "llama") + self.text_config = CONFIG_MAPPING[self.text_config["model_type"]](**self.text_config) + elif self.text_config is None: logger.info("`text_config` is None. Initializing with default values") self.text_config = CONFIG_MAPPING["llama"]() - elif isinstance(text_config, PreTrainedConfig): - self.text_config = text_config - else: - raise ValueError( - f"Invalid type for `text_config`. Must be either `dict` or `LlamaConfig`." - f" Type found: {type(text_config)}" - ) - - if vision_config is None: + + if self.vision_config is None: logger.info("`vision_config` is None. Initializing with default JanusVisionConfig values") self.vision_config = JanusVisionConfig() - elif isinstance(vision_config, dict): - self.vision_config = JanusVisionConfig(**vision_config) - elif isinstance(vision_config, JanusVisionConfig): - self.vision_config = vision_config - else: - raise ValueError( - f"Invalid type for `vision_config`. Must be either `dict` or `JanusVisionConfig`." - f" Type found: {type(vision_config)}" - ) - - if vq_config is None: + + if self.vq_config is None: logger.info("`vq_config` is None. Initializing with default JanusVQVAEConfig values") self.vq_config = JanusVQVAEConfig() - elif isinstance(vq_config, dict): - self.vq_config = JanusVQVAEConfig(**vq_config) - elif isinstance(vq_config, JanusVQVAEConfig): - self.vq_config = vq_config - else: - raise ValueError( - f"Invalid type for `vq_config`. Must be either `dict` or `JanusVQVAEConfig`." - f" Type found: {type(vq_config)}" - ) - - self.initializer_range = self.vision_config.initializer_range + elif isinstance(self.vq_config, dict): + self.vq_config = JanusVQVAEConfig(**self.vq_config) + # This dimension is required when decoding discrete image tokens to continuous input.
self.vq_config.num_patches = self.vision_config.image_size // self.vision_config.patch_size - # The default is only the index for the 1B model, 7B uses a different one - self.image_token_id = image_token_id - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["JanusVQVAEConfig", "JanusVisionConfig", "JanusConfig"] diff --git a/src/transformers/models/janus/modular_janus.py b/src/transformers/models/janus/modular_janus.py index f4eb386058af..3ac14da36ea1 100644 --- a/src/transformers/models/janus/modular_janus.py +++ b/src/transformers/models/janus/modular_janus.py @@ -19,6 +19,7 @@ import numpy as np import torch import torch.nn.functional as F +from huggingface_hub.dataclasses import strict from torch import nn from ... import initialization as init @@ -80,6 +81,7 @@ @auto_docstring(checkpoint="deepseek-community/Janus-Pro-1B") +@strict(accept_kwargs=True) class JanusVisionConfig(SiglipVisionConfig): r""" num_image_tokens (`int`, *optional*, defaults to 576): @@ -88,57 +90,25 @@ class JanusVisionConfig(SiglipVisionConfig): Dropout probability for the projection layer. """ - model_type = "janus_vision_model" - base_config_key = "vision_config" - - def __init__( - self, - hidden_size=1024, - num_hidden_layers=24, - num_attention_heads=16, - num_channels=3, - patch_size=16, - image_size=384, - attention_dropout=0.0, - layer_norm_eps=1e-6, - hidden_act="gelu", - mlp_ratio=4.0, - attention_bias=True, - hidden_dropout_rate=0.0, - projection_dim=2048, - projection_dropout=0.0, - use_qk_norm=False, - initializer_range=0.02, - depth=2, - num_image_tokens=576, - **kwargs, - ): - super().__init__( - hidden_size=hidden_size, - num_hidden_layers=num_hidden_layers, - num_attention_heads=num_attention_heads, - num_channels=num_channels, - patch_size=patch_size, - image_size=image_size, - attention_dropout=attention_dropout, - layer_norm_eps=layer_norm_eps, - hidden_act=hidden_act, - **kwargs, - ) - del self.intermediate_size - - self.mlp_ratio = mlp_ratio - self.attention_bias = attention_bias - self.hidden_dropout_rate = hidden_dropout_rate - self.projection_dim = projection_dim - self.projection_dropout = projection_dropout - self.use_qk_norm = use_qk_norm - self.initializer_range = initializer_range - self.depth = depth - self.num_image_tokens = num_image_tokens + hidden_size: int = 1024 + num_hidden_layers: int = 24 + num_attention_heads: int = 16 + image_size: int | list[int] | tuple[int, int] = 384 + hidden_act: str = "gelu" + mlp_ratio: float | int = 4.0 + attention_bias: bool = True + hidden_dropout_rate: float = 0.0 + projection_dim: int = 2048 + projection_dropout: float | int = 0.0 + use_qk_norm: bool = False + initializer_range: float = 0.02 + depth: int = 2 + num_image_tokens: int = 576 + intermediate_size = AttributeError() @auto_docstring(checkpoint="deepseek-community/Janus-Pro-1B") +@strict(accept_kwargs=True) class JanusVQVAEConfig(ChameleonVQVAEConfig): r""" image_token_embed_dim (`int`, *optional*, defaults to 2048): @@ -155,52 +125,30 @@ class JanusVQVAEConfig(ChameleonVQVAEConfig): Number of residual blocks. 
""" - def __init__( - self, - embed_dim: int = 8, - num_embeddings: int = 16384, - double_latent: bool = False, - latent_channels: int = 256, - num_patches: int = 32, - in_channels: int = 3, - out_channels: int = 3, - base_channels: int = 128, - channel_multiplier: list[int] = [1, 1, 2, 2, 4], - num_res_blocks: int = 2, - dropout: float = 0.0, - initializer_range=0.02, - projection_dim=2048, - num_hidden_layers=2, - hidden_act="gelu", - image_token_embed_dim=2048, - **kwargs, - ): - super().__init__( - embed_dim=embed_dim, - num_embeddings=num_embeddings, - double_latent=double_latent, - latent_channels=latent_channels, - in_channels=in_channels, - base_channels=base_channels, - channel_multiplier=channel_multiplier, - num_res_blocks=num_res_blocks, - dropout=dropout, - initializer_range=initializer_range, - **kwargs, - ) - self.num_patches = num_patches - self.out_channels = out_channels - self.projection_dim = projection_dim - self.num_hidden_layers = num_hidden_layers - self.hidden_act = hidden_act - self.image_token_embed_dim = image_token_embed_dim - - del self.resolution - del self.attn_resolutions - del self.attn_type + embed_dim: int = 8 + num_embeddings: int = 16384 + double_latent: bool = False + latent_channels: int = 256 + num_patches: int = 32 + in_channels: int = 3 + out_channels: int = 3 + base_channels: int = 128 + channel_multiplier: list[int] | tuple[int, ...] = (1, 1, 2, 2, 4) + num_res_blocks: int = 2 + dropout: float | int = 0.0 + initializer_range: float = 0.02 + projection_dim: int = 2048 + num_hidden_layers: int = 2 + hidden_act: str = "gelu" + image_token_embed_dim = 2048 + + resolution = AttributeError() + attn_resolutions = AttributeError() + attn_type = AttributeError() @auto_docstring(checkpoint="deepseek-community/Janus-Pro-1B") +@strict(accept_kwargs=True) class JanusConfig(PreTrainedConfig): r""" Example: @@ -234,61 +182,34 @@ class JanusConfig(PreTrainedConfig): "vq_config": JanusVQVAEConfig, } - def __init__( - self, - text_config=None, - vision_config=None, - vq_config=None, - image_token_id=100581, - **kwargs, - ): - if isinstance(text_config, dict): - text_config["model_type"] = text_config.get("model_type", "llama") - self.text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) + text_config: dict | PreTrainedConfig | None = None + vision_config: dict | PreTrainedConfig | None = None + vq_config: dict | PreTrainedConfig | None = None + image_token_id: int = 100581 - elif text_config is None: + def __post_init__(self, **kwargs): + if isinstance(self.text_config, dict): + self.text_config["model_type"] = self.text_config.get("model_type", "llama") + self.text_config = CONFIG_MAPPING[self.text_config["model_type"]](**self.text_config) + elif self.text_config is None: logger.info("`text_config` is None. Initializing with default values") self.text_config = CONFIG_MAPPING["llama"]() - elif isinstance(text_config, PreTrainedConfig): - self.text_config = text_config - else: - raise ValueError( - f"Invalid type for `text_config`. Must be either `dict` or `LlamaConfig`." - f" Type found: {type(text_config)}" - ) - if vision_config is None: + if self.vision_config is None: logger.info("`vision_config` is None. 
Initializing with default JanusVisionConfig values") self.vision_config = JanusVisionConfig() - elif isinstance(vision_config, dict): - self.vision_config = JanusVisionConfig(**vision_config) - elif isinstance(vision_config, JanusVisionConfig): - self.vision_config = vision_config - else: - raise ValueError( - f"Invalid type for `vision_config`. Must be either `dict` or `JanusVisionConfig`." - f" Type found: {type(vision_config)}" - ) + elif isinstance(self.vision_config, dict): + self.vision_config = JanusVisionConfig(**self.vision_config) - if vq_config is None: + if self.vq_config is None: logger.info("`vq_config` is None. Initializing with default JanusVQVAEConfig values") self.vq_config = JanusVQVAEConfig() - elif isinstance(vq_config, dict): - self.vq_config = JanusVQVAEConfig(**vq_config) - elif isinstance(vq_config, JanusVQVAEConfig): - self.vq_config = vq_config - else: - raise ValueError( - f"Invalid type for `vq_config`. Must be either `dict` or `JanusVQVAEConfig`." - f" Type found: {type(vq_config)}" - ) + elif isinstance(self.vq_config, dict): + self.vq_config = JanusVQVAEConfig(**self.vq_config) - self.initializer_range = self.vision_config.initializer_range # This dimension is required when decoding discrete image tokens to continuous input. self.vq_config.num_patches = self.vision_config.image_size // self.vision_config.patch_size - # The default is only the index for the 1B model, 7B uses a different one - self.image_token_id = image_token_id - super().__init__(**kwargs) + super().__post_init__(**kwargs) @auto_docstring diff --git a/src/transformers/models/jetmoe/configuration_jetmoe.py b/src/transformers/models/jetmoe/configuration_jetmoe.py index 11949f27b2e1..44b7f236f9f9 100644 --- a/src/transformers/models/jetmoe/configuration_jetmoe.py +++ b/src/transformers/models/jetmoe/configuration_jetmoe.py @@ -13,15 +13,15 @@ # limitations under the License. 
"""JetMoe model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters -from ...utils import auto_docstring, logging - - -logger = logging.get_logger(__name__) +from ...utils import auto_docstring @auto_docstring(checkpoint="jetmoe/jetmoe-8b") +@strict(accept_kwargs=True) class JetMoeConfig(PreTrainedConfig): r""" kv_channels (`int`, *optional*, defaults to 128): @@ -46,57 +46,36 @@ class JetMoeConfig(PreTrainedConfig): keys_to_ignore_at_inference = ["past_key_values"] attribute_map = {"head_dim": "kv_channels"} - def __init__( - self, - vocab_size: int | None = 32000, - hidden_size: int | None = 2048, - num_hidden_layers: int | None = 12, - num_key_value_heads: int | None = 16, - kv_channels: int | None = 128, - intermediate_size: int | None = 5632, - max_position_embeddings: int | None = 4096, - activation_function: str | None = "silu", - num_local_experts: int | None = 8, - num_experts_per_tok: int | None = 2, - output_router_logits: bool | None = False, - aux_loss_coef: float | None = 0.01, - use_cache: bool | None = True, - bos_token_id: int | None = 1, - eos_token_id: int | None = 2, - pad_token_id: int | None = None, - tie_word_embeddings: bool | None = True, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - rms_norm_eps: int | None = 1e-6, - initializer_range: float | None = 0.01, - attention_dropout: float | None = 0.0, - **kwargs, - ): - if num_experts_per_tok > num_local_experts: - raise ValueError("`num_experts_per_tok` must be less than or equal to `num_local_experts`") - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_key_value_heads * num_experts_per_tok - self.num_key_value_heads = num_key_value_heads - self.kv_channels = kv_channels - self.intermediate_size = intermediate_size - self.max_position_embeddings = max_position_embeddings - self.activation_function = activation_function - self.num_local_experts = num_local_experts - self.num_experts_per_tok = num_experts_per_tok - self.output_router_logits = output_router_logits - self.aux_loss_coef = aux_loss_coef - self.use_cache = use_cache - self.initializer_range = initializer_range - self.attention_dropout = attention_dropout + vocab_size: int = 32000 + hidden_size: int = 2048 + num_hidden_layers: int = 12 + num_key_value_heads: int = 16 + kv_channels: int = 128 + intermediate_size: int = 5632 + max_position_embeddings: int = 4096 + activation_function: str = "silu" + num_local_experts: int = 8 + num_experts_per_tok: int = 2 + output_router_logits: bool = False + aux_loss_coef: float = 0.01 + use_cache: bool = True + bos_token_id: int | None = 1 + eos_token_id: int | list[int] | None = 2 + pad_token_id: int | None = None + tie_word_embeddings: bool = True + rope_parameters: RopeParameters | dict | None = None + rms_norm_eps: float = 1e-6 + initializer_range: float = 0.01 + attention_dropout: float | int = 0.0 + + def __post_init__(self, **kwargs): + self.num_attention_heads = self.num_key_value_heads * self.num_experts_per_tok + super().__post_init__(**kwargs) - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.rms_norm_eps = rms_norm_eps - self.rope_parameters = rope_parameters - self.tie_word_embeddings = tie_word_embeddings - super().__init__(**kwargs) + def validate_architecture(self): + """Part of `@strict`-powered validation. 
Validates the architecture of the config.""" + if self.num_experts_per_tok > self.num_local_experts: + raise ValueError("`num_experts_per_tok` must be less than or equal to `num_local_experts`") __all__ = ["JetMoeConfig"] diff --git a/src/transformers/models/kosmos2/configuration_kosmos2.py b/src/transformers/models/kosmos2/configuration_kosmos2.py index bd96b9d8ce58..8485ac91e603 100644 --- a/src/transformers/models/kosmos2/configuration_kosmos2.py +++ b/src/transformers/models/kosmos2/configuration_kosmos2.py @@ -13,6 +13,8 @@ # limitations under the License. """KOSMOS-2 model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring, logging @@ -21,6 +23,7 @@ @auto_docstring(checkpoint="microsoft/kosmos-2-patch14-224") +@strict(accept_kwargs=True) class Kosmos2TextConfig(PreTrainedConfig): r""" activation_dropout (`float`, *optional*, defaults to 0.0): @@ -36,90 +39,49 @@ class Kosmos2TextConfig(PreTrainedConfig): "num_hidden_layers": "layers", } - def __init__( - self, - vocab_size=65037, - max_position_embeddings=2048, - embed_dim=2048, - layers=24, - ffn_dim=8192, - attention_heads=32, - activation_function="gelu", - dropout=0.1, - attention_dropout=0.1, - activation_dropout=0.0, - layerdrop=0.0, - layer_norm_eps=1e-5, - init_std=0.02, - scale_embedding=True, - use_cache=True, - pad_token_id=1, - bos_token_id=0, - eos_token_id=2, - add_cross_attention=False, - **kwargs, - ): - super().__init__(**kwargs) - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.add_cross_attention = add_cross_attention - - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.embed_dim = embed_dim - self.layers = layers - self.ffn_dim = ffn_dim - self.attention_heads = attention_heads - self.activation_function = activation_function - self.dropout = dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.layerdrop = layerdrop - self.layer_norm_eps = layer_norm_eps - self.init_std = init_std - self.scale_embedding = scale_embedding - self.use_cache = use_cache + vocab_size: int = 65037 + max_position_embeddings: int = 2048 + embed_dim: int = 2048 + layers: int = 24 + ffn_dim: int = 8192 + attention_heads: int = 32 + activation_function: str = "gelu" + dropout: float | int = 0.1 + attention_dropout: float | int = 0.1 + activation_dropout: float | int = 0.0 + layerdrop: float | int = 0.0 + layer_norm_eps: float = 1e-5 + init_std: float = 0.02 + scale_embedding: bool = True + use_cache: bool = True + pad_token_id: int | None = 1 + bos_token_id: int | None = 0 + eos_token_id: int | None = 2 + add_cross_attention: bool = False @auto_docstring(checkpoint="microsoft/kosmos-2-patch14-224") +@strict(accept_kwargs=True) class Kosmos2VisionConfig(PreTrainedConfig): model_type = "kosmos_2_vision_model" base_config_key = "vision_config" - def __init__( - self, - hidden_size=1024, - intermediate_size=4096, - num_hidden_layers=24, - num_attention_heads=16, - num_channels=3, - image_size=224, - patch_size=14, - hidden_act="quick_gelu", - layer_norm_eps=1e-5, - attention_dropout=0.0, - initializer_range=0.02, - initializer_factor=1.0, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_channels = 
num_channels - self.patch_size = patch_size - self.image_size = image_size - self.initializer_range = initializer_range - self.initializer_factor = initializer_factor - self.attention_dropout = attention_dropout - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act + hidden_size: int = 1024 + intermediate_size: int = 4096 + num_hidden_layers: int = 24 + num_attention_heads: int = 16 + num_channels: int = 3 + image_size: int | list[int] | tuple[int, int] = 224 + patch_size: int | list[int] | tuple[int, int] = 14 + hidden_act: str = "quick_gelu" + layer_norm_eps: float = 1e-5 + attention_dropout: float | int = 0.0 + initializer_range: float = 0.02 + initializer_factor: float = 1.0 @auto_docstring(checkpoint="microsoft/kosmos-2-patch14-224") +@strict(accept_kwargs=True) class Kosmos2Config(PreTrainedConfig): r""" latent_query_num (`int`, *optional*, defaults to 64): @@ -143,31 +105,25 @@ class Kosmos2Config(PreTrainedConfig): model_type = "kosmos-2" sub_configs = {"text_config": Kosmos2TextConfig, "vision_config": Kosmos2VisionConfig} - def __init__( - self, - text_config=None, - vision_config=None, - latent_query_num=64, - tie_word_embeddings=True, - **kwargs, - ): - if text_config is None: - text_config = Kosmos2TextConfig() + text_config: dict | PreTrainedConfig | None = None + vision_config: dict | PreTrainedConfig | None = None + latent_query_num: int = 64 + tie_word_embeddings: bool = True + + def __post_init__(self, **kwargs): + if self.text_config is None: + self.text_config = Kosmos2TextConfig() logger.info("`text_config` is `None`. initializing the `Kosmos2TextConfig` with default values.") - elif isinstance(text_config, dict): - text_config = Kosmos2TextConfig(**text_config) + elif isinstance(self.text_config, dict): + self.text_config = Kosmos2TextConfig(**self.text_config) - if vision_config is None: - vision_config = Kosmos2VisionConfig() + if self.vision_config is None: + self.vision_config = Kosmos2VisionConfig() logger.info("`vision_config` is `None`. initializing the `Kosmos2VisionConfig` with default values.") - elif isinstance(vision_config, dict): - vision_config = Kosmos2VisionConfig(**vision_config) - - self.text_config = text_config - self.vision_config = vision_config - self.latent_query_num = latent_query_num - self.tie_word_embeddings = tie_word_embeddings - super().__init__(**kwargs) + elif isinstance(self.vision_config, dict): + self.vision_config = Kosmos2VisionConfig(**self.vision_config) + + super().__post_init__(**kwargs) __all__ = ["Kosmos2Config"] diff --git a/src/transformers/models/kosmos2_5/configuration_kosmos2_5.py b/src/transformers/models/kosmos2_5/configuration_kosmos2_5.py index 4ebf5353f4f4..b0cbe9551f66 100644 --- a/src/transformers/models/kosmos2_5/configuration_kosmos2_5.py +++ b/src/transformers/models/kosmos2_5/configuration_kosmos2_5.py @@ -13,6 +13,8 @@ # limitations under the License. 
"""KOSMOS-2.5 model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring, logging @@ -21,6 +23,7 @@ @auto_docstring(checkpoint="microsoft/kosmos-2.5") +@strict(accept_kwargs=True) class Kosmos2_5TextConfig(PreTrainedConfig): r""" activation_dropout (`float`, *optional*, defaults to 0.0): @@ -36,51 +39,28 @@ class Kosmos2_5TextConfig(PreTrainedConfig): "num_hidden_layers": "layers", } - def __init__( - self, - vocab_size=108481, - max_position_embeddings=4096, - embed_dim=1536, - layers=24, - ffn_dim=6144, - attention_heads=16, - activation_function="gelu", - dropout=0.1, - attention_dropout=0.0, - activation_dropout=0.0, - layerdrop=0.0, - layer_norm_eps=1e-5, - init_std=0.02, - scale_embedding=True, - use_cache=True, - pad_token_id=1, - bos_token_id=0, - eos_token_id=2, - **kwargs, - ): - super().__init__(**kwargs) - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.embed_dim = embed_dim - self.layers = layers - self.ffn_dim = ffn_dim - self.attention_heads = attention_heads - self.activation_function = activation_function - self.dropout = dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.layerdrop = layerdrop - self.layer_norm_eps = layer_norm_eps - self.init_std = init_std - self.scale_embedding = scale_embedding - self.use_cache = use_cache + vocab_size: int = 108481 + max_position_embeddings: int = 4096 + embed_dim: int = 1536 + layers: int = 24 + ffn_dim: int = 6144 + attention_heads: int = 16 + activation_function: str = "gelu" + dropout: float | int = 0.1 + attention_dropout: float | int = 0.0 + activation_dropout: float | int = 0.0 + layerdrop: float | int = 0.0 + layer_norm_eps: float = 1e-5 + init_std: float = 0.02 + scale_embedding: bool = True + use_cache: bool = True + pad_token_id: int | None = 1 + bos_token_id: int | None = 0 + eos_token_id: int | None = 2 @auto_docstring(checkpoint="microsoft/kosmos-2.5") +@strict(accept_kwargs=True) class Kosmos2_5VisionConfig(PreTrainedConfig): r""" dense_act_fn (`str` or `function`, *optional*, defaults to `"gelu_new"`): @@ -109,41 +89,23 @@ class Kosmos2_5VisionConfig(PreTrainedConfig): model_type = "kosmos_2_5_vision_model" base_config_key = "vision_config" - def __init__( - self, - hidden_size=1536, - patch_embed_hidden_size=768, - intermediate_size=3968, - head_dim=64, - num_hidden_layers=18, - num_attention_heads=24, - dense_act_fn="gelu_new", - layer_norm_eps=1e-6, - dropout_rate=0.0, - attention_dropout=0.0, - max_num_patches=4096, - initializer_factor=1.0, - initializer_range=0.02, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.patch_embed_hidden_size = patch_embed_hidden_size - self.intermediate_size = intermediate_size - self.dropout_rate = dropout_rate - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.attention_dropout = attention_dropout - self.layer_norm_eps = layer_norm_eps - self.dense_act_fn = dense_act_fn - self.max_num_patches = max_num_patches - self.head_dim = head_dim - self.initializer_factor = initializer_factor - self.initializer_range = initializer_range + hidden_size: int = 1536 + patch_embed_hidden_size: int = 768 + intermediate_size: int = 3968 + head_dim: int = 64 + num_hidden_layers: int = 18 + 
num_attention_heads: int = 24 + dense_act_fn: str = "gelu_new" + layer_norm_eps: float = 1e-6 + dropout_rate: float = 0.0 + attention_dropout: float | int = 0.0 + max_num_patches: int = 4096 + initializer_factor: float = 1.0 + initializer_range: float = 0.02 @auto_docstring(checkpoint="microsoft/kosmos-2.5") +@strict(accept_kwargs=True) class Kosmos2_5Config(PreTrainedConfig): r""" latent_query_num (`int`, *optional*, defaults to 2048): @@ -153,31 +115,25 @@ class Kosmos2_5Config(PreTrainedConfig): model_type = "kosmos-2.5" sub_configs = {"text_config": Kosmos2_5TextConfig, "vision_config": Kosmos2_5VisionConfig} - def __init__( - self, - text_config=None, - vision_config=None, - latent_query_num=2048, - tie_word_embeddings=True, - **kwargs, - ): - if text_config is None: - text_config = Kosmos2_5TextConfig() + text_config: dict | PreTrainedConfig | None = None + vision_config: dict | PreTrainedConfig | None = None + latent_query_num: int = 2048 + tie_word_embeddings: bool = True + + def __post_init__(self, **kwargs): + if self.text_config is None: + self.text_config = Kosmos2_5TextConfig() logger.info("`text_config` is `None`. initializing the `Kosmos2_5TextConfig` with default values.") - elif isinstance(text_config, dict): - text_config = Kosmos2_5TextConfig(**text_config) + elif isinstance(self.text_config, dict): + self.text_config = Kosmos2_5TextConfig(**self.text_config) - if vision_config is None: - vision_config = Kosmos2_5VisionConfig() + if self.vision_config is None: + self.vision_config = Kosmos2_5VisionConfig() logger.info("`vision_config` is `None`. initializing the `Kosmos2_5VisionConfig` with default values.") - elif isinstance(vision_config, dict): - vision_config = Kosmos2_5VisionConfig(**vision_config) - - self.text_config = text_config - self.vision_config = vision_config - self.latent_query_num = latent_query_num - self.tie_word_embeddings = tie_word_embeddings - super().__init__(**kwargs) + elif isinstance(self.vision_config, dict): + self.vision_config = Kosmos2_5VisionConfig(**self.vision_config) + + super().__post_init__(**kwargs) __all__ = ["Kosmos2_5Config"] diff --git a/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py b/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py index cbe73530c41c..188b0b6cb65d 100644 --- a/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +++ b/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py @@ -13,6 +13,8 @@ # limitations under the License.s +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring, logging @@ -23,6 +25,7 @@ @auto_docstring(checkpoint="kyutai/stt-2.6b-en-trfs") +@strict(accept_kwargs=True) class KyutaiSpeechToTextConfig(PreTrainedConfig): r""" codebook_vocab_size (`int`, *optional*, defaults to 2049): @@ -58,77 +61,49 @@ class KyutaiSpeechToTextConfig(PreTrainedConfig): keys_to_ignore_at_inference = ["past_key_values"] sub_configs = {"codec_config": AutoConfig} - def __init__( - self, - codebook_vocab_size: int | None = 2049, - vocab_size: int | None = 4001, - hidden_size: int | None = 2048, - num_hidden_layers: int | None = 48, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = None, - max_position_embeddings: int | None = 750, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - hidden_act: str | 
None = "silu", - head_dim: int | None = None, - initializer_range: float | None = 0.02, - use_cache: bool | None = True, - sliding_window: int | None = 375, - attention_dropout: float | None = 0.0, - ffn_dim: int | None = 11264, - rms_norm_eps: int | None = 1e-8, - num_codebooks: int | None = 32, - audio_bos_token_id: int | None = 2048, - audio_pad_token_id: int | None = 69569, - tie_word_embeddings: bool | None = False, - pad_token_id: int | None = 3, - bos_token_id: int | None = 48000, - eos_token_id: int | None = None, - codec_config: dict | None = None, - **kwargs, - ): - if codec_config is None: + codebook_vocab_size: int = 2049 + vocab_size: int = 4001 + hidden_size: int = 2048 + num_hidden_layers: int = 48 + num_attention_heads: int = 32 + num_key_value_heads: int | None = None + max_position_embeddings: int = 750 + rope_parameters: RopeParameters | dict | None = None + hidden_act: str = "silu" + head_dim: int | None = None + initializer_range: float = 0.02 + use_cache: bool = True + sliding_window: int = 375 + attention_dropout: float | int = 0.0 + ffn_dim: int = 11264 + rms_norm_eps: float = 1e-8 + num_codebooks: int = 32 + audio_bos_token_id: int | None = 2048 + audio_pad_token_id: int | None = 69569 + tie_word_embeddings: bool = False + pad_token_id: int | None = 3 + bos_token_id: int | None = 48000 + eos_token_id: int | list[int] | None = None + codec_config: dict | PreTrainedConfig | None = None + + def __post_init__(self, **kwargs): + if self.codec_config is None: self.codec_config = AutoConfig.for_model("mimi") logger.info("codec_config is None, using default audio encoder config.") - elif isinstance(codec_config, dict): - self.codec_config = AutoConfig.for_model(**codec_config) - elif isinstance(codec_config, PreTrainedConfig): - self.codec_config = codec_config + elif isinstance(self.codec_config, dict): + self.codec_config = AutoConfig.for_model(**self.codec_config) + + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads - self.num_codebooks = num_codebooks self.frame_size = self.codec_config.frame_size + self.head_dim = self.head_dim if self.head_dim is not None else self.hidden_size // self.num_attention_heads + super().__post_init__(**kwargs) - self.audio_bos_token_id = audio_bos_token_id - self.audio_pad_token_id = audio_pad_token_id - self.codebook_vocab_size = codebook_vocab_size - - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - if ffn_dim % 2 == 1: - raise ValueError(f"`ffn_dim={ffn_dim}` must be even.") - self.ffn_dim = ffn_dim - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_dropout = attention_dropout - self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads - self.sliding_window = sliding_window - self.rope_parameters = rope_parameters - - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - super().__init__(**kwargs) + def validate_architecture(self): + """Part of `@strict`-powered validation. 
Validates the architecture of the config.""" + if self.ffn_dim % 2 == 1: + raise ValueError(f"`ffn_dim={self.ffn_dim}` must be even.") __all__ = ["KyutaiSpeechToTextConfig"] diff --git a/src/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py b/src/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py index 95accc95bf3f..ea4a937781f4 100644 --- a/src/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +++ b/src/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py @@ -835,7 +835,7 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if self.gradient_checkpointing and self.training and use_cache: logger.warning_once( diff --git a/src/transformers/models/lasr/configuration_lasr.py b/src/transformers/models/lasr/configuration_lasr.py index 07b57ba4282b..5a9a0cffd804 100644 --- a/src/transformers/models/lasr/configuration_lasr.py +++ b/src/transformers/models/lasr/configuration_lasr.py @@ -18,11 +18,14 @@ # See the License for the specific language governing permissions and # limitations under the License. +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring @auto_docstring(checkpoint="google/medasr") +@strict(accept_kwargs=True) class LasrEncoderConfig(PreTrainedConfig): r""" convolution_bias (`bool`, *optional*, defaults to `False`): @@ -65,68 +68,38 @@ class LasrEncoderConfig(PreTrainedConfig): model_type = "lasr_encoder" keys_to_ignore_at_inference = ["past_key_values"] - def __init__( - self, - hidden_size=512, - num_hidden_layers=17, - num_attention_heads=8, - intermediate_size=2048, - hidden_act="silu", - attention_bias=False, - convolution_bias=False, - conv_kernel_size=32, - subsampling_conv_channels=256, - subsampling_conv_kernel_size=5, - subsampling_conv_stride=2, - num_mel_bins=128, - dropout=0.1, - dropout_positions=0.0, - layerdrop=0.1, - activation_dropout=0.1, - attention_dropout=0.1, - max_position_embeddings=10000, - initializer_range=0.02, - layer_norm_eps=1e-6, - feed_forward_residual_weights=[1.5, 0.5], - conv_residual_weights=[2.0, 1.0], - batch_norm_momentum=0.01, - rope_parameters=None, - **kwargs, - ): - self.rope_parameters = rope_parameters - self.layer_norm_eps = layer_norm_eps - self.feed_forward_residual_weights = feed_forward_residual_weights - self.conv_residual_weights = conv_residual_weights - self.batch_norm_momentum = batch_norm_momentum - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_key_value_heads = num_attention_heads # LlamaAttention compatibility - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.attention_bias = attention_bias - self.convolution_bias = convolution_bias - - self.conv_kernel_size = conv_kernel_size - self.subsampling_conv_kernel_size = subsampling_conv_kernel_size - self.subsampling_conv_stride = subsampling_conv_stride - self.subsampling_conv_channels = subsampling_conv_channels - self.num_mel_bins = num_mel_bins - - self.dropout = dropout - self.dropout_positions = dropout_positions - self.layerdrop = layerdrop - self.activation_dropout = 
activation_dropout - self.attention_dropout = attention_dropout - self.max_position_embeddings = max_position_embeddings - self.initializer_range = initializer_range - - super().__init__( - **kwargs, - ) + hidden_size: int = 512 + num_hidden_layers: int = 17 + num_attention_heads: int = 8 + intermediate_size: int = 2048 + hidden_act: str = "silu" + attention_bias: bool = False + convolution_bias: bool = False + conv_kernel_size: int = 32 + subsampling_conv_channels: int = 256 + num_mel_bins: int = 128 + subsampling_conv_kernel_size: int = 5 + subsampling_conv_stride: int = 2 + dropout: float | int = 0.1 + dropout_positions: float = 0.0 + layerdrop: float | int = 0.1 + activation_dropout: float | int = 0.1 + attention_dropout: float | int = 0.1 + max_position_embeddings: int = 10000 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-6 + feed_forward_residual_weights: list[float] | tuple[float, ...] = (1.5, 0.5) + conv_residual_weights: list[float] | tuple[float, ...] = (2.0, 1.0) + batch_norm_momentum: float = 0.01 + rope_parameters: dict | None = None + + def __post_init__(self, **kwargs): + self.num_key_value_heads = self.num_attention_heads + super().__post_init__(**kwargs) @auto_docstring(checkpoint="google/medasr") +@strict(accept_kwargs=True) class LasrCTCConfig(PreTrainedConfig): r""" ctc_loss_reduction (`str`, *optional*, defaults to `"mean"`): @@ -154,40 +127,19 @@ class LasrCTCConfig(PreTrainedConfig): model_type = "lasr_ctc" sub_configs = {"encoder_config": LasrEncoderConfig} - def __init__( - self, - vocab_size=512, - ctc_loss_reduction="mean", - ctc_zero_infinity=True, - encoder_config: dict | LasrEncoderConfig = None, - pad_token_id=0, - **kwargs, - ): - self.vocab_size = vocab_size - self.ctc_loss_reduction = ctc_loss_reduction - self.ctc_zero_infinity = ctc_zero_infinity - - if isinstance(encoder_config, dict): - self.encoder_config = LasrEncoderConfig(**encoder_config) - elif encoder_config is None: - self.encoder_config = LasrEncoderConfig() + vocab_size: int = 512 + ctc_loss_reduction: str = "mean" + ctc_zero_infinity: bool = True + encoder_config: dict | PreTrainedConfig | None = None + pad_token_id: int = 0 - self.encoder_config = self.encoder_config + def __post_init__(self, **kwargs): + if isinstance(self.encoder_config, dict): + self.encoder_config = LasrEncoderConfig(**self.encoder_config) + elif self.encoder_config is None: + self.encoder_config = LasrEncoderConfig() self.initializer_range = self.encoder_config.initializer_range - self.pad_token_id = pad_token_id - - super().__init__(**kwargs) - - @classmethod - def from_encoder_config(cls, encoder_config: LasrEncoderConfig, **kwargs): - r""" - Instantiate a [`LasrCTCConfig`] (or a derived class) from lasr encoder model configuration. 
- - Returns: - [`LasrCTCConfig`]: An instance of a configuration object - """ - - return cls(encoder_config=encoder_config.to_dict(), **kwargs) + super().__post_init__(**kwargs) @property def inputs_to_logits_ratio(self): diff --git a/src/transformers/models/lasr/modular_lasr.py b/src/transformers/models/lasr/modular_lasr.py index 68d414bbbe3d..3ece6717f2a1 100644 --- a/src/transformers/models/lasr/modular_lasr.py +++ b/src/transformers/models/lasr/modular_lasr.py @@ -16,6 +16,7 @@ from collections.abc import Callable import torch +from huggingface_hub.dataclasses import strict from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers, processors from tokenizers.models import Unigram from torch import nn @@ -148,6 +149,7 @@ class LasrProcessor(ParakeetProcessor): @auto_docstring(checkpoint="google/medasr") +@strict(accept_kwargs=True) class LasrEncoderConfig(ParakeetEncoderConfig): r""" convolution_bias (`bool`, *optional*, defaults to `False`): @@ -187,68 +189,27 @@ class LasrEncoderConfig(ParakeetEncoderConfig): and pre-trained models at [TODO/TODO](https://huggingface.co/TODO/TODO). """ - def __init__( - self, - hidden_size=512, - num_hidden_layers=17, - num_attention_heads=8, - intermediate_size=2048, - hidden_act="silu", - attention_bias=False, - convolution_bias=False, - conv_kernel_size=32, - subsampling_conv_channels=256, - subsampling_conv_kernel_size=5, - subsampling_conv_stride=2, - num_mel_bins=128, - dropout=0.1, - dropout_positions=0.0, - layerdrop=0.1, - activation_dropout=0.1, - attention_dropout=0.1, - max_position_embeddings=10000, - initializer_range=0.02, - layer_norm_eps=1e-6, - feed_forward_residual_weights=[1.5, 0.5], - conv_residual_weights=[2.0, 1.0], - batch_norm_momentum=0.01, - rope_parameters=None, - **kwargs, - ): - self.rope_parameters = rope_parameters - self.layer_norm_eps = layer_norm_eps - self.feed_forward_residual_weights = feed_forward_residual_weights - self.conv_residual_weights = conv_residual_weights - self.batch_norm_momentum = batch_norm_momentum - - super().__init__( - hidden_size=hidden_size, - num_hidden_layers=num_hidden_layers, - num_attention_heads=num_attention_heads, - intermediate_size=intermediate_size, - hidden_act=hidden_act, - attention_bias=attention_bias, - convolution_bias=convolution_bias, - conv_kernel_size=conv_kernel_size, - subsampling_conv_channels=subsampling_conv_channels, - num_mel_bins=num_mel_bins, - subsampling_conv_kernel_size=subsampling_conv_kernel_size, - subsampling_conv_stride=subsampling_conv_stride, - dropout=dropout, - dropout_positions=dropout_positions, - layerdrop=layerdrop, - activation_dropout=activation_dropout, - attention_dropout=attention_dropout, - max_position_embeddings=max_position_embeddings, - initializer_range=initializer_range, - **kwargs, - ) - - del self.subsampling_factor - del self.scale_input + hidden_size: int = 512 + num_hidden_layers: int = 17 + intermediate_size: int = 2048 + attention_bias: bool = False + convolution_bias: bool = False + conv_kernel_size: int = 32 + subsampling_conv_kernel_size: int = 5 + num_mel_bins: int = 128 + max_position_embeddings: int = 10000 + layer_norm_eps: float = 1e-6 + feed_forward_residual_weights: list[float] | tuple[float, ...] = (1.5, 0.5) + conv_residual_weights: list[float] | tuple[float, ...] 
+    conv_residual_weights: list[float] | tuple[float, ...] = (2.0, 1.0)
+    batch_norm_momentum: float = 0.01
+    rope_parameters: dict | None = None
+
+    subsampling_factor = AttributeError()
+    scale_input = AttributeError()
 
 
 @auto_docstring(checkpoint="google/medasr")
+@strict(accept_kwargs=True)
 class LasrCTCConfig(ParakeetCTCConfig):
     r"""
     ctc_loss_reduction (`str`, *optional*, defaults to `"mean"`):
@@ -273,23 +234,8 @@ class LasrCTCConfig(ParakeetCTCConfig):
     and pre-trained models at [TODO/TODO](https://huggingface.co/TODO/TODO).
     """
 
-    def __init__(
-        self,
-        vocab_size=512,
-        ctc_loss_reduction="mean",
-        ctc_zero_infinity=True,
-        encoder_config: dict | LasrEncoderConfig = None,
-        pad_token_id=0,
-        **kwargs,
-    ):
-        super().__init__(
-            vocab_size=vocab_size,
-            ctc_loss_reduction=ctc_loss_reduction,
-            ctc_zero_infinity=ctc_zero_infinity,
-            encoder_config=encoder_config,
-            pad_token_id=pad_token_id,
-            **kwargs,
-        )
+    vocab_size: int = 512
+    pad_token_id: int = 0
 
     @property
     def inputs_to_logits_ratio(self):
diff --git a/src/transformers/models/layoutlm/configuration_layoutlm.py b/src/transformers/models/layoutlm/configuration_layoutlm.py
index 580e35b5a01f..394ea788d6bc 100644
--- a/src/transformers/models/layoutlm/configuration_layoutlm.py
+++ b/src/transformers/models/layoutlm/configuration_layoutlm.py
@@ -13,14 +13,14 @@
 # limitations under the License.
 """LayoutLM model configuration"""
 
-from ... import PreTrainedConfig
-from ...utils import auto_docstring, logging
-
+from huggingface_hub.dataclasses import strict
 
-logger = logging.get_logger(__name__)
+from ... import PreTrainedConfig
+from ...utils import auto_docstring
 
 
 @auto_docstring(checkpoint="microsoft/layoutlm-base-uncased")
+@strict(accept_kwargs=True)
 class LayoutLMConfig(PreTrainedConfig):
     r"""
     max_2d_position_embeddings (`int`, *optional*, defaults to 1024):
@@ -44,47 +44,24 @@ class LayoutLMConfig(PreTrainedConfig):
 
     model_type = "layoutlm"
 
-    def __init__(
-        self,
-        vocab_size=30522,
-        hidden_size=768,
-        num_hidden_layers=12,
-        num_attention_heads=12,
-        intermediate_size=3072,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=512,
-        type_vocab_size=2,
-        initializer_range=0.02,
-        layer_norm_eps=1e-12,
-        pad_token_id=0,
-        eos_token_id=None,
-        bos_token_id=None,
-        use_cache=True,
-        max_2d_position_embeddings=1024,
-        tie_word_embeddings=True,
-        **kwargs,
-    ):
-        super().__init__(**kwargs)
-        self.pad_token_id = pad_token_id
-        self.eos_token_id = eos_token_id
-        self.bos_token_id = bos_token_id
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.hidden_act = hidden_act
-        self.intermediate_size = intermediate_size
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.type_vocab_size = type_vocab_size
-        self.initializer_range = initializer_range
-        self.layer_norm_eps = layer_norm_eps
-        self.use_cache = use_cache
-        self.max_2d_position_embeddings = max_2d_position_embeddings
-        self.tie_word_embeddings = tie_word_embeddings
+    vocab_size: int = 30522
+    hidden_size: int = 768
+    num_hidden_layers: int = 12
+    num_attention_heads: int = 12
+    intermediate_size: int = 3072
+    hidden_act: str = "gelu"
+    hidden_dropout_prob: float = 0.1
+    attention_probs_dropout_prob: float = 0.1
+    max_position_embeddings: int = 512
+    type_vocab_size: int = 2
+    initializer_range: float = 0.02
+    layer_norm_eps: float = 1e-12
+    pad_token_id: int | None = 0
+    eos_token_id: int | list[int] | None = None
+    bos_token_id: int | None = None
+    use_cache: bool = True
+    max_2d_position_embeddings: int = 1024
+    tie_word_embeddings: bool = True
 
 
 __all__ = ["LayoutLMConfig"]
diff --git a/src/transformers/models/layoutlmv2/configuration_layoutlmv2.py b/src/transformers/models/layoutlmv2/configuration_layoutlmv2.py
index 5d4a08c91f7b..b09233b4f3a0 100644
--- a/src/transformers/models/layoutlmv2/configuration_layoutlmv2.py
+++ b/src/transformers/models/layoutlmv2/configuration_layoutlmv2.py
@@ -13,11 +13,10 @@
 # limitations under the License.
 """LayoutLMv2 model configuration"""
 
-from ...configuration_utils import PreTrainedConfig
-from ...utils import auto_docstring, is_detectron2_available, logging
-
+from huggingface_hub.dataclasses import strict
 
-logger = logging.get_logger(__name__)
+from ...configuration_utils import PreTrainedConfig
+from ...utils import auto_docstring, is_detectron2_available
 
 
 # soft dependency
@@ -26,6 +25,7 @@
 
 
 @auto_docstring(checkpoint="microsoft/layoutlmv2-base-uncased")
+@strict(accept_kwargs=True)
 class LayoutLMv2Config(PreTrainedConfig):
     r"""
     max_2d_position_embeddings (`int`, *optional*, defaults to 1024):
@@ -77,66 +77,40 @@ class LayoutLMv2Config(PreTrainedConfig):
 
     model_type = "layoutlmv2"
 
-    def __init__(
-        self,
-        vocab_size=30522,
-        hidden_size=768,
-        num_hidden_layers=12,
-        num_attention_heads=12,
-        intermediate_size=3072,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=512,
-        type_vocab_size=2,
-        initializer_range=0.02,
-        layer_norm_eps=1e-12,
-        pad_token_id=0,
-        max_2d_position_embeddings=1024,
-        max_rel_pos=128,
-        rel_pos_bins=32,
-        fast_qkv=True,
-        max_rel_2d_pos=256,
-        rel_2d_pos_bins=64,
-        convert_sync_batchnorm=True,
-        image_feature_pool_shape=[7, 7, 256],
-        coordinate_size=128,
-        shape_size=128,
-        has_relative_attention_bias=True,
-        has_spatial_attention_bias=True,
-        has_visual_segment_embedding=False,
-        detectron2_config_args=None,
-        **kwargs,
-    ):
-        super().__init__(**kwargs)
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.type_vocab_size = type_vocab_size
-        self.initializer_range = initializer_range
-        self.layer_norm_eps = layer_norm_eps
-        self.pad_token_id = pad_token_id
-        self.max_2d_position_embeddings = max_2d_position_embeddings
-        self.max_rel_pos = max_rel_pos
-        self.rel_pos_bins = rel_pos_bins
-        self.fast_qkv = fast_qkv
-        self.max_rel_2d_pos = max_rel_2d_pos
-        self.rel_2d_pos_bins = rel_2d_pos_bins
-        self.convert_sync_batchnorm = convert_sync_batchnorm
-        self.image_feature_pool_shape = image_feature_pool_shape
-        self.coordinate_size = coordinate_size
-        self.shape_size = shape_size
-        self.has_relative_attention_bias = has_relative_attention_bias
-        self.has_spatial_attention_bias = has_spatial_attention_bias
-        self.has_visual_segment_embedding = has_visual_segment_embedding
+    vocab_size: int = 30522
+    hidden_size: int = 768
+    num_hidden_layers: int = 12
+    num_attention_heads: int = 12
+    intermediate_size: int = 3072
+    hidden_act: str = "gelu"
+    hidden_dropout_prob: float = 0.1
+    attention_probs_dropout_prob: float = 0.1
+    max_position_embeddings: int = 512
+    type_vocab_size: int = 2
+    initializer_range: float = 0.02
+    layer_norm_eps: float = 1e-12
+    pad_token_id: int | None = 0
+    max_2d_position_embeddings: int = 1024
+    max_rel_pos: int = 128
+    rel_pos_bins: int = 32
+    fast_qkv: bool = True
+    max_rel_2d_pos: int = 256
+    rel_2d_pos_bins: int = 64
+    convert_sync_batchnorm: bool = True
+    image_feature_pool_shape: list[int] | tuple[int, ...] = (7, 7, 256)
+    coordinate_size: int = 128
+    shape_size: int = 128
+    has_relative_attention_bias: bool = True
+    has_spatial_attention_bias: bool = True
+    has_visual_segment_embedding: bool = False
+    detectron2_config_args: dict | None = None
+
+    def __post_init__(self, **kwargs):
+        super().__post_init__(**kwargs)
         self.detectron2_config_args = (
-            detectron2_config_args if detectron2_config_args is not None else self.get_default_detectron2_config()
+            self.detectron2_config_args
+            if self.detectron2_config_args is not None
+            else self.get_default_detectron2_config()
         )
 
     @classmethod
diff --git a/src/transformers/models/layoutlmv3/configuration_layoutlmv3.py b/src/transformers/models/layoutlmv3/configuration_layoutlmv3.py
index c4686c46d9b3..4ee3e7f4e9bb 100644
--- a/src/transformers/models/layoutlmv3/configuration_layoutlmv3.py
+++ b/src/transformers/models/layoutlmv3/configuration_layoutlmv3.py
@@ -13,14 +13,14 @@
 # limitations under the License.
 """LayoutLMv3 model configuration"""
 
-from ...configuration_utils import PreTrainedConfig
-from ...utils import auto_docstring, logging
-
+from huggingface_hub.dataclasses import strict
 
-logger = logging.get_logger(__name__)
+from ...configuration_utils import PreTrainedConfig
+from ...utils import auto_docstring
 
 
 @auto_docstring(checkpoint="microsoft/layoutlmv3-base")
+@strict(accept_kwargs=True)
 class LayoutLMv3Config(PreTrainedConfig):
     r"""
     max_2d_position_embeddings (`int`, *optional*, defaults to 1024):
@@ -66,71 +66,36 @@ class LayoutLMv3Config(PreTrainedConfig):
 
     model_type = "layoutlmv3"
 
-    def __init__(
-        self,
-        vocab_size=50265,
-        hidden_size=768,
-        num_hidden_layers=12,
-        num_attention_heads=12,
-        intermediate_size=3072,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=512,
-        type_vocab_size=2,
-        initializer_range=0.02,
-        layer_norm_eps=1e-5,
-        pad_token_id=1,
-        bos_token_id=0,
-        eos_token_id=2,
-        max_2d_position_embeddings=1024,
-        coordinate_size=128,
-        shape_size=128,
-        has_relative_attention_bias=True,
-        rel_pos_bins=32,
-        max_rel_pos=128,
-        rel_2d_pos_bins=64,
-        max_rel_2d_pos=256,
-        has_spatial_attention_bias=True,
-        text_embed=True,
-        visual_embed=True,
-        input_size=224,
-        num_channels=3,
-        patch_size=16,
-        classifier_dropout=None,
-        **kwargs,
-    ):
-        super().__init__(**kwargs)
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.type_vocab_size = type_vocab_size
-        self.initializer_range = initializer_range
-        self.layer_norm_eps = layer_norm_eps
-        self.pad_token_id = pad_token_id
-        self.bos_token_id = bos_token_id
-        self.eos_token_id = eos_token_id
-        self.max_2d_position_embeddings = max_2d_position_embeddings
-        self.coordinate_size = coordinate_size
-        self.shape_size = shape_size
-        self.has_relative_attention_bias = has_relative_attention_bias
-        self.rel_pos_bins = rel_pos_bins
-        self.max_rel_pos = max_rel_pos
-        self.has_spatial_attention_bias = has_spatial_attention_bias
-        self.rel_2d_pos_bins = rel_2d_pos_bins
-        self.max_rel_2d_pos = max_rel_2d_pos
-        self.text_embed = text_embed
-        self.visual_embed = visual_embed
-        self.input_size = input_size
-        self.num_channels = num_channels
-        self.patch_size = patch_size
-        self.classifier_dropout = classifier_dropout
+    vocab_size: int = 50265
+    hidden_size: int = 768
+    num_hidden_layers: int = 12
+    num_attention_heads: int = 12
+    intermediate_size: int = 3072
+    hidden_act: str = "gelu"
+    hidden_dropout_prob: float = 0.1
+    attention_probs_dropout_prob: float = 0.1
+    max_position_embeddings: int = 512
+    type_vocab_size: int = 2
+    initializer_range: float = 0.02
+    layer_norm_eps: float = 1e-5
+    pad_token_id: int | None = 1
+    bos_token_id: int | None = 0
+    eos_token_id: int | None = 2
+    max_2d_position_embeddings: int = 1024
+    coordinate_size: int = 128
+    shape_size: int = 128
+    has_relative_attention_bias: bool = True
+    rel_pos_bins: int = 32
+    max_rel_pos: int = 128
+    rel_2d_pos_bins: int = 64
+    max_rel_2d_pos: int = 256
+    has_spatial_attention_bias: bool = True
+    text_embed: bool = True
+    visual_embed: bool = True
+    input_size: int = 224
+    num_channels: int = 3
+    patch_size: int | list[int] | tuple[int, int] = 16
+    classifier_dropout: float | int | None = None
 
 
 __all__ = ["LayoutLMv3Config"]
diff --git a/src/transformers/models/layoutxlm/configuration_layoutxlm.py b/src/transformers/models/layoutxlm/configuration_layoutxlm.py
index 76b5d8c6136d..6f874b99182b 100644
--- a/src/transformers/models/layoutxlm/configuration_layoutxlm.py
+++ b/src/transformers/models/layoutxlm/configuration_layoutxlm.py
@@ -18,6 +18,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+from huggingface_hub.dataclasses import strict
+
 from ...configuration_utils import PreTrainedConfig
 from ...utils import auto_docstring, is_detectron2_available
@@ -28,6 +31,7 @@
 
 
 @auto_docstring(checkpoint="microsoft/layoutxlm-base")
+@strict(accept_kwargs=True)
 class LayoutXLMConfig(PreTrainedConfig):
     r"""
     max_2d_position_embeddings (`int`, *optional*, defaults to 1024):
@@ -79,66 +83,40 @@ class LayoutXLMConfig(PreTrainedConfig):
 
     model_type = "layoutxlm"
 
-    def __init__(
-        self,
-        vocab_size=30522,
-        hidden_size=768,
-        num_hidden_layers=12,
-        num_attention_heads=12,
-        intermediate_size=3072,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=512,
-        type_vocab_size=2,
-        initializer_range=0.02,
-        layer_norm_eps=1e-12,
-        pad_token_id=0,
-        max_2d_position_embeddings=1024,
-        max_rel_pos=128,
-        rel_pos_bins=32,
-        fast_qkv=True,
-        max_rel_2d_pos=256,
-        rel_2d_pos_bins=64,
-        convert_sync_batchnorm=True,
-        image_feature_pool_shape=[7, 7, 256],
-        coordinate_size=128,
-        shape_size=128,
-        has_relative_attention_bias=True,
-        has_spatial_attention_bias=True,
-        has_visual_segment_embedding=False,
-        detectron2_config_args=None,
-        **kwargs,
-    ):
-        super().__init__(**kwargs)
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.type_vocab_size = type_vocab_size
-        self.initializer_range = initializer_range
-        self.layer_norm_eps = layer_norm_eps
-        self.pad_token_id = pad_token_id
-        self.max_2d_position_embeddings = max_2d_position_embeddings
-        self.max_rel_pos = max_rel_pos
-        self.rel_pos_bins = rel_pos_bins
-        self.fast_qkv = fast_qkv
-        self.max_rel_2d_pos = max_rel_2d_pos
-        self.rel_2d_pos_bins = rel_2d_pos_bins
-        self.convert_sync_batchnorm = convert_sync_batchnorm
-        self.image_feature_pool_shape = image_feature_pool_shape
-        self.coordinate_size = coordinate_size
-        self.shape_size = shape_size
-        self.has_relative_attention_bias = has_relative_attention_bias
-        self.has_spatial_attention_bias = has_spatial_attention_bias
-        self.has_visual_segment_embedding = has_visual_segment_embedding
+    vocab_size: int = 30522
+    hidden_size: int = 768
+    num_hidden_layers: int = 12
+    num_attention_heads: int = 12
+    intermediate_size: int = 3072
+    hidden_act: str = "gelu"
+    hidden_dropout_prob: float = 0.1
+    attention_probs_dropout_prob: float = 0.1
+    max_position_embeddings: int = 512
+    type_vocab_size: int = 2
+    initializer_range: float = 0.02
+    layer_norm_eps: float = 1e-12
+    pad_token_id: int | None = 0
+    max_2d_position_embeddings: int = 1024
+    max_rel_pos: int = 128
+    rel_pos_bins: int = 32
+    fast_qkv: bool = True
+    max_rel_2d_pos: int = 256
+    rel_2d_pos_bins: int = 64
+    convert_sync_batchnorm: bool = True
+    image_feature_pool_shape: list[int] | tuple[int, ...] = (7, 7, 256)
+    coordinate_size: int = 128
+    shape_size: int = 128
+    has_relative_attention_bias: bool = True
+    has_spatial_attention_bias: bool = True
+    has_visual_segment_embedding: bool = False
+    detectron2_config_args: dict | None = None
+
+    def __post_init__(self, **kwargs):
+        super().__post_init__(**kwargs)
         self.detectron2_config_args = (
-            detectron2_config_args if detectron2_config_args is not None else self.get_default_detectron2_config()
+            self.detectron2_config_args
+            if self.detectron2_config_args is not None
+            else self.get_default_detectron2_config()
        )
 
     @classmethod
diff --git a/src/transformers/models/layoutxlm/modular_layoutxlm.py b/src/transformers/models/layoutxlm/modular_layoutxlm.py
index 479956d1875c..35749b1e6cab 100644
--- a/src/transformers/models/layoutxlm/modular_layoutxlm.py
+++ b/src/transformers/models/layoutxlm/modular_layoutxlm.py
@@ -12,11 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+
+from huggingface_hub.dataclasses import strict
+
 from ...utils import auto_docstring
 from ..layoutlmv2.configuration_layoutlmv2 import LayoutLMv2Config
 
 
 @auto_docstring(checkpoint="microsoft/layoutxlm-base")
+@strict(accept_kwargs=True)
 class LayoutXLMConfig(LayoutLMv2Config):
     r"""
     max_2d_position_embeddings (`int`, *optional*, defaults to 1024):
diff --git a/src/transformers/models/led/configuration_led.py b/src/transformers/models/led/configuration_led.py
index 3194ad8b9df9..ffe3314c4ade 100644
--- a/src/transformers/models/led/configuration_led.py
+++ b/src/transformers/models/led/configuration_led.py
@@ -13,14 +13,14 @@
 # limitations under the License.
 """LED model configuration"""
 
-from ...configuration_utils import PreTrainedConfig
-from ...utils import auto_docstring, logging
-
+from huggingface_hub.dataclasses import strict
 
-logger = logging.get_logger(__name__)
+from ...configuration_utils import PreTrainedConfig
+from ...utils import auto_docstring
 
 
 @auto_docstring(checkpoint="allenai/led-base-16384")
+@strict(accept_kwargs=True)
 class LEDConfig(PreTrainedConfig):
     r"""
     attention_window (`int` or `list[int]`, *optional*, defaults to 512):
@@ -52,66 +52,35 @@ class LEDConfig(PreTrainedConfig):
         "hidden_size": "d_model",
         "attention_probs_dropout_prob": "attention_dropout",
         "initializer_range": "init_std",
+        "num_hidden_layers": "encoder_layers",
     }
 
-    def __init__(
-        self,
-        vocab_size=50265,
-        max_encoder_position_embeddings=16384,
-        max_decoder_position_embeddings=1024,
-        encoder_layers=12,
-        encoder_ffn_dim=4096,
-        encoder_attention_heads=16,
-        decoder_layers=12,
-        decoder_ffn_dim=4096,
-        decoder_attention_heads=16,
-        encoder_layerdrop=0.0,
-        decoder_layerdrop=0.0,
-        use_cache=True,
-        is_encoder_decoder=True,
-        activation_function="gelu",
-        d_model=1024,
-        dropout=0.1,
-        attention_dropout=0.0,
-        activation_dropout=0.0,
-        init_std=0.02,
-        decoder_start_token_id=2,
-        classifier_dropout=0.0,
-        pad_token_id=1,
-        bos_token_id=0,
-        eos_token_id=2,
-        attention_window: list[int] | int = 512,
-        tie_word_embeddings: bool | None = True,
-        **kwargs,
-    ):
-        self.vocab_size = vocab_size
-        self.max_encoder_position_embeddings = max_encoder_position_embeddings
-        self.max_decoder_position_embeddings = max_decoder_position_embeddings
-        self.d_model = d_model
-        self.encoder_ffn_dim = encoder_ffn_dim
-        self.encoder_layers = encoder_layers
-        self.encoder_attention_heads = encoder_attention_heads
-        self.decoder_ffn_dim = decoder_ffn_dim
-        self.decoder_layers = decoder_layers
-        self.decoder_attention_heads = decoder_attention_heads
-        self.dropout = dropout
-        self.attention_dropout = attention_dropout
-        self.activation_dropout = activation_dropout
-        self.activation_function = activation_function
-        self.init_std = init_std
-        self.encoder_layerdrop = encoder_layerdrop
-        self.decoder_layerdrop = decoder_layerdrop
-        self.classifier_dropout = classifier_dropout
-        self.use_cache = use_cache
-        self.num_hidden_layers = encoder_layers
-        self.attention_window = attention_window
-
-        self.pad_token_id = pad_token_id
-        self.bos_token_id = bos_token_id
-        self.eos_token_id = eos_token_id
-        self.decoder_start_token_id = decoder_start_token_id
-        self.tie_word_embeddings = tie_word_embeddings
-        super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
+    vocab_size: int = 50265
+    max_encoder_position_embeddings: int = 16384
+    max_decoder_position_embeddings: int = 1024
+    encoder_layers: int = 12
+    encoder_ffn_dim: int = 4096
+    encoder_attention_heads: int = 16
+    decoder_layers: int = 12
+    decoder_ffn_dim: int = 4096
+    decoder_attention_heads: int = 16
+    encoder_layerdrop: float | int = 0.0
+    decoder_layerdrop: float | int = 0.0
+    use_cache: bool = True
+    is_encoder_decoder: bool = True
+    activation_function: str = "gelu"
+    d_model: int = 1024
+    dropout: float | int = 0.1
+    attention_dropout: float | int = 0.0
+    activation_dropout: float | int = 0.0
+    init_std: float = 0.02
+    decoder_start_token_id: int = 2
+    classifier_dropout: float | int = 0.0
+    pad_token_id: int | None = 1
+    bos_token_id: int | None = 0
+    eos_token_id: int | None = 2
+    attention_window: list[int] | int = 512
+    tie_word_embeddings: bool = True
 
 
 __all__ = ["LEDConfig"]
diff --git a/src/transformers/models/led/modeling_led.py b/src/transformers/models/led/modeling_led.py
index 45edbc9ab8d3..8fac15de5518 100755
--- a/src/transformers/models/led/modeling_led.py
+++ b/src/transformers/models/led/modeling_led.py
@@ -1426,7 +1426,7 @@ def forward(
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
 
         # check input_ids and inputs_embeds
         if input_ids is not None and inputs_embeds is not None:
@@ -1642,7 +1642,7 @@ def forward(
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
         use_cache = use_cache if use_cache is not None else self.config.use_cache
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
 
         # retrieve input_ids and inputs_embeds
         if input_ids is not None and inputs_embeds is not None:
@@ -1828,7 +1828,7 @@ def forward(
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
         use_cache = use_cache if use_cache is not None else self.config.use_cache
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
 
         # Using this like Bart, as LED is derived from it. So far
         # No checkpoint on the hub exists that uses that in practice.
@@ -2029,7 +2029,7 @@ def forward(
         >>> print(tokenizer.decode(prediction, skip_special_tokens=True))
         ```
         """
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
 
         if labels is not None:
             if use_cache:
@@ -2146,7 +2146,7 @@ def forward(
             - 0 for local attention (a sliding window attention),
             - 1 for global attention (tokens that attend to all other tokens, and all other tokens attend to them).
         """
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
 
         if start_positions is not None and end_positions is not None:
             use_cache = False
diff --git a/src/transformers/models/levit/configuration_levit.py b/src/transformers/models/levit/configuration_levit.py
index 97f8f74f94a2..6f95cc303c41 100644
--- a/src/transformers/models/levit/configuration_levit.py
+++ b/src/transformers/models/levit/configuration_levit.py
@@ -13,14 +13,14 @@
 # limitations under the License.
 """LeViT model configuration"""
 
-from ...configuration_utils import PreTrainedConfig
-from ...utils import auto_docstring, logging
-
+from huggingface_hub.dataclasses import strict
 
-logger = logging.get_logger(__name__)
+from ...configuration_utils import PreTrainedConfig
+from ...utils import auto_docstring
 
 
 @auto_docstring(checkpoint="facebook/levit-128S")
+@strict(accept_kwargs=True)
 class LevitConfig(PreTrainedConfig):
     r"""
     stride (`int`, *optional*, defaults to 2):
@@ -52,43 +52,27 @@ class LevitConfig(PreTrainedConfig):
 
     model_type = "levit"
 
-    def __init__(
-        self,
-        image_size=224,
-        num_channels=3,
-        kernel_size=3,
-        stride=2,
-        padding=1,
-        patch_size=16,
-        hidden_sizes=[128, 256, 384],
-        num_attention_heads=[4, 8, 12],
-        depths=[4, 4, 4],
-        key_dim=[16, 16, 16],
-        drop_path_rate=0,
-        mlp_ratio=[2, 2, 2],
-        attention_ratio=[2, 2, 2],
-        initializer_range=0.02,
-        **kwargs,
-    ):
-        super().__init__(**kwargs)
-        self.image_size = image_size
-        self.num_channels = num_channels
-        self.kernel_size = kernel_size
-        self.stride = stride
-        self.padding = padding
-        self.hidden_sizes = hidden_sizes
-        self.num_attention_heads = num_attention_heads
-        self.depths = depths
-        self.key_dim = key_dim
-        self.drop_path_rate = drop_path_rate
-        self.patch_size = patch_size
-        self.attention_ratio = attention_ratio
-        self.mlp_ratio = mlp_ratio
-        self.initializer_range = initializer_range
+    image_size: int | list[int] | tuple[int, int] = 224
+    num_channels: int = 3
+    kernel_size: int = 3
+    stride: int = 2
+    padding: int = 1
+    patch_size: int | list[int] | tuple[int, int] = 16
+    hidden_sizes: list[int] | tuple[int, ...] = (128, 256, 384)
+    num_attention_heads: list[int] | tuple[int, ...] = (4, 8, 12)
+    depths: list[int] | tuple[int, ...] = (4, 4, 4)
+    key_dim: list[int] | tuple[int, ...] = (16, 16, 16)
+    drop_path_rate: int = 0
+    mlp_ratio: list[int] | tuple[int, ...] = (2, 2, 2)
+    attention_ratio: list[int] | tuple[int, ...] = (2, 2, 2)
+    initializer_range: float = 0.02
+
+    def __post_init__(self, **kwargs):
         self.down_ops = [
-            ["Subsample", key_dim[0], hidden_sizes[0] // key_dim[0], 4, 2, 2],
-            ["Subsample", key_dim[0], hidden_sizes[1] // key_dim[0], 4, 2, 2],
+            ["Subsample", self.key_dim[0], self.hidden_sizes[0] // self.key_dim[0], 4, 2, 2],
+            ["Subsample", self.key_dim[0], self.hidden_sizes[1] // self.key_dim[0], 4, 2, 2],
         ]
+        super().__post_init__(**kwargs)
 
 
 __all__ = ["LevitConfig"]
diff --git a/src/transformers/models/levit/modeling_levit.py b/src/transformers/models/levit/modeling_levit.py
index fa3bdc8b7c13..2d334afe0cb7 100644
--- a/src/transformers/models/levit/modeling_levit.py
+++ b/src/transformers/models/levit/modeling_levit.py
@@ -510,7 +510,7 @@ def forward(
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
 
         if pixel_values is None:
             raise ValueError("You have to specify pixel_values")
@@ -575,7 +575,7 @@ def forward(
             config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
             `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
         """
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
 
         outputs = self.levit(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict)
 
@@ -636,7 +636,7 @@ def forward(
         return_dict: bool | None = None,
         **kwargs,
     ) -> tuple | LevitForImageClassificationWithTeacherOutput:
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
 
         outputs = self.levit(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict)
 
diff --git a/src/transformers/models/lfm2/configuration_lfm2.py b/src/transformers/models/lfm2/configuration_lfm2.py
index 60410f1b84c5..2109ded4488f 100644
--- a/src/transformers/models/lfm2/configuration_lfm2.py
+++ b/src/transformers/models/lfm2/configuration_lfm2.py
@@ -12,12 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+
+from huggingface_hub.dataclasses import strict
+
 from ...configuration_utils import PreTrainedConfig
 from ...modeling_rope_utils import RopeParameters
 from ...utils import auto_docstring
 
 
 @auto_docstring(checkpoint="LiquidAI/LFM2-1.2B")
+@strict(accept_kwargs=True)
 class Lfm2Config(PreTrainedConfig):
     r"""
     conv_bias (`bool`, *optional*, defaults to `False`):
@@ -51,66 +55,41 @@ class Lfm2Config(PreTrainedConfig):
     keys_to_ignore_at_inference = ["past_key_values"]
     default_theta = 1000000.0
 
-    def __init__(
-        self,
-        vocab_size: int | None = 65536,
-        hidden_size: int | None = 2560,
-        intermediate_size: int | None = 12288,
-        num_hidden_layers: int | None = 32,
-        num_attention_heads: int | None = 32,
-        num_key_value_heads: int | None = 8,
-        max_position_embeddings: int | None = 128_000,
-        initializer_range: float | None = 0.02,
-        norm_eps: float | None = 0.00001,
-        use_cache: bool | None = True,
-        pad_token_id: int | None = 0,
-        bos_token_id: int | None = 1,
-        eos_token_id: int | None = 2,
-        tie_word_embeddings: bool | None = True,
-        rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None,
-        conv_bias: bool | None = False,
-        conv_L_cache: int | None = 3,
-        block_multiple_of: int | None = 256,
-        block_ffn_dim_multiplier: float | None = 1.0,
-        block_auto_adjust_ff_dim: bool | None = True,
-        full_attn_idxs: list[int] | None = None,
-        layer_types: list[str] | None = None,
-        **kwargs,
-    ):
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.max_position_embeddings = max_position_embeddings
-        self.use_cache = use_cache
-        self.norm_eps = norm_eps
-        self.initializer_range = initializer_range
-
-        # attn operator config
-        self.num_attention_heads = num_attention_heads
-        self.num_key_value_heads = num_key_value_heads
-
-        # custom operator config
-        self.conv_bias = conv_bias
-        self.conv_L_cache = conv_L_cache
-
-        # MLP config
-        self.intermediate_size = kwargs.get("block_ff_dim", intermediate_size)  # to fit original config keys
-        self.block_multiple_of = block_multiple_of
-        self.block_ffn_dim_multiplier = block_ffn_dim_multiplier
-        self.block_auto_adjust_ff_dim = block_auto_adjust_ff_dim
-
-        self.layer_types = layer_types
+    vocab_size: int = 65536
+    hidden_size: int = 2560
+    intermediate_size: int = 12288
+    num_hidden_layers: int = 32
+    num_attention_heads: int = 32
+    num_key_value_heads: int = 8
+    max_position_embeddings: int = 128_000
+    initializer_range: float = 0.02
+    norm_eps: float = 0.00001
+    use_cache: bool = True
+    pad_token_id: int | None = 0
+    bos_token_id: int | None = 1
+    eos_token_id: int | list[int] | None = 2
+    tie_word_embeddings: bool = True
+    rope_parameters: RopeParameters | dict | None = None
+    conv_bias: bool = False
+    conv_L_cache: int = 3
+    block_multiple_of: int = 256
+    block_ffn_dim_multiplier: float = 1.0
+    block_auto_adjust_ff_dim: bool = True
+    full_attn_idxs: list[int] | None = None
+    layer_types: list[str] | None = None
+
+    def __post_init__(self, **kwargs):
         if self.layer_types is None:
-            full_attn_idxs = full_attn_idxs if full_attn_idxs is not None else list(range(num_hidden_layers))
-            self.layer_types = ["full_attention" if i in full_attn_idxs else "conv" for i in range(num_hidden_layers)]
-
-        self.rope_parameters = rope_parameters
-        tie_word_embeddings = kwargs.get("tie_embedding", tie_word_embeddings)  # to fit original config keys
-        self.tie_word_embeddings = tie_word_embeddings
-        self.pad_token_id = pad_token_id
-        self.bos_token_id = bos_token_id
-        self.eos_token_id = eos_token_id
-        super().__init__(**kwargs)
+            self.full_attn_idxs = (
+                self.full_attn_idxs if self.full_attn_idxs is not None else list(range(self.num_hidden_layers))
+            )
+            self.layer_types = [
+                "full_attention" if i in self.full_attn_idxs else "conv" for i in range(self.num_hidden_layers)
+            ]
+
+        self.tie_word_embeddings = kwargs.pop("tie_embedding", self.tie_word_embeddings)
+        self.intermediate_size = kwargs.pop("block_ff_dim", self.intermediate_size)
+        super().__post_init__(**kwargs)
 
 
 __all__ = ["Lfm2Config"]
diff --git a/src/transformers/models/lfm2_moe/configuration_lfm2_moe.py b/src/transformers/models/lfm2_moe/configuration_lfm2_moe.py
index 32c29cbf8918..5901c86fe983 100644
--- a/src/transformers/models/lfm2_moe/configuration_lfm2_moe.py
+++ b/src/transformers/models/lfm2_moe/configuration_lfm2_moe.py
@@ -12,12 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+
+from huggingface_hub.dataclasses import strict
+
 from ...configuration_utils import PreTrainedConfig
-from ...modeling_rope_utils import RopeParameters
 from ...utils import auto_docstring
 
 
 @auto_docstring(checkpoint="LiquidAI/LFM2-8B-A1B")
+@strict(accept_kwargs=True)
 class Lfm2MoeConfig(PreTrainedConfig):
     r"""
     conv_bias (`bool`, *optional*, defaults to `False`):
@@ -46,70 +49,35 @@ class Lfm2MoeConfig(PreTrainedConfig):
     keys_to_ignore_at_inference = ["past_key_values"]
     default_theta = 1000000.0
 
-    def __init__(
-        self,
-        vocab_size: int = 65536,
-        hidden_size: int = 2048,
-        intermediate_size: int = 7168,
-        moe_intermediate_size: int = 1792,
-        num_hidden_layers: int = 32,
-        pad_token_id: int = 0,
-        bos_token_id: int = 1,
-        eos_token_id: int = 2,
-        tie_word_embeddings: bool = True,
-        rope_parameters: RopeParameters = None,
-        max_position_embeddings: int = 128_000,
-        initializer_range: float = 0.02,
-        use_cache: bool = True,
-        norm_eps: float = 0.00001,
-        num_attention_heads: int = 32,
-        num_key_value_heads: int = 8,
-        conv_bias: bool = False,
-        conv_L_cache: int = 3,
-        num_dense_layers: int = 2,
-        num_experts_per_tok: int = 4,
-        num_experts: int = 32,
-        use_expert_bias: bool = True,
-        routed_scaling_factor: float = 1.0,
-        norm_topk_prob: bool = True,
-        layer_types: list[str] | None = None,
-        **kwargs,
-    ):
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.intermediate_size = intermediate_size
-        self.num_hidden_layers = num_hidden_layers
-        self.max_position_embeddings = max_position_embeddings
-        self.initializer_range = initializer_range
-        self.use_cache = use_cache
-        self.norm_eps = norm_eps
-
-        # attn operator config
-        self.num_attention_heads = num_attention_heads
-        self.num_key_value_heads = num_key_value_heads
-
-        # custom operator config
-        self.conv_bias = conv_bias
-        self.conv_L_cache = conv_L_cache
-
-        # moe config
-        self.num_dense_layers = num_dense_layers
-        self.moe_intermediate_size = moe_intermediate_size
-        self.num_experts_per_tok = num_experts_per_tok
-        self.num_experts = num_experts
-        self.use_expert_bias = use_expert_bias
-        self.routed_scaling_factor = routed_scaling_factor
-        self.norm_topk_prob = norm_topk_prob
-        self.layer_types = layer_types
-        self.initializer_range = initializer_range
+    vocab_size: int = 65536
+    hidden_size: int = 2048
+    intermediate_size: int = 7168
+    moe_intermediate_size: int = 1792
+    num_hidden_layers: int = 32
+    pad_token_id: int | None = 0
+    bos_token_id: int | None = 1
+    eos_token_id: int | list[int] | None = 2
+    tie_word_embeddings: bool = True
+    rope_parameters: dict | None = None
+    max_position_embeddings: int = 128_000
+    initializer_range: float = 0.02
+    use_cache: bool = True
+    norm_eps: float = 0.00001
+    num_attention_heads: int = 32
+    num_key_value_heads: int = 8
+    conv_bias: bool = False
+    conv_L_cache: int = 3
+    num_dense_layers: int = 2
+    num_experts_per_tok: int = 4
+    num_experts: int = 32
+    use_expert_bias: bool = True
+    routed_scaling_factor: float = 1.0
+    norm_topk_prob: bool = True
+    layer_types: list[str] | None = None
 
-        self.rope_parameters = rope_parameters
-        tie_word_embeddings = kwargs.get("tie_embedding", tie_word_embeddings)  # to fit original config keys
-        self.tie_word_embeddings = tie_word_embeddings
-        self.pad_token_id = pad_token_id
-        self.bos_token_id = bos_token_id
-        self.eos_token_id = eos_token_id
-        super().__init__(**kwargs)
+    def __post_init__(self, **kwargs):
+        self.tie_word_embeddings = kwargs.pop("tie_embedding", self.tie_word_embeddings)
+        super().__post_init__(**kwargs)
 
 
 __all__ = ["Lfm2MoeConfig"]
diff --git a/src/transformers/models/lfm2_vl/configuration_lfm2_vl.py b/src/transformers/models/lfm2_vl/configuration_lfm2_vl.py
index ad02ae4de9a7..7f2ad855d539 100755
--- a/src/transformers/models/lfm2_vl/configuration_lfm2_vl.py
+++ b/src/transformers/models/lfm2_vl/configuration_lfm2_vl.py
@@ -13,15 +13,15 @@
 # limitations under the License.
 """PyTorch LFM2-VL model."""
 
+from huggingface_hub.dataclasses import strict
+
 from ...configuration_utils import PreTrainedConfig
-from ...utils import auto_docstring, logging
+from ...utils import auto_docstring
 from ..auto import CONFIG_MAPPING, AutoConfig
 
 
-logger = logging.get_logger(__name__)
-
-
 @auto_docstring(checkpoint="LiquidAI/LFM2-VL-1.6B")
+@strict(accept_kwargs=True)
 class Lfm2VlConfig(PreTrainedConfig):
     r"""
     downsample_factor (`int`, *optional*, defaults to 2):
@@ -35,43 +35,31 @@ class Lfm2VlConfig(PreTrainedConfig):
 
     model_type = "lfm2_vl"
     sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig}
 
-    def __init__(
-        self,
-        vision_config=None,
-        text_config=None,
-        image_token_id=396,
-        projector_hidden_act="gelu",
-        projector_hidden_size=2560,
-        projector_bias=True,
-        projector_use_layernorm=True,
-        downsample_factor=2,
-        tie_word_embeddings=True,
-        **kwargs,
-    ):
-        self.image_token_id = image_token_id
-        self.projector_hidden_act = projector_hidden_act
-        self.projector_hidden_size = projector_hidden_size
-        self.projector_bias = projector_bias
-        self.projector_use_layernorm = projector_use_layernorm
-        self.downsample_factor = downsample_factor
-
-        if isinstance(vision_config, dict):
-            vision_config["model_type"] = vision_config.get("model_type", "siglip2_vision_model")
-            vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
-        elif vision_config is None:
-            vision_config = CONFIG_MAPPING["siglip2_vision_model"]()
+    vision_config: dict | PreTrainedConfig | None = None
+    text_config: dict | PreTrainedConfig | None = None
+    image_token_id: int = 396
+    projector_hidden_act: str = "gelu"
+    projector_hidden_size: int = 2560
+    projector_bias: bool = True
+    projector_use_layernorm: bool = True
+    downsample_factor: int = 2
+    tie_word_embeddings: bool = True
 
-        if isinstance(text_config, dict):
-            text_config["model_type"] = text_config.get("model_type", "lfm2")
-            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
-        elif text_config is None:
-            text_config = CONFIG_MAPPING["lfm2"]()
+    def __post_init__(self, **kwargs):
+        if isinstance(self.vision_config, dict):
+            self.vision_config["model_type"] = self.vision_config.get("model_type", "siglip2_vision_model")
+            self.vision_config = CONFIG_MAPPING[self.vision_config["model_type"]](**self.vision_config)
+        elif self.vision_config is None:
+            self.vision_config = CONFIG_MAPPING["siglip2_vision_model"]()
 
-        self.vision_config = vision_config
-        self.text_config = text_config
-        self.tie_word_embeddings = getattr(text_config, "tie_embedding", tie_word_embeddings)
+        if isinstance(self.text_config, dict):
+            self.text_config["model_type"] = self.text_config.get("model_type", "lfm2")
+            self.text_config = CONFIG_MAPPING[self.text_config["model_type"]](**self.text_config)
+        elif self.text_config is None:
+            self.text_config = CONFIG_MAPPING["lfm2"]()
 
-        super().__init__(**kwargs)
+        self.tie_word_embeddings = kwargs.pop("tie_embedding", self.tie_word_embeddings)
+        super().__post_init__(**kwargs)
 
 
 __all__ = ["Lfm2VlConfig"]
diff --git a/src/transformers/models/lightglue/configuration_lightglue.py b/src/transformers/models/lightglue/configuration_lightglue.py
index 515f8a850a66..989b77ddef18 100644
--- a/src/transformers/models/lightglue/configuration_lightglue.py
+++ b/src/transformers/models/lightglue/configuration_lightglue.py
@@ -18,6 +18,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from huggingface_hub.dataclasses import strict
 from ...configuration_utils import PreTrainedConfig
 from ...utils import auto_docstring
@@ -26,6 +27,7 @@
 
 
 @auto_docstring(checkpoint="ETH-CVG/lightglue_superpoint")
+@strict(accept_kwargs=True)
 class LightGlueConfig(PreTrainedConfig):
     r"""
     keypoint_detector_config (`Union[AutoConfig, dict]`, *optional*, defaults to `SuperPointConfig`):
@@ -59,70 +61,50 @@ class LightGlueConfig(PreTrainedConfig):
 
     model_type = "lightglue"
     sub_configs = {"keypoint_detector_config": AutoConfig}
 
-    def __init__(
-        self,
-        keypoint_detector_config: SuperPointConfig = None,
-        descriptor_dim: int = 256,
-        num_hidden_layers: int = 9,
-        num_attention_heads: int = 4,
-        num_key_value_heads=None,
-        depth_confidence: float = 0.95,
-        width_confidence: float = 0.99,
-        filter_threshold: float = 0.1,
-        initializer_range: float = 0.02,
-        hidden_act: str = "gelu",
-        attention_dropout=0.0,
-        attention_bias=True,
-        trust_remote_code: bool = False,
-        **kwargs,
-    ):
-        # LightGlue can be used with other models than SuperPoint as keypoint detector
-        # We provide the trust_remote_code argument to allow the use of other models
-        # that are not registered in the CONFIG_MAPPING dictionary (for example DISK)
-        self.trust_remote_code = trust_remote_code
-
-        if descriptor_dim % num_attention_heads != 0:
-            raise ValueError("descriptor_dim % num_heads is different from zero")
-
-        self.descriptor_dim = descriptor_dim
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-
-        # for backward compatibility
-        if num_key_value_heads is None:
-            num_key_value_heads = num_attention_heads
-
-        self.num_key_value_heads = num_key_value_heads
-
-        self.depth_confidence = depth_confidence
-        self.width_confidence = width_confidence
-        self.filter_threshold = filter_threshold
-        self.initializer_range = initializer_range
+    keypoint_detector_config: dict | SuperPointConfig | None = None
+    descriptor_dim: int = 256
+    num_hidden_layers: int = 9
+    num_attention_heads: int = 4
+    num_key_value_heads: int | None = None
+    depth_confidence: float = 0.95
+    width_confidence: float = 0.99
+    filter_threshold: float = 0.1
+    initializer_range: float = 0.02
+    hidden_act: str = "gelu"
+    attention_dropout: float | int = 0.0
+    attention_bias: bool = True
+    # LightGlue can be used with other models than SuperPoint as keypoint detector
+    # We provide the trust_remote_code argument to allow the use of other models
+    # that are not registered in the CONFIG_MAPPING dictionary (for example DISK)
+    trust_remote_code: bool = False
+
+    def __post_init__(self, **kwargs):
+        if self.num_key_value_heads is None:
+            self.num_key_value_heads = self.num_attention_heads
 
         # Keypoint Detector is forced into eager attention mode because SuperPoint does not have Attention
         # See https://github.com/huggingface/transformers/pull/31718#discussion_r2109733153
-        if isinstance(keypoint_detector_config, dict):
-            keypoint_detector_config["model_type"] = keypoint_detector_config.get("model_type", "superpoint")
-            if keypoint_detector_config["model_type"] not in CONFIG_MAPPING:
-                keypoint_detector_config = AutoConfig.from_pretrained(
-                    keypoint_detector_config["_name_or_path"], trust_remote_code=self.trust_remote_code
+        if isinstance(self.keypoint_detector_config, dict):
+            self.keypoint_detector_config["model_type"] = self.keypoint_detector_config.get("model_type", "superpoint")
+            if self.keypoint_detector_config["model_type"] not in CONFIG_MAPPING:
+                self.keypoint_detector_config = AutoConfig.from_pretrained(
+                    self.keypoint_detector_config["_name_or_path"], trust_remote_code=self.trust_remote_code
                 )
             else:
-                keypoint_detector_config = CONFIG_MAPPING[keypoint_detector_config["model_type"]](
-                    **keypoint_detector_config, attn_implementation="eager"
+                self.keypoint_detector_config = CONFIG_MAPPING[self.keypoint_detector_config["model_type"]](
+                    **self.keypoint_detector_config, attn_implementation="eager"
                 )
+        elif self.keypoint_detector_config is None:
+            self.keypoint_detector_config = CONFIG_MAPPING["superpoint"](attn_implementation="eager")
 
-        if keypoint_detector_config is None:
-            keypoint_detector_config = CONFIG_MAPPING["superpoint"](attn_implementation="eager")
-
-        self.keypoint_detector_config = keypoint_detector_config
+        self.intermediate_size = self.descriptor_dim * 2
+        self.hidden_size = self.descriptor_dim
+        super().__post_init__(**kwargs)
 
-        self.hidden_size = descriptor_dim
-        self.intermediate_size = descriptor_dim * 2
-        self.hidden_act = hidden_act
-        self.attention_dropout = attention_dropout
-        self.attention_bias = attention_bias
-        super().__init__(**kwargs)
+    def validate_architecture(self):
+        """Part of `@strict`-powered validation. Validates the architecture of the config."""
+        if self.descriptor_dim % self.num_attention_heads != 0:
+            raise ValueError("`descriptor_dim` must be divisible by `num_attention_heads`")
 
 
 __all__ = ["LightGlueConfig"]
diff --git a/src/transformers/models/lightglue/modular_lightglue.py b/src/transformers/models/lightglue/modular_lightglue.py
index 4eb43222ca5a..dd34c3698721 100644
--- a/src/transformers/models/lightglue/modular_lightglue.py
+++ b/src/transformers/models/lightglue/modular_lightglue.py
@@ -16,6 +16,7 @@
 import numpy as np
 import torch
+from huggingface_hub.dataclasses import strict
 from torch import nn
 from torch.nn.utils.rnn import pad_sequence
@@ -41,6 +42,7 @@
 
 
 @auto_docstring(checkpoint="ETH-CVG/lightglue_superpoint")
+@strict(accept_kwargs=True)
 class LightGlueConfig(PreTrainedConfig):
     r"""
     keypoint_detector_config (`Union[AutoConfig, dict]`, *optional*, defaults to `SuperPointConfig`):
@@ -74,70 +76,50 @@ class LightGlueConfig(PreTrainedConfig):
 
     model_type = "lightglue"
     sub_configs = {"keypoint_detector_config": AutoConfig}
 
-    def __init__(
-        self,
-        keypoint_detector_config: SuperPointConfig = None,
-        descriptor_dim: int = 256,
-        num_hidden_layers: int = 9,
-        num_attention_heads: int = 4,
-        num_key_value_heads=None,
-        depth_confidence: float = 0.95,
-        width_confidence: float = 0.99,
-        filter_threshold: float = 0.1,
-        initializer_range: float = 0.02,
-        hidden_act: str = "gelu",
-        attention_dropout=0.0,
-        attention_bias=True,
-        trust_remote_code: bool = False,
-        **kwargs,
-    ):
-        # LightGlue can be used with other models than SuperPoint as keypoint detector
-        # We provide the trust_remote_code argument to allow the use of other models
-        # that are not registered in the CONFIG_MAPPING dictionary (for example DISK)
-        self.trust_remote_code = trust_remote_code
-
-        if descriptor_dim % num_attention_heads != 0:
-            raise ValueError("descriptor_dim % num_heads is different from zero")
-
-        self.descriptor_dim = descriptor_dim
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-
-        # for backward compatibility
-        if num_key_value_heads is None:
-            num_key_value_heads = num_attention_heads
-
-        self.num_key_value_heads = num_key_value_heads
-
-        self.depth_confidence = depth_confidence
-        self.width_confidence = width_confidence
-        self.filter_threshold = filter_threshold
-        self.initializer_range = initializer_range
+    keypoint_detector_config: dict | SuperPointConfig | None = None
+    descriptor_dim: int = 256
+    num_hidden_layers: int = 9
+    num_attention_heads: int = 4
+    num_key_value_heads: int | None = None
+    depth_confidence: float = 0.95
+    width_confidence: float = 0.99
+    filter_threshold: float = 0.1
+    initializer_range: float = 0.02
+    hidden_act: str = "gelu"
+    attention_dropout: float | int = 0.0
+    attention_bias: bool = True
+    # LightGlue can be used with other models than SuperPoint as keypoint detector
+    # We provide the trust_remote_code argument to allow the use of other models
+    # that are not registered in the CONFIG_MAPPING dictionary (for example DISK)
+    trust_remote_code: bool = False
+
+    def __post_init__(self, **kwargs):
+        if self.num_key_value_heads is None:
+            self.num_key_value_heads = self.num_attention_heads
 
         # Keypoint Detector is forced into eager attention mode because SuperPoint does not have Attention
         # See https://github.com/huggingface/transformers/pull/31718#discussion_r2109733153
-        if isinstance(keypoint_detector_config, dict):
-            keypoint_detector_config["model_type"] = keypoint_detector_config.get("model_type", "superpoint")
-            if keypoint_detector_config["model_type"] not in CONFIG_MAPPING:
-                keypoint_detector_config = AutoConfig.from_pretrained(
-                    keypoint_detector_config["_name_or_path"], trust_remote_code=self.trust_remote_code
+        if isinstance(self.keypoint_detector_config, dict):
+            self.keypoint_detector_config["model_type"] = self.keypoint_detector_config.get("model_type", "superpoint")
+            if self.keypoint_detector_config["model_type"] not in CONFIG_MAPPING:
+                self.keypoint_detector_config = AutoConfig.from_pretrained(
+                    self.keypoint_detector_config["_name_or_path"], trust_remote_code=self.trust_remote_code
                )
             else:
-                keypoint_detector_config = CONFIG_MAPPING[keypoint_detector_config["model_type"]](
-                    **keypoint_detector_config, attn_implementation="eager"
+                self.keypoint_detector_config = CONFIG_MAPPING[self.keypoint_detector_config["model_type"]](
+                    **self.keypoint_detector_config, attn_implementation="eager"
                 )
+        elif self.keypoint_detector_config is None:
+            self.keypoint_detector_config = CONFIG_MAPPING["superpoint"](attn_implementation="eager")
 
-        if keypoint_detector_config is None:
-            keypoint_detector_config = CONFIG_MAPPING["superpoint"](attn_implementation="eager")
+        self.intermediate_size = self.descriptor_dim * 2
+        self.hidden_size = self.descriptor_dim
+        super().__post_init__(**kwargs)
 
-        self.keypoint_detector_config = keypoint_detector_config
-
-        self.hidden_size = descriptor_dim
-        self.intermediate_size = descriptor_dim * 2
-        self.hidden_act = hidden_act
-        self.attention_dropout = attention_dropout
-        self.attention_bias = attention_bias
-        super().__init__(**kwargs)
+    def validate_architecture(self):
+        """Part of `@strict`-powered validation. Validates the architecture of the config."""
+        if self.descriptor_dim % self.num_attention_heads != 0:
+            raise ValueError("`descriptor_dim` must be divisible by `num_attention_heads`")
 
 
 @dataclass
diff --git a/src/transformers/models/lighton_ocr/configuration_lighton_ocr.py b/src/transformers/models/lighton_ocr/configuration_lighton_ocr.py
index ce7d633a6f8a..b3c2b8a96b4e 100644
--- a/src/transformers/models/lighton_ocr/configuration_lighton_ocr.py
+++ b/src/transformers/models/lighton_ocr/configuration_lighton_ocr.py
@@ -17,14 +17,16 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Any
-from ...configuration_utils import PretrainedConfig
+from huggingface_hub.dataclasses import strict
+
+from ...configuration_utils import PreTrainedConfig, PretrainedConfig
 from ...utils import auto_docstring
 from ..auto import CONFIG_MAPPING, AutoConfig
 
 
 @auto_docstring(checkpoint="lightonai/LightOnOCR-1B-1025")
+@strict(accept_kwargs=True)
 class LightOnOcrConfig(PretrainedConfig):
     r"""
     Example:
@@ -46,22 +48,16 @@ class LightOnOcrConfig(PretrainedConfig):
 
     model_type = "lighton_ocr"
     sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig}
 
-    def __init__(
-        self,
-        spatial_merge_size: int = 2,
-        image_token_id: int = 151655,
-        tie_word_embeddings: bool = True,
-        vision_config: dict[str, Any] | None = None,
-        text_config: dict[str, Any] | None = None,
-        **kwargs,
-    ):
-        self.spatial_merge_size = spatial_merge_size
-        self.image_token_id = image_token_id
-        self.tie_word_embeddings = tie_word_embeddings
+    spatial_merge_size: int = 2
+    image_token_id: int = 151655
+    tie_word_embeddings: bool = True
+    vision_config: dict | PreTrainedConfig | None = None
+    text_config: dict | PreTrainedConfig | None = None
 
-        if vision_config is None:
+    def __post_init__(self, **kwargs):
+        if self.vision_config is None:
             self.vision_config = CONFIG_MAPPING["pixtral"](
-                attention_dropout=0,
+                attention_dropout=0.0,
                 head_dim=64,
                 hidden_act="silu",
                 hidden_size=1024,
@@ -75,15 +71,13 @@ def __init__(
                 patch_size=14,
                 rope_theta=10000,
             )
-        elif isinstance(vision_config, PretrainedConfig):
-            self.vision_config = vision_config
-        else:
-            vision_config["model_type"] = vision_config.get("model_type", "pixtral")
-            self.vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
+        elif isinstance(self.vision_config, dict):
+            self.vision_config["model_type"] = self.vision_config.get("model_type", "pixtral")
+            self.vision_config = CONFIG_MAPPING[self.vision_config["model_type"]](**self.vision_config)
 
-        if text_config is None:
+        if self.text_config is None:
             self.text_config = CONFIG_MAPPING["qwen3"](
-                attention_dropout=0,
+                attention_dropout=0.0,
                 head_dim=128,
                 hidden_act="silu",
                 hidden_size=1024,
@@ -99,13 +93,11 @@ def __init__(
                 use_cache=True,
                 vocab_size=151936,
             )
-        elif isinstance(text_config, PretrainedConfig):
-            self.text_config = text_config
-        else:
-            text_config["model_type"] = text_config.get("model_type", "qwen3")
-            self.text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
+        elif isinstance(self.text_config, dict):
+            self.text_config["model_type"] = self.text_config.get("model_type", "qwen3")
+            self.text_config = CONFIG_MAPPING[self.text_config["model_type"]](**self.text_config)
 
-        super().__init__(**kwargs)
+        super().__post_init__(**kwargs)
 
 
 __all__ = ["LightOnOcrConfig"]
diff --git a/src/transformers/models/lighton_ocr/modeling_lighton_ocr.py b/src/transformers/models/lighton_ocr/modeling_lighton_ocr.py
index b1d7a96dfdc4..37aa2f61f0ed 100644
--- a/src/transformers/models/lighton_ocr/modeling_lighton_ocr.py
+++ b/src/transformers/models/lighton_ocr/modeling_lighton_ocr.py
@@ -17,6 +17,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 from dataclasses import dataclass
 
 import torch
diff --git a/src/transformers/models/lighton_ocr/modular_lighton_ocr.py b/src/transformers/models/lighton_ocr/modular_lighton_ocr.py
index c5c73bf06cf7..06995a1ca5f4 100644
--- a/src/transformers/models/lighton_ocr/modular_lighton_ocr.py
+++ b/src/transformers/models/lighton_ocr/modular_lighton_ocr.py
@@ -11,14 +11,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Any
 
 import numpy as np
 import torch
+from huggingface_hub.dataclasses import strict
 from torch import nn
 
 from ...cache_utils import Cache
-from ...configuration_utils import PretrainedConfig
+from ...configuration_utils import PreTrainedConfig, PretrainedConfig
 from ...feature_extraction_utils import BatchFeature
 from ...image_utils import ImageInput
 from ...modeling_outputs import BaseModelOutputWithPooling
@@ -42,6 +42,7 @@
 
 
 @auto_docstring(checkpoint="lightonai/LightOnOCR-1B-1025")
+@strict(accept_kwargs=True)
 class LightOnOcrConfig(PretrainedConfig):
     r"""
     Example:
@@ -63,22 +64,16 @@ class LightOnOcrConfig(PretrainedConfig):
 
     model_type = "lighton_ocr"
     sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig}
 
-    def __init__(
-        self,
-        spatial_merge_size: int = 2,
-        image_token_id: int = 151655,
-        tie_word_embeddings: bool = True,
-        vision_config: dict[str, Any] | None = None,
-        text_config: dict[str, Any] | None = None,
-        **kwargs,
-    ):
-        self.spatial_merge_size = spatial_merge_size
-        self.image_token_id = image_token_id
-        self.tie_word_embeddings = tie_word_embeddings
+    spatial_merge_size: int = 2
+    image_token_id: int = 151655
+    tie_word_embeddings: bool = True
+    vision_config: dict | PreTrainedConfig | None = None
+    text_config: dict | PreTrainedConfig | None = None
 
-        if vision_config is None:
+    def __post_init__(self, **kwargs):
+        if self.vision_config is None:
             self.vision_config = CONFIG_MAPPING["pixtral"](
-                attention_dropout=0,
+                attention_dropout=0.0,
                 head_dim=64,
                 hidden_act="silu",
                 hidden_size=1024,
@@ -92,15 +87,13 @@ def __init__(
                 patch_size=14,
                 rope_theta=10000,
             )
-        elif isinstance(vision_config, PretrainedConfig):
-            self.vision_config = vision_config
-        else:
-            vision_config["model_type"] = vision_config.get("model_type", "pixtral")
-            self.vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
+        elif isinstance(self.vision_config, dict):
+            self.vision_config["model_type"] = self.vision_config.get("model_type", "pixtral")
+            self.vision_config = CONFIG_MAPPING[self.vision_config["model_type"]](**self.vision_config)
 
-        if text_config is None:
+        if self.text_config is None:
             self.text_config = CONFIG_MAPPING["qwen3"](
-                attention_dropout=0,
+                attention_dropout=0.0,
                 head_dim=128,
                 hidden_act="silu",
                 hidden_size=1024,
@@ -116,13 +109,11 @@ def __init__(
                 use_cache=True,
                 vocab_size=151936,
             )
-        elif isinstance(text_config, PretrainedConfig):
-            self.text_config = text_config
-        else:
-            text_config["model_type"] = text_config.get("model_type", "qwen3")
-            self.text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
+        elif isinstance(self.text_config, dict):
+            self.text_config["model_type"] = self.text_config.get("model_type", "qwen3")
+            self.text_config = CONFIG_MAPPING[self.text_config["model_type"]](**self.text_config)
 
-        super().__init__(**kwargs)
+        super().__post_init__(**kwargs)
 
 
 class LightOnOcrProcessorKwargs(ProcessingKwargs, total=False):
a/src/transformers/models/lilt/configuration_lilt.py b/src/transformers/models/lilt/configuration_lilt.py index 223c98643583..998dc879c7a4 100644 --- a/src/transformers/models/lilt/configuration_lilt.py +++ b/src/transformers/models/lilt/configuration_lilt.py @@ -13,14 +13,14 @@ # limitations under the License. """LiLT configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="SCUT-DLVCLab/lilt-roberta-en-base") +@strict(accept_kwargs=True) class LiltConfig(PreTrainedConfig): r""" channel_shrink_ratio (`int`, *optional*, defaults to 4): @@ -44,48 +44,24 @@ class LiltConfig(PreTrainedConfig): model_type = "lilt" - def __init__( - self, - vocab_size=30522, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=2, - initializer_range=0.02, - layer_norm_eps=1e-12, - pad_token_id=0, - bos_token_id=None, - eos_token_id=None, - classifier_dropout=None, - channel_shrink_ratio=4, - max_2d_position_embeddings=1024, - **kwargs, - ): - super().__init__(**kwargs) - - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.classifier_dropout = classifier_dropout - self.channel_shrink_ratio = channel_shrink_ratio - self.max_2d_position_embeddings = max_2d_position_embeddings + vocab_size: int = 30522 + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.1 + attention_probs_dropout_prob: float = 0.1 + max_position_embeddings: int = 512 + type_vocab_size: int = 2 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + pad_token_id: int | None = 0 + bos_token_id: int | None = None + eos_token_id: int | list[int] | None = None + classifier_dropout: float | int | None = None + channel_shrink_ratio: int = 4 + max_2d_position_embeddings: int = 1024 __all__ = ["LiltConfig"] diff --git a/src/transformers/models/lilt/modeling_lilt.py b/src/transformers/models/lilt/modeling_lilt.py index 8961f391c5af..ca48a9fe8760 100644 --- a/src/transformers/models/lilt/modeling_lilt.py +++ b/src/transformers/models/lilt/modeling_lilt.py @@ -572,7 +572,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the 
same time") @@ -701,7 +701,7 @@ def forward( >>> predicted_class_idx = outputs.logits.argmax(-1).item() >>> predicted_class = model.config.id2label[predicted_class_idx] ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.lilt( input_ids, @@ -814,7 +814,7 @@ def forward( >>> outputs = model(**encoding) >>> predicted_class_indices = outputs.logits.argmax(-1) ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.lilt( input_ids, @@ -935,7 +935,7 @@ def forward( >>> predict_answer_tokens = encoding.input_ids[0, answer_start_index : answer_end_index + 1] >>> predicted_answer = tokenizer.decode(predict_answer_tokens) ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.lilt( input_ids, diff --git a/src/transformers/models/llama/configuration_llama.py b/src/transformers/models/llama/configuration_llama.py index 324208450b42..0e8ad44bf33e 100644 --- a/src/transformers/models/llama/configuration_llama.py +++ b/src/transformers/models/llama/configuration_llama.py @@ -18,12 +18,16 @@ # limitations under the License. """LLaMA model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring +from ...utils.type_validators import interval @auto_docstring(checkpoint="meta-llama/Llama-2-7b-hf") +@strict(accept_kwargs=True) class LlamaConfig(PreTrainedConfig): r""" ```python @@ -57,59 +61,43 @@ class LlamaConfig(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size: int | None = 32000, - hidden_size: int | None = 4096, - intermediate_size: int | None = 11008, - num_hidden_layers: int | None = 32, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = None, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 2048, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-6, - use_cache: bool | None = True, - pad_token_id: int | None = None, - bos_token_id: int | None = 1, - eos_token_id: int | None = 2, - pretraining_tp: int | None = 1, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - mlp_bias: bool | None = False, - head_dim: int | None = None, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads + vocab_size: int = 32000 + hidden_size: int = 4096 + intermediate_size: int = 11008 + num_hidden_layers: int = 32 + num_attention_heads: int = 32 + num_key_value_heads: int | None = None + hidden_act: str = "silu" + max_position_embeddings: int = 2048 + initializer_range: float = interval(min=0.0, max=1.0)(default=0.02) + rms_norm_eps: float = 1e-6 + use_cache: bool = True + pad_token_id: int | None = None + bos_token_id: int | None = 1 + eos_token_id: int | list[int] | None 
= 2 + pretraining_tp: int | None = 1 + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + attention_dropout: int | float | None = 0.0 + mlp_bias: bool = False + head_dim: int | None = None - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads + def __post_init__(self, **kwargs): + if self.head_dim is None: + self.head_dim = self.hidden_size // self.num_attention_heads + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.pretraining_tp = pretraining_tp - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.mlp_bias = mlp_bias - self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads - self.rope_parameters = rope_parameters + super().__post_init__(**kwargs) - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - super().__init__(**kwargs) + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + if self.hidden_size % self.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({self.hidden_size}) is not a multiple of the number of attention " + f"heads ({self.num_attention_heads})." + ) __all__ = ["LlamaConfig"] diff --git a/src/transformers/models/llama4/configuration_llama4.py b/src/transformers/models/llama4/configuration_llama4.py index 9cb2eb3f8108..10a3361861db 100644 --- a/src/transformers/models/llama4/configuration_llama4.py +++ b/src/transformers/models/llama4/configuration_llama4.py @@ -14,7 +14,9 @@ # limitations under the License. 
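
# Editor's note -- illustrative sketch, not part of the diff. It shows the kind of
# validation the rewritten LlamaConfig above is meant to provide: __post_init__
# fills head_dim and num_key_value_heads from the other fields, and
# validate_architecture (described in its docstring as part of the @strict-powered
# validation pass) rejects head counts that do not divide the hidden size. The
# exact exception raised for an out-of-range initializer_range depends on the
# huggingface_hub strict-dataclass implementation, so it is caught broadly here.
from transformers import LlamaConfig

config = LlamaConfig(hidden_size=4096, num_attention_heads=32)
assert config.head_dim == 4096 // 32           # derived in __post_init__
assert config.num_key_value_heads == 32        # defaults to num_attention_heads

try:
    LlamaConfig(hidden_size=4097, num_attention_heads=32)
except ValueError as err:
    print(err)  # hidden size not a multiple of the number of attention heads

try:
    LlamaConfig(initializer_range=3.0)  # outside interval(min=0.0, max=1.0)
except (ValueError, TypeError) as err:
    print(err)
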
-from ...configuration_utils import PreTrainedConfig, layer_type_validation +from huggingface_hub.dataclasses import strict + +from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring, logging @@ -23,6 +25,7 @@ @auto_docstring(checkpoint="meta-llama/Llama-4-Scout-17B-16E") +@strict(accept_kwargs=True) class Llama4VisionConfig(PreTrainedConfig): r""" vision_output_dim (`int`, *optional*, defaults to 7680): @@ -50,54 +53,29 @@ class Llama4VisionConfig(PreTrainedConfig): model_type = "llama4_vision_model" base_config_key = "vision_config" - def __init__( - self, - hidden_size: int | None = 768, - hidden_act: str | None = "gelu", - num_hidden_layers: int | None = 34, - num_attention_heads: int | None = 16, - num_channels: int | None = 3, - intermediate_size: int | None = 5632, - vision_output_dim: int | None = 7680, - image_size: int | None = 448, - patch_size: int | None = 14, - norm_eps: float | None = 1e-5, - vision_feature_select_strategy: str | None = "default", - initializer_range: float | None = 0.02, - pixel_shuffle_ratio: float | None = 0.5, - projector_input_dim: int | None = 4096, - projector_output_dim: int | None = 4096, - multi_modal_projector_bias: bool | None = False, - projector_dropout: float | None = 0.0, - attention_dropout: float | None = 0.0, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - **kwargs, - ): - self.hidden_size = hidden_size - self.hidden_act = hidden_act - self.num_hidden_layers = num_hidden_layers - self.num_channels = num_channels - self.intermediate_size = intermediate_size - self.image_size = image_size - self.vision_output_dim = vision_output_dim - self.patch_size = patch_size - self.norm_eps = norm_eps - self.num_attention_heads = num_attention_heads - self.initializer_range = initializer_range - self.pixel_shuffle_ratio = pixel_shuffle_ratio - self.projector_input_dim = projector_input_dim - self.projector_output_dim = projector_output_dim - self.multi_modal_projector_bias = multi_modal_projector_bias - self.projector_dropout = projector_dropout - self.attention_dropout = attention_dropout - self.vision_feature_select_strategy = vision_feature_select_strategy - - self.rope_parameters = rope_parameters - - super().__init__(**kwargs) + hidden_size: int = 768 + hidden_act: str = "gelu" + num_hidden_layers: int = 34 + num_attention_heads: int = 16 + num_channels: int = 3 + intermediate_size: int = 5632 + vision_output_dim: int = 7680 + image_size: int | list[int] | tuple[int, int] = 448 + patch_size: int | list[int] | tuple[int, int] = 14 + norm_eps: float = 1e-5 + vision_feature_select_strategy: str = "default" + initializer_range: float = 0.02 + pixel_shuffle_ratio: float = 0.5 + projector_input_dim: int = 4096 + projector_output_dim: int = 4096 + multi_modal_projector_bias: bool = False + projector_dropout: float | int = 0.0 + attention_dropout: float | int = 0.0 + rope_parameters: RopeParameters | dict | None = None @auto_docstring(checkpoint="meta-llama/Llama-4-Scout-17B-16E") +@strict(accept_kwargs=True) class Llama4TextConfig(PreTrainedConfig): r""" intermediate_size_mlp (`int`, *optional*, defaults to 16384): @@ -158,114 +136,74 @@ class Llama4TextConfig(PreTrainedConfig): "layers.*.feed_forward.router": "ep_router", } - def __init__( - self, - vocab_size=202048, - hidden_size=5120, - intermediate_size=8192, - intermediate_size_mlp=16384, - num_hidden_layers=48, - num_attention_heads=40, - num_key_value_heads=8, - head_dim=128, - 
hidden_act="silu", - max_position_embeddings=4096 * 32, - initializer_range=0.02, - rms_norm_eps=1e-5, - use_cache=True, - pad_token_id=None, - bos_token_id=1, - eos_token_id=2, - tie_word_embeddings=False, - attention_dropout=0.0, - num_experts_per_tok=1, - num_local_experts=16, - moe_layers=None, - interleave_moe_layer_step=1, - use_qk_norm=True, - output_router_logits=False, - router_aux_loss_coef=0.001, - router_jitter_noise=0.0, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - no_rope_layers=None, - no_rope_layer_interval=4, - attention_chunk_size=8192, - layer_types=None, - attn_temperature_tuning=True, - floor_scale=8192, - attn_scale=0.1, - **kwargs, - ): - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.attn_temperature_tuning = attn_temperature_tuning - self.attn_scale = attn_scale - self.floor_scale = floor_scale - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.intermediate_size_mlp = intermediate_size_mlp - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.attention_bias = False - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_dropout = attention_dropout - self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads - self.use_qk_norm = use_qk_norm - self.num_experts_per_tok = num_experts_per_tok - self.num_local_experts = num_local_experts - - self.output_router_logits = output_router_logits - self.router_aux_loss_coef = router_aux_loss_coef - self.router_jitter_noise = router_jitter_noise - - # Backwards compatibility - if no_rope_layers == []: - no_rope_layers = None + vocab_size: int = 202048 + hidden_size: int = 5120 + intermediate_size: int = 8192 + intermediate_size_mlp: int = 16384 + num_hidden_layers: int = 48 + num_attention_heads: int = 40 + num_key_value_heads: int = 8 + head_dim: int = 128 + hidden_act: str = "silu" + max_position_embeddings: int = 4096 * 32 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-5 + use_cache: bool = True + pad_token_id: int | None = None + bos_token_id: int | None = 1 + eos_token_id: int | None = 2 + tie_word_embeddings: bool = False + attention_dropout: float | int = 0.0 + num_experts_per_tok: int = 1 + num_local_experts: int = 16 + moe_layers: list[int] | None = None + interleave_moe_layer_step: int = 1 + use_qk_norm: bool = True + output_router_logits: bool = False + router_aux_loss_coef: float = 0.001 + router_jitter_noise: float = 0.0 + rope_parameters: RopeParameters | dict | None = None + no_rope_layers: list[int] | None = None + no_rope_layer_interval: int = 4 + attention_chunk_size: int = 8192 + layer_types: list[int] | None = None + attn_temperature_tuning: bool = True + floor_scale: int = 8192 + attn_scale: float = 0.1 + attention_bias: bool = False + + def __post_init__(self, **kwargs): + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads default_no_rope_layers = [ - int((layer_idx + 1) % no_rope_layer_interval != 0) for layer_idx in range(self.num_hidden_layers) + 
int((layer_idx + 1) % self.no_rope_layer_interval != 0) for layer_idx in range(self.num_hidden_layers) ] + self.no_rope_layers = self.no_rope_layers if self.no_rope_layers else default_no_rope_layers + self.head_dim = self.head_dim if self.head_dim is not None else self.hidden_size // self.num_attention_heads - self.no_rope_layers = no_rope_layers if no_rope_layers else default_no_rope_layers - - self.interleave_moe_layer_step = interleave_moe_layer_step self.moe_layers = ( - moe_layers - if moe_layers is not None + self.moe_layers + if self.moe_layers is not None else list( range( - interleave_moe_layer_step - 1, - num_hidden_layers, - interleave_moe_layer_step, + self.interleave_moe_layer_step - 1, + self.num_hidden_layers, + self.interleave_moe_layer_step, ) ) ) - self.attention_chunk_size = attention_chunk_size - self.layer_types = layer_types - if layer_types is None: + if self.layer_types is None: self.layer_types = [ "chunked_attention" if no_rope else "full_attention" for no_rope in self.no_rope_layers ] - layer_type_validation(self.layer_types, self.num_hidden_layers) - self.rope_parameters = rope_parameters - super().__init__(**kwargs) + super().__post_init__(**kwargs) @auto_docstring(checkpoint="meta-llama/Llama-4-Scout-17B-16E") +@strict(accept_kwargs=True) class Llama4Config(PreTrainedConfig): r""" boi_token_index (`int`, *optional*, defaults to 200080): @@ -297,37 +235,26 @@ class Llama4Config(PreTrainedConfig): "multi_modal_projector.linear_1": "colwise_rep", } - def __init__( - self, - vision_config=None, - text_config=None, - boi_token_index=200080, - eoi_token_index=200081, - image_token_index=200092, - tie_word_embeddings=False, - **kwargs, - ): - if vision_config is None: + vision_config: dict | PreTrainedConfig | None = None + text_config: dict | PreTrainedConfig | None = None + boi_token_index: int = 200080 + eoi_token_index: int = 200081 + image_token_index: int = 200092 + tie_word_embeddings: bool = False + + def __post_init__(self, **kwargs): + if self.vision_config is None: self.vision_config = Llama4VisionConfig() logger.info("vision_config is None, using default llama4 vision config") - elif isinstance(vision_config, dict): - self.vision_config = Llama4VisionConfig(**vision_config) - elif isinstance(vision_config, Llama4VisionConfig): - self.vision_config = vision_config + elif isinstance(self.vision_config, dict): + self.vision_config = Llama4VisionConfig(**self.vision_config) - self.boi_token_index = boi_token_index - self.eoi_token_index = eoi_token_index - self.image_token_index = image_token_index - if text_config is None: + if self.text_config is None: self.text_config = Llama4TextConfig() logger.info("text_config is None, using default llama4 text config") - elif isinstance(text_config, dict): - self.text_config = Llama4TextConfig(**text_config) - elif isinstance(text_config, Llama4TextConfig): - self.text_config = text_config - - self.tie_word_embeddings = tie_word_embeddings - super().__init__(**kwargs) + elif isinstance(self.text_config, dict): + self.text_config = Llama4TextConfig(**self.text_config) + super().__post_init__(**kwargs) __all__ = ["Llama4Config", "Llama4TextConfig", "Llama4VisionConfig"] diff --git a/src/transformers/models/llama4/modeling_llama4.py b/src/transformers/models/llama4/modeling_llama4.py index 4cbea1b98e91..314d19b75636 100644 --- a/src/transformers/models/llama4/modeling_llama4.py +++ b/src/transformers/models/llama4/modeling_llama4.py @@ -949,7 +949,7 @@ def forward( output_hidden_states = ( output_hidden_states if 
output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict encoder_states = () if output_hidden_states else None all_attentions = () if output_attentions else None @@ -1105,7 +1105,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict # num_concurrent_media and num_chunks are both currently 1 batch_size_times_num_tiles, num_channels, height, width = pixel_values.shape @@ -1302,7 +1302,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") diff --git a/src/transformers/models/llava/configuration_llava.py b/src/transformers/models/llava/configuration_llava.py index e9b0466fef77..caa3f396580a 100644 --- a/src/transformers/models/llava/configuration_llava.py +++ b/src/transformers/models/llava/configuration_llava.py @@ -12,15 +12,17 @@ # limitations under the License. """Llava model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging -from ..auto import CONFIG_MAPPING, AutoConfig +from typing import Literal +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring +from ..auto import CONFIG_MAPPING, AutoConfig @auto_docstring(checkpoint="llava-hf/llava-1.5-7b-hf") +@strict(accept_kwargs=True) class LlavaConfig(PreTrainedConfig): r""" Example: @@ -50,38 +52,22 @@ class LlavaConfig(PreTrainedConfig): } sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} - def __init__( - self, - vision_config=None, - text_config=None, - image_token_index=32000, - projector_hidden_act="gelu", - vision_feature_select_strategy="default", - vision_feature_layer=-2, - image_seq_length=576, - multimodal_projector_bias=True, - tie_word_embeddings=False, - **kwargs, - ): - self.image_token_index = image_token_index - self.projector_hidden_act = projector_hidden_act - self.image_seq_length = image_seq_length - self.tie_word_embeddings = tie_word_embeddings - - if vision_feature_select_strategy not in ["default", "full"]: - raise ValueError( - "vision_feature_select_strategy should be one of 'default', 'full'." 
- f"Got: {vision_feature_select_strategy}" - ) - - self.vision_feature_select_strategy = vision_feature_select_strategy - self.vision_feature_layer = vision_feature_layer - - if isinstance(vision_config, dict): - vision_config["model_type"] = vision_config.get("model_type", "clip_vision_model") - vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) - elif vision_config is None: - vision_config = CONFIG_MAPPING["clip_vision_model"]( + vision_config: dict | PreTrainedConfig | None = None + text_config: dict | PreTrainedConfig | None = None + image_token_index: int = 32000 + image_seq_length: int = 576 + projector_hidden_act: str = "gelu" + vision_feature_select_strategy: Literal["default", "full"] = "default" + vision_feature_layer: int | list[int] = -2 + multimodal_projector_bias: bool = True + tie_word_embeddings: bool = False + + def __post_init__(self, **kwargs): + if isinstance(self.vision_config, dict): + self.vision_config["model_type"] = self.vision_config.get("model_type", "clip_vision_model") + self.vision_config = CONFIG_MAPPING[self.vision_config["model_type"]](**self.vision_config) + elif self.vision_config is None: + self.vision_config = CONFIG_MAPPING["clip_vision_model"]( intermediate_size=4096, hidden_size=1024, patch_size=14, @@ -92,24 +78,19 @@ def __init__( projection_dim=768, ) - self.vision_config = vision_config - - if isinstance(text_config, dict): - text_config["model_type"] = text_config.get("model_type", "llama") - text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) - elif text_config is None: - text_config = CONFIG_MAPPING["llama"]() - - self.text_config = text_config - self.multimodal_projector_bias = multimodal_projector_bias + if isinstance(self.text_config, dict): + self.text_config["model_type"] = self.text_config.get("model_type", "llama") + self.text_config = CONFIG_MAPPING[self.text_config["model_type"]](**self.text_config) + elif self.text_config is None: + self.text_config = CONFIG_MAPPING["llama"]() # The default value is `False` but this config is used with many model types # Attr `tie_word_embeddings` was saved in text config for those models, so we # need an ugly workaround and forward-pass the attr from text config - if not tie_word_embeddings and self.text_config.tie_word_embeddings: + if not self.tie_word_embeddings and self.text_config.tie_word_embeddings: self.tie_word_embeddings = self.text_config.tie_word_embeddings - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["LlavaConfig"] diff --git a/src/transformers/models/llava/modeling_llava.py b/src/transformers/models/llava/modeling_llava.py index 6ed4af68a165..2c21dc4d80bf 100644 --- a/src/transformers/models/llava/modeling_llava.py +++ b/src/transformers/models/llava/modeling_llava.py @@ -154,7 +154,7 @@ def set_input_embeddings(self, value): def get_image_features( self, pixel_values: torch.FloatTensor, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | list[int] | None = None, vision_feature_select_strategy: str | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], @@ -232,7 +232,7 @@ def forward( position_ids: torch.LongTensor | None = None, past_key_values: Cache | None = None, inputs_embeds: torch.FloatTensor | None = None, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | list[int] | None = None, vision_feature_select_strategy: str | None = None, image_sizes: torch.Tensor | None = None, 
**kwargs: Unpack[TransformersKwargs], @@ -307,7 +307,7 @@ def get_output_embeddings(self) -> nn.Module: def get_image_features( self, pixel_values: torch.FloatTensor, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | list[int] | None = None, vision_feature_select_strategy: str | None = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple | BaseModelOutputWithPooling: @@ -328,7 +328,7 @@ def forward( position_ids: torch.LongTensor | None = None, past_key_values: Cache | None = None, inputs_embeds: torch.FloatTensor | None = None, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | list[int] | None = None, vision_feature_select_strategy: str | None = None, labels: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, diff --git a/src/transformers/models/llava_next/configuration_llava_next.py b/src/transformers/models/llava_next/configuration_llava_next.py index e09a2f213be7..59cb4cab2ff2 100644 --- a/src/transformers/models/llava_next/configuration_llava_next.py +++ b/src/transformers/models/llava_next/configuration_llava_next.py @@ -12,15 +12,17 @@ # limitations under the License. """Llava-NeXT model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging -from ..auto import CONFIG_MAPPING, AutoConfig +from typing import Literal +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring +from ..auto import CONFIG_MAPPING, AutoConfig @auto_docstring(checkpoint="llava-hf/llava-v1.6-mistral-7b-hf") +@strict(accept_kwargs=True) class LlavaNextConfig(PreTrainedConfig): r""" image_grid_pinpoints (`List`, *optional*, defaults to `[[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]]`): @@ -49,50 +51,26 @@ class LlavaNextConfig(PreTrainedConfig): ```""" model_type = "llava_next" - attribute_map = { - "image_token_id": "image_token_index", - } + attribute_map = {"image_token_id": "image_token_index"} sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} - def __init__( - self, - vision_config=None, - text_config=None, - image_token_index=32000, - projector_hidden_act="gelu", - vision_feature_select_strategy="default", - vision_feature_layer=-2, - image_grid_pinpoints=None, - tie_word_embeddings=False, - image_seq_length=576, - multimodal_projector_bias=True, - **kwargs, - ): - self.image_token_index = image_token_index - self.projector_hidden_act = projector_hidden_act - self.image_seq_length = image_seq_length - self.multimodal_projector_bias = multimodal_projector_bias - - if vision_feature_select_strategy not in ["default", "full"]: - raise ValueError( - "vision_feature_select_strategy should be one of 'default', 'full'." 
- f"Got: {vision_feature_select_strategy}" - ) - - self.vision_feature_select_strategy = vision_feature_select_strategy - self.vision_feature_layer = vision_feature_layer - image_grid_pinpoints = ( - image_grid_pinpoints - if image_grid_pinpoints is not None - else [[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]] - ) - self.image_grid_pinpoints = image_grid_pinpoints - - if isinstance(vision_config, dict): - vision_config["model_type"] = vision_config.get("model_type", "clip_vision_model") - vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) - elif vision_config is None: - vision_config = CONFIG_MAPPING["clip_vision_model"]( + vision_config: dict | PreTrainedConfig | None = None + text_config: dict | PreTrainedConfig | None = None + image_token_index: int = 32000 + projector_hidden_act: str = "gelu" + vision_feature_select_strategy: Literal["default", "full"] = "default" + vision_feature_layer: int | list[int] = -2 + multimodal_projector_bias: bool = True + tie_word_embeddings: bool = False + image_grid_pinpoints: list | None = None + image_seq_length: int = 576 + + def __post_init__(self, **kwargs): + if isinstance(self.vision_config, dict): + self.vision_config["model_type"] = self.vision_config.get("model_type", "clip_vision_model") + self.vision_config = CONFIG_MAPPING[self.vision_config["model_type"]](**self.vision_config) + elif self.vision_config is None: + self.vision_config = CONFIG_MAPPING["clip_vision_model"]( intermediate_size=4096, hidden_size=1024, patch_size=14, @@ -103,18 +81,19 @@ def __init__( projection_dim=768, ) - self.vision_config = vision_config + if isinstance(self.text_config, dict): + self.text_config["model_type"] = self.text_config.get("model_type", "llama") + self.text_config = CONFIG_MAPPING[self.text_config["model_type"]](**self.text_config) + elif self.text_config is None: + self.text_config = CONFIG_MAPPING["llama"]() - if isinstance(text_config, dict): - text_config["model_type"] = text_config.get("model_type", "llama") - text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) - elif text_config is None: - text_config = CONFIG_MAPPING["llama"]() - - self.text_config = text_config + self.image_grid_pinpoints = ( + self.image_grid_pinpoints + if self.image_grid_pinpoints is not None + else [[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]] + ) - self.tie_word_embeddings = tie_word_embeddings - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["LlavaNextConfig"] diff --git a/src/transformers/models/llava_next/modeling_llava_next.py b/src/transformers/models/llava_next/modeling_llava_next.py index d721009f38bd..56e7a83bf20a 100644 --- a/src/transformers/models/llava_next/modeling_llava_next.py +++ b/src/transformers/models/llava_next/modeling_llava_next.py @@ -353,7 +353,7 @@ def get_image_features( self, pixel_values: torch.FloatTensor, image_sizes: torch.Tensor, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | list[int] | None = None, vision_feature_select_strategy: str | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], @@ -454,7 +454,7 @@ def forward( position_ids: torch.LongTensor | None = None, past_key_values: Cache | None = None, inputs_embeds: torch.FloatTensor | None = None, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | list[int] | None = None, vision_feature_select_strategy: str | None = None, use_cache: bool | None = 
None, cache_position: torch.LongTensor | None = None, @@ -550,7 +550,7 @@ def get_image_features( self, pixel_values: torch.FloatTensor, image_sizes: torch.Tensor, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | list[int] | None = None, vision_feature_select_strategy: str | None = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple | BaseModelOutputWithPooling: @@ -587,7 +587,7 @@ def forward( position_ids: torch.LongTensor | None = None, past_key_values: Cache | None = None, inputs_embeds: torch.FloatTensor | None = None, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | list[int] | None = None, vision_feature_select_strategy: str | None = None, labels: torch.LongTensor | None = None, use_cache: bool | None = None, diff --git a/src/transformers/models/llava_next_video/configuration_llava_next_video.py b/src/transformers/models/llava_next_video/configuration_llava_next_video.py index 8af3246325fc..09474c557979 100644 --- a/src/transformers/models/llava_next_video/configuration_llava_next_video.py +++ b/src/transformers/models/llava_next_video/configuration_llava_next_video.py @@ -18,12 +18,17 @@ # See the License for the specific language governing permissions and # limitations under the License. +from typing import Literal + +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring from ..auto import CONFIG_MAPPING, AutoConfig @auto_docstring(checkpoint="llava-hf/LLaVA-NeXT-Video-7B-hf") +@strict(accept_kwargs=True) class LlavaNextVideoConfig(PreTrainedConfig): r""" image_grid_pinpoints (`List`, *optional*, defaults to `[[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]]`): @@ -60,54 +65,27 @@ class LlavaNextVideoConfig(PreTrainedConfig): } sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} - def __init__( - self, - vision_config=None, - text_config=None, - image_token_index=32001, - projector_hidden_act="gelu", - multimodal_projector_bias=True, - vision_feature_select_strategy="default", - vision_feature_layer=-2, - image_grid_pinpoints=None, - video_token_index=32000, - spatial_pool_mode="average", - spatial_pool_stride=2, - image_seq_length=576, - video_seq_length=288, - tie_word_embeddings=False, - **kwargs, - ): - self.video_token_index = video_token_index - self.spatial_pool_mode = spatial_pool_mode - self.spatial_pool_stride = spatial_pool_stride - self.image_seq_length = image_seq_length - self.video_seq_length = video_seq_length - self.image_token_index = image_token_index - self.projector_hidden_act = projector_hidden_act - self.multimodal_projector_bias = multimodal_projector_bias - self.tie_word_embeddings = tie_word_embeddings - - if vision_feature_select_strategy not in ["default", "full"]: - raise ValueError( - "vision_feature_select_strategy should be one of 'default', 'full'." 
- f"Got: {vision_feature_select_strategy}" - ) - - self.vision_feature_select_strategy = vision_feature_select_strategy - self.vision_feature_layer = vision_feature_layer - image_grid_pinpoints = ( - image_grid_pinpoints - if image_grid_pinpoints is not None - else [[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]] - ) - self.image_grid_pinpoints = image_grid_pinpoints - - if isinstance(vision_config, dict): - vision_config["model_type"] = vision_config.get("model_type", "clip_vision_model") - vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) - elif vision_config is None: - vision_config = CONFIG_MAPPING["clip_vision_model"]( + vision_config: dict | PreTrainedConfig | None = None + text_config: dict | PreTrainedConfig | None = None + image_token_index: int = 32001 + video_token_index: int = 32000 + projector_hidden_act: str = "gelu" + vision_feature_select_strategy: Literal["default", "full"] = "default" + vision_feature_layer: int | list[int] = -2 + multimodal_projector_bias: bool = True + tie_word_embeddings: bool = False + image_grid_pinpoints: list | None = None + spatial_pool_mode: str = "average" + spatial_pool_stride: int = 2 + image_seq_length: int = 576 + video_seq_length: int = 288 + + def __post_init__(self, **kwargs): + if isinstance(self.vision_config, dict): + self.vision_config["model_type"] = self.vision_config.get("model_type", "clip_vision_model") + self.vision_config = CONFIG_MAPPING[self.vision_config["model_type"]](**self.vision_config) + elif self.vision_config is None: + self.vision_config = CONFIG_MAPPING["clip_vision_model"]( intermediate_size=4096, hidden_size=1024, patch_size=14, @@ -118,23 +96,25 @@ def __init__( projection_dim=768, ) - self.vision_config = vision_config - - if isinstance(text_config, dict): - text_config["model_type"] = text_config.get("model_type", "llama") - text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) - elif text_config is None: - text_config = CONFIG_MAPPING["llama"]() + if isinstance(self.text_config, dict): + self.text_config["model_type"] = self.text_config.get("model_type", "llama") + self.text_config = CONFIG_MAPPING[self.text_config["model_type"]](**self.text_config) + elif self.text_config is None: + self.text_config = CONFIG_MAPPING["llama"]() - self.text_config = text_config + self.image_grid_pinpoints = ( + self.image_grid_pinpoints + if self.image_grid_pinpoints is not None + else [[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]] + ) # The default value is `False` but this config is used with many model types # Attr `tie_word_embeddings` was saved in text config for those models, so we # need an ugly workaround and forward-pass the attr from text config - if not tie_word_embeddings and self.text_config.tie_word_embeddings: + if not self.tie_word_embeddings and self.text_config.tie_word_embeddings: self.tie_word_embeddings = self.text_config.tie_word_embeddings - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["LlavaNextVideoConfig"] diff --git a/src/transformers/models/llava_next_video/modeling_llava_next_video.py b/src/transformers/models/llava_next_video/modeling_llava_next_video.py index 1305febb56ce..c4c0058af986 100644 --- a/src/transformers/models/llava_next_video/modeling_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modeling_llava_next_video.py @@ -409,7 +409,7 @@ def get_image_features( self, pixel_values: torch.FloatTensor, image_sizes: torch.Tensor, - vision_feature_layer: int | list[int] | None = 
None, + vision_feature_layer: int | list[int] | list[int] | None = None, vision_feature_select_strategy: str | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], @@ -527,7 +527,7 @@ def forward( position_ids: torch.LongTensor | None = None, past_key_values: Cache | None = None, inputs_embeds: torch.FloatTensor | None = None, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | list[int] | None = None, vision_feature_select_strategy: str | None = None, use_cache: bool | None = None, **kwargs: Unpack[FlashAttentionKwargs], @@ -604,7 +604,7 @@ def forward( def get_video_features( self, pixel_values: torch.FloatTensor, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | list[int] | None = None, vision_feature_select_strategy: str | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], @@ -694,7 +694,7 @@ def get_image_features( self, pixel_values: torch.FloatTensor, image_sizes: torch.Tensor, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | list[int] | None = None, vision_feature_select_strategy: str | None = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple | BaseModelOutputWithPooling: @@ -732,7 +732,7 @@ def forward( position_ids: torch.LongTensor | None = None, past_key_values: Cache | None = None, inputs_embeds: torch.FloatTensor | None = None, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | list[int] | None = None, vision_feature_select_strategy: str | None = None, labels: torch.LongTensor | None = None, use_cache: bool | None = None, @@ -882,7 +882,7 @@ def prepare_inputs_for_generation( def get_video_features( self, pixel_values: torch.FloatTensor, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | list[int] | None = None, vision_feature_select_strategy: str | None = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple | BaseModelOutputWithPooling: diff --git a/src/transformers/models/llava_next_video/modular_llava_next_video.py b/src/transformers/models/llava_next_video/modular_llava_next_video.py index b8143ce9fdd8..fae2d41b89a0 100644 --- a/src/transformers/models/llava_next_video/modular_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modular_llava_next_video.py @@ -13,8 +13,10 @@ # limitations under the License. 
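
# Editor's note -- illustrative sketch, not part of the diff. Across the LLaVA-family
# configs rewritten above, the explicit `raise ValueError` on
# vision_feature_select_strategy is replaced by a Literal["default", "full"]
# annotation, so the @strict decorator is expected to reject invalid values at
# construction time, while sub-config and image_grid_pinpoints defaults now live in
# __post_init__. The exception type for a Literal violation is an assumption about
# huggingface_hub's strict dataclasses and is therefore caught broadly.
from transformers import LlavaNextVideoConfig

config = LlavaNextVideoConfig()  # builds default CLIP vision + Llama text configs
assert config.vision_config.model_type == "clip_vision_model"
assert config.text_config.model_type == "llama"
assert config.image_grid_pinpoints[0] == [336, 672]

try:
    LlavaNextVideoConfig(vision_feature_select_strategy="patches")  # not in the Literal
except (ValueError, TypeError) as err:
    print(err)
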
import math +from typing import Literal import torch +from huggingface_hub.dataclasses import strict from torch import nn from transformers.models.llava_next.modeling_llava_next import ( @@ -42,6 +44,7 @@ @auto_docstring(checkpoint="llava-hf/LLaVA-NeXT-Video-7B-hf") +@strict(accept_kwargs=True) class LlavaNextVideoConfig(PreTrainedConfig): r""" image_grid_pinpoints (`List`, *optional*, defaults to `[[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]]`): @@ -78,54 +81,27 @@ class LlavaNextVideoConfig(PreTrainedConfig): } sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} - def __init__( - self, - vision_config=None, - text_config=None, - image_token_index=32001, - projector_hidden_act="gelu", - multimodal_projector_bias=True, - vision_feature_select_strategy="default", - vision_feature_layer=-2, - image_grid_pinpoints=None, - video_token_index=32000, - spatial_pool_mode="average", - spatial_pool_stride=2, - image_seq_length=576, - video_seq_length=288, - tie_word_embeddings=False, - **kwargs, - ): - self.video_token_index = video_token_index - self.spatial_pool_mode = spatial_pool_mode - self.spatial_pool_stride = spatial_pool_stride - self.image_seq_length = image_seq_length - self.video_seq_length = video_seq_length - self.image_token_index = image_token_index - self.projector_hidden_act = projector_hidden_act - self.multimodal_projector_bias = multimodal_projector_bias - self.tie_word_embeddings = tie_word_embeddings - - if vision_feature_select_strategy not in ["default", "full"]: - raise ValueError( - "vision_feature_select_strategy should be one of 'default', 'full'." - f"Got: {vision_feature_select_strategy}" - ) - - self.vision_feature_select_strategy = vision_feature_select_strategy - self.vision_feature_layer = vision_feature_layer - image_grid_pinpoints = ( - image_grid_pinpoints - if image_grid_pinpoints is not None - else [[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]] - ) - self.image_grid_pinpoints = image_grid_pinpoints - - if isinstance(vision_config, dict): - vision_config["model_type"] = vision_config.get("model_type", "clip_vision_model") - vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) - elif vision_config is None: - vision_config = CONFIG_MAPPING["clip_vision_model"]( + vision_config: dict | PreTrainedConfig | None = None + text_config: dict | PreTrainedConfig | None = None + image_token_index: int = 32001 + video_token_index: int = 32000 + projector_hidden_act: str = "gelu" + vision_feature_select_strategy: Literal["default", "full"] = "default" + vision_feature_layer: int | list[int] = -2 + multimodal_projector_bias: bool = True + tie_word_embeddings: bool = False + image_grid_pinpoints: list | None = None + spatial_pool_mode: str = "average" + spatial_pool_stride: int = 2 + image_seq_length: int = 576 + video_seq_length: int = 288 + + def __post_init__(self, **kwargs): + if isinstance(self.vision_config, dict): + self.vision_config["model_type"] = self.vision_config.get("model_type", "clip_vision_model") + self.vision_config = CONFIG_MAPPING[self.vision_config["model_type"]](**self.vision_config) + elif self.vision_config is None: + self.vision_config = CONFIG_MAPPING["clip_vision_model"]( intermediate_size=4096, hidden_size=1024, patch_size=14, @@ -136,23 +112,25 @@ def __init__( projection_dim=768, ) - self.vision_config = vision_config - - if isinstance(text_config, dict): - text_config["model_type"] = text_config.get("model_type", "llama") - text_config = 
CONFIG_MAPPING[text_config["model_type"]](**text_config) - elif text_config is None: - text_config = CONFIG_MAPPING["llama"]() + if isinstance(self.text_config, dict): + self.text_config["model_type"] = self.text_config.get("model_type", "llama") + self.text_config = CONFIG_MAPPING[self.text_config["model_type"]](**self.text_config) + elif self.text_config is None: + self.text_config = CONFIG_MAPPING["llama"]() - self.text_config = text_config + self.image_grid_pinpoints = ( + self.image_grid_pinpoints + if self.image_grid_pinpoints is not None + else [[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]] + ) # The default value is `False` but this config is used with many model types # Attr `tie_word_embeddings` was saved in text config for those models, so we # need an ugly workaround and forward-pass the attr from text config - if not tie_word_embeddings and self.text_config.tie_word_embeddings: + if not self.tie_word_embeddings and self.text_config.tie_word_embeddings: self.tie_word_embeddings = self.text_config.tie_word_embeddings - super().__init__(**kwargs) + super().__post_init__(**kwargs) class LlavaNextVideoModelOutputWithPast(LlavaNextModelOutputWithPast): @@ -256,7 +234,7 @@ def get_image_features( self, pixel_values: torch.FloatTensor, image_sizes: torch.Tensor, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | list[int] | None = None, vision_feature_select_strategy: str | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], @@ -328,7 +306,7 @@ def get_image_features( def get_video_features( self, pixel_values: torch.FloatTensor, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | list[int] | None = None, vision_feature_select_strategy: str | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], @@ -426,7 +404,7 @@ def forward( position_ids: torch.LongTensor | None = None, past_key_values: Cache | None = None, inputs_embeds: torch.FloatTensor | None = None, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | list[int] | None = None, vision_feature_select_strategy: str | None = None, use_cache: bool | None = None, **kwargs: Unpack[FlashAttentionKwargs], @@ -497,7 +475,7 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextForConditionalGeneration): def get_video_features( self, pixel_values: torch.FloatTensor, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | list[int] | None = None, vision_feature_select_strategy: str | None = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple | BaseModelOutputWithPooling: @@ -532,7 +510,7 @@ def forward( position_ids: torch.LongTensor | None = None, past_key_values: Cache | None = None, inputs_embeds: torch.FloatTensor | None = None, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | list[int] | None = None, vision_feature_select_strategy: str | None = None, labels: torch.LongTensor | None = None, use_cache: bool | None = None, diff --git a/src/transformers/models/llava_onevision/configuration_llava_onevision.py b/src/transformers/models/llava_onevision/configuration_llava_onevision.py index 7b55c6dac43a..b477c8c7c210 100644 --- a/src/transformers/models/llava_onevision/configuration_llava_onevision.py +++ b/src/transformers/models/llava_onevision/configuration_llava_onevision.py @@ -12,16 +12,17 @@ # See the License for 
the specific language governing permissions and # limitations under the License. +from typing import Literal + +from huggingface_hub.dataclasses import strict from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging +from ...utils import auto_docstring from ..auto import CONFIG_MAPPING, AutoConfig -logger = logging.get_logger(__name__) - - @auto_docstring(checkpoint="llava-hf/llava-onevision-qwen2-7b-ov-hf") +@strict(accept_kwargs=True) class LlavaOnevisionConfig(PreTrainedConfig): r""" vision_aspect_ratio (`str`, *optional*, defaults to `"anyres_max_9"`): @@ -58,39 +59,42 @@ class LlavaOnevisionConfig(PreTrainedConfig): } sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} - def __init__( - self, - vision_config=None, - text_config=None, - image_token_index=151646, - video_token_index=151647, - projector_hidden_act="gelu", - vision_feature_select_strategy="full", - vision_feature_layer=-1, - vision_aspect_ratio="anyres_max_9", - image_grid_pinpoints=None, - multimodal_projector_bias=True, - tie_word_embeddings=False, - **kwargs, - ): - self.image_token_index = image_token_index - self.video_token_index = video_token_index - self.projector_hidden_act = projector_hidden_act - self.multimodal_projector_bias = multimodal_projector_bias - self.tie_word_embeddings = tie_word_embeddings - - if vision_feature_select_strategy not in ["default", "full"]: - raise ValueError( - "vision_feature_select_strategy should be one of 'default', 'full'." - f"Got: {vision_feature_select_strategy}" + vision_config: dict | PreTrainedConfig | None = None + text_config: dict | PreTrainedConfig | None = None + image_token_index: int = 151646 + video_token_index: int = 151647 + projector_hidden_act: str = "gelu" + vision_feature_select_strategy: Literal["default", "full"] = "full" + vision_feature_layer: int | list[int] = -1 + multimodal_projector_bias: bool = True + tie_word_embeddings: bool = False + image_grid_pinpoints: list | None = None + vision_aspect_ratio: str = "anyres_max_9" + + def __post_init__(self, **kwargs): + if isinstance(self.vision_config, dict): + self.vision_config["model_type"] = self.vision_config.get("model_type", "siglip_vision_model") + self.vision_config = CONFIG_MAPPING[self.vision_config["model_type"]](**self.vision_config) + elif self.vision_config is None: + self.vision_config = CONFIG_MAPPING["siglip_vision_model"]( + hidden_size=1152, + intermediate_size=4304, + patch_size=14, + image_size=384, + num_hidden_layers=26, + num_attention_heads=16, + vision_use_head=False, ) - self.vision_feature_select_strategy = vision_feature_select_strategy - self.vision_feature_layer = vision_feature_layer - self.vision_aspect_ratio = vision_aspect_ratio - image_grid_pinpoints = ( - image_grid_pinpoints - if image_grid_pinpoints is not None + if isinstance(self.text_config, dict): + self.text_config["model_type"] = self.text_config.get("model_type", "qwen2") + self.text_config = CONFIG_MAPPING[self.text_config["model_type"]](**self.text_config) + elif self.text_config is None: + self.text_config = CONFIG_MAPPING["qwen2"]() + + self.image_grid_pinpoints = ( + self.image_grid_pinpoints + if self.image_grid_pinpoints is not None else [ [384, 384], [384, 768], @@ -130,39 +134,14 @@ def __init__( [2304, 2304], ] ) - self.image_grid_pinpoints = image_grid_pinpoints - - if isinstance(vision_config, dict): - vision_config["model_type"] = vision_config.get("model_type", "siglip_vision_model") - vision_config = 
CONFIG_MAPPING[vision_config["model_type"]](**vision_config) - elif vision_config is None: - vision_config = CONFIG_MAPPING["siglip_vision_model"]( - hidden_size=1152, - intermediate_size=4304, - patch_size=14, - image_size=384, - num_hidden_layers=26, - num_attention_heads=16, - vision_use_head=False, - ) - - self.vision_config = vision_config - - if isinstance(text_config, dict): - text_config["model_type"] = text_config.get("model_type", "qwen2") - text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) - elif text_config is None: - text_config = CONFIG_MAPPING["qwen2"]() - - self.text_config = text_config # The default value is `False` but this config is used with many model types # Attr `tie_word_embeddings` was saved in text config for those models, so we # need an ugly workaround and forward-pass the attr from text config - if not tie_word_embeddings and self.text_config.tie_word_embeddings: + if not self.tie_word_embeddings and self.text_config.tie_word_embeddings: self.tie_word_embeddings = self.text_config.tie_word_embeddings - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["LlavaOnevisionConfig"] diff --git a/src/transformers/models/llava_onevision/modeling_llava_onevision.py b/src/transformers/models/llava_onevision/modeling_llava_onevision.py index b01d3d00edf2..c6ac332e38d5 100644 --- a/src/transformers/models/llava_onevision/modeling_llava_onevision.py +++ b/src/transformers/models/llava_onevision/modeling_llava_onevision.py @@ -366,7 +366,7 @@ def get_image_features( self, pixel_values: torch.FloatTensor, image_sizes: torch.Tensor, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | list[int] | None = None, vision_feature_select_strategy: str | None = None, vision_aspect_ratio: str | None = None, batch_num_images: torch.LongTensor | None = None, @@ -489,7 +489,7 @@ def forward( position_ids: torch.LongTensor | None = None, past_key_values: Cache | None = None, inputs_embeds: torch.FloatTensor | None = None, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | list[int] | None = None, vision_feature_select_strategy: str | None = None, vision_aspect_ratio: str | None = None, batch_num_images: torch.LongTensor | None = None, @@ -571,7 +571,7 @@ def forward( def get_video_features( self, pixel_values: torch.FloatTensor, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | list[int] | None = None, vision_feature_select_strategy: str | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], @@ -674,7 +674,7 @@ def get_image_features( self, pixel_values: torch.FloatTensor, image_sizes: torch.Tensor, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | list[int] | None = None, vision_feature_select_strategy: str | None = None, vision_aspect_ratio: str | None = None, batch_num_images: torch.LongTensor | None = None, @@ -712,7 +712,7 @@ def forward( position_ids: torch.LongTensor | None = None, past_key_values: Cache | None = None, inputs_embeds: torch.FloatTensor | None = None, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | list[int] | None = None, vision_feature_select_strategy: str | None = None, vision_aspect_ratio: str | None = None, batch_num_images: torch.LongTensor | None = None, @@ -849,7 +849,7 @@ def prepare_inputs_for_generation( def get_video_features( self, pixel_values: 
torch.FloatTensor, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | list[int] | None = None, vision_feature_select_strategy: str | None = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple | BaseModelOutputWithPooling: diff --git a/src/transformers/models/llava_onevision/modular_llava_onevision.py b/src/transformers/models/llava_onevision/modular_llava_onevision.py index cb535b276ca2..755c7f2fb491 100644 --- a/src/transformers/models/llava_onevision/modular_llava_onevision.py +++ b/src/transformers/models/llava_onevision/modular_llava_onevision.py @@ -319,7 +319,7 @@ def get_image_features( self, pixel_values: torch.FloatTensor, image_sizes: torch.Tensor, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | list[int] | None = None, vision_feature_select_strategy: str | None = None, vision_aspect_ratio: str | None = None, batch_num_images: torch.LongTensor | None = None, @@ -395,7 +395,7 @@ def get_image_features( def get_video_features( self, pixel_values: torch.FloatTensor, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | list[int] | None = None, vision_feature_select_strategy: str | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], @@ -452,7 +452,7 @@ def forward( position_ids: torch.LongTensor | None = None, past_key_values: Cache | None = None, inputs_embeds: torch.FloatTensor | None = None, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | list[int] | None = None, vision_feature_select_strategy: str | None = None, vision_aspect_ratio: str | None = None, batch_num_images: torch.LongTensor | None = None, @@ -542,7 +542,7 @@ def forward( position_ids: torch.LongTensor | None = None, past_key_values: Cache | None = None, inputs_embeds: torch.FloatTensor | None = None, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | list[int] | None = None, vision_feature_select_strategy: str | None = None, vision_aspect_ratio: str | None = None, batch_num_images: torch.LongTensor | None = None, @@ -680,7 +680,7 @@ def get_image_features( self, pixel_values: torch.FloatTensor, image_sizes: torch.Tensor, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | list[int] | None = None, vision_feature_select_strategy: str | None = None, vision_aspect_ratio: str | None = None, batch_num_images: torch.LongTensor | None = None, diff --git a/src/transformers/models/longcat_flash/configuration_longcat_flash.py b/src/transformers/models/longcat_flash/configuration_longcat_flash.py index fc8c79bb002d..d49c1778f08c 100644 --- a/src/transformers/models/longcat_flash/configuration_longcat_flash.py +++ b/src/transformers/models/longcat_flash/configuration_longcat_flash.py @@ -14,12 +14,15 @@ """LongCat Flash model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring @auto_docstring(checkpoint="meituan-longcat/LongCat-Flash-Chat") +@strict(accept_kwargs=True) class LongcatFlashConfig(PreTrainedConfig): r""" ffn_hidden_size (`int`, *optional*, defaults to 12288): @@ -69,85 +72,48 @@ class LongcatFlashConfig(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size: int | None = 131072, - hidden_size: int | 
None = 6144, - num_hidden_layers: int | None = 56, - num_layers: int | None = 28, - num_attention_heads: int | None = 64, - num_key_value_heads: int | None = None, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 131072, - initializer_range: float | None = 0.02, - rms_norm_eps: float | None = 1e-5, - use_cache: bool | None = True, - pad_token_id: int | None = None, - bos_token_id: int | None = 1, - eos_token_id: int | None = 2, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - ffn_hidden_size: int | None = 12288, - q_lora_rank: int | None = 1536, - kv_lora_rank: int | None = 512, - qk_nope_head_dim: int | None = 128, - qk_rope_head_dim: int | None = 64, - head_dim: int | None = 64, - v_head_dim: int | None = 128, - qk_head_dim: int | None = None, - moe_topk: int | None = 12, - n_routed_experts: int | None = 512, - zero_expert_num: int | None = 256, - expert_ffn_hidden_size: int | None = 2048, - routed_scaling_factor: float | None = 6.0, - **kwargs, - ): - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - if qk_head_dim is None: - qk_head_dim = qk_nope_head_dim + qk_rope_head_dim - - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.num_layers = num_layers - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - - self.ffn_hidden_size = ffn_hidden_size - - self.q_lora_rank = q_lora_rank - self.kv_lora_rank = kv_lora_rank - self.qk_nope_head_dim = qk_nope_head_dim - self.qk_rope_head_dim = qk_rope_head_dim - self.v_head_dim = v_head_dim - self.qk_head_dim = qk_head_dim - self.head_dim = head_dim - - self.moe_topk = moe_topk - self.n_routed_experts = n_routed_experts - self.zero_expert_num = zero_expert_num - self.expert_ffn_hidden_size = expert_ffn_hidden_size - self.routed_scaling_factor = routed_scaling_factor - self.rope_parameters = rope_parameters - - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - super().__init__(**kwargs) - - def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation: set | None = None, **kwargs): + vocab_size: int = 131072 + hidden_size: int = 6144 + num_hidden_layers: int = 56 + num_layers: int = 28 + num_attention_heads: int = 64 + num_key_value_heads: int | None = None + hidden_act: str = "silu" + max_position_embeddings: int = 131072 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-5 + use_cache: bool = True + pad_token_id: int | None = None + bos_token_id: int | None = 1 + eos_token_id: int | list[int] | None = 2 + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + attention_dropout: float | int = 0.0 + ffn_hidden_size: int = 12288 + q_lora_rank: int = 1536 + kv_lora_rank: int = 512 + qk_nope_head_dim: int = 128 + qk_rope_head_dim: int = 64 + head_dim: int = 64 + v_head_dim: int = 128 + qk_head_dim: int | None = None + moe_topk: int = 12 + n_routed_experts: int = 512 + 
zero_expert_num: int = 256 + expert_ffn_hidden_size: int = 2048 + routed_scaling_factor: float = 6.0 + + def __post_init__(self, **kwargs): + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads + + if self.qk_head_dim is None: + self.qk_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim + + super().__post_init__(**kwargs) + + def convert_rope_params_to_dict(self, **kwargs): rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or self.rope_parameters self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else {} @@ -155,7 +121,6 @@ def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation: set | None # Standardize and validate the correctness of rotary position embeddings parameters self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", self.default_theta)) self.standardize_rope_params() - self.validate_rope(ignore_keys=ignore_keys_at_rope_validation) # Convert to float because RoPE fn expect a float. Models on the hub were saved as int for key in ["beta_fast", "beta_slow", "factor"]: diff --git a/src/transformers/models/longformer/configuration_longformer.py b/src/transformers/models/longformer/configuration_longformer.py index 8bca538c18ae..7a8d6f74ab4c 100644 --- a/src/transformers/models/longformer/configuration_longformer.py +++ b/src/transformers/models/longformer/configuration_longformer.py @@ -13,14 +13,14 @@ # limitations under the License. """Longformer configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="allenai/longformer-base-4096") +@strict(accept_kwargs=True) class LongformerConfig(PreTrainedConfig): r""" attention_window (`int` or `list[int]`, *optional*, defaults to 512): @@ -46,50 +46,25 @@ class LongformerConfig(PreTrainedConfig): model_type = "longformer" - def __init__( - self, - attention_window: list[int] | int = 512, - sep_token_id: int = 2, - pad_token_id: int = 1, - bos_token_id: int = 0, - eos_token_id: int = 2, - vocab_size: int = 30522, - hidden_size: int = 768, - num_hidden_layers: int = 12, - num_attention_heads: int = 12, - intermediate_size: int = 3072, - hidden_act: str = "gelu", - hidden_dropout_prob: float = 0.1, - attention_probs_dropout_prob: float = 0.1, - max_position_embeddings: int = 512, - type_vocab_size: int = 2, - initializer_range: float = 0.02, - layer_norm_eps: float = 1e-12, - onnx_export: bool = False, - tie_word_embeddings=True, - **kwargs, - ): - super().__init__(**kwargs) - - self.pad_token_id = pad_token_id - self.tie_word_embeddings = tie_word_embeddings - self.attention_window = attention_window - self.sep_token_id = sep_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.onnx_export = 
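# --- illustrative sketch (not part of the diff) -----------------------------
# A minimal, self-contained approximation of the field-declaration pattern the
# hunks above introduce: class-level annotated defaults plus a __post_init__
# that derives dependent values (num_key_value_heads, qk_head_dim). The real
# classes are decorated with @strict(accept_kwargs=True) from
# huggingface_hub.dataclasses and forward extra kwargs to the base
# __post_init__; this stripped-down sketch uses only the standard library and
# all names below are hypothetical.
from __future__ import annotations

from dataclasses import dataclass


@dataclass
class _SketchAttentionConfig:
    num_attention_heads: int = 64
    num_key_value_heads: int | None = None
    qk_nope_head_dim: int = 128
    qk_rope_head_dim: int = 64
    qk_head_dim: int | None = None

    def __post_init__(self):
        # Derived defaults, mirroring the __post_init__ added above.
        if self.num_key_value_heads is None:
            self.num_key_value_heads = self.num_attention_heads
        if self.qk_head_dim is None:
            self.qk_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim


# Usage: unspecified derived fields are filled in automatically.
_cfg = _SketchAttentionConfig(num_attention_heads=32)
assert _cfg.num_key_value_heads == 32 and _cfg.qk_head_dim == 192
# -----------------------------------------------------------------------------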
onnx_export + attention_window: list[int] | int = 512 + sep_token_id: int | None = 2 + pad_token_id: int | None = 1 + bos_token_id: int | None = 0 + eos_token_id: int | list[int] | None = 2 + vocab_size: int = 30522 + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.1 + attention_probs_dropout_prob: float = 0.1 + max_position_embeddings: int = 512 + type_vocab_size: int = 2 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + onnx_export: bool = False + tie_word_embeddings: bool = True __all__ = ["LongformerConfig"] diff --git a/src/transformers/models/longformer/modeling_longformer.py b/src/transformers/models/longformer/modeling_longformer.py index 67b950c23a8f..4078b87bbdb9 100755 --- a/src/transformers/models/longformer/modeling_longformer.py +++ b/src/transformers/models/longformer/modeling_longformer.py @@ -1466,7 +1466,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -1611,7 +1611,7 @@ def forward( ['healthy', 'skinny', 'thin', 'good', 'vegetarian'] ``` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.longformer( input_ids, @@ -1696,7 +1696,7 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if global_attention_mask is None: logger.warning_once("Initializing global attention on CLS token...") @@ -1842,7 +1842,7 @@ def forward( ... tokenizer.convert_tokens_to_ids(answer_tokens) ... ) # remove space prepending space token ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if global_attention_mask is None: if input_ids is None: @@ -1946,7 +1946,7 @@ def forward( labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.longformer( input_ids, @@ -2053,7 +2053,7 @@ def forward( model's internal embedding lookup matrix. 
""" num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict # set global attention on question tokens if global_attention_mask is None and input_ids is not None: diff --git a/src/transformers/models/longt5/configuration_longt5.py b/src/transformers/models/longt5/configuration_longt5.py index 0b44f9a29d37..aa5cd2a75e80 100644 --- a/src/transformers/models/longt5/configuration_longt5.py +++ b/src/transformers/models/longt5/configuration_longt5.py @@ -13,14 +13,14 @@ # limitations under the License. """LongT5 model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="google/long-t5-local-base") +@strict(accept_kwargs=True) class LongT5Config(PreTrainedConfig): r""" d_ff (`int`, *optional*, defaults to 2048): @@ -53,73 +53,50 @@ class LongT5Config(PreTrainedConfig): "head_dim": "d_kv", } - def __init__( - self, - vocab_size=32128, - d_model=512, - d_kv=64, - d_ff=2048, - num_layers=6, - num_decoder_layers=None, - num_heads=8, - local_radius=127, - global_block_size=16, - relative_attention_num_buckets=32, - relative_attention_max_distance=128, - dropout_rate=0.1, - layer_norm_epsilon=1e-6, - initializer_factor=1.0, - feed_forward_proj="relu", - is_encoder_decoder=True, - encoder_attention_type="local", - use_cache=True, - pad_token_id=0, - eos_token_id=1, - is_decoder=False, - bos_token_id=None, - tie_word_embeddings=True, - **kwargs, - ): - self.is_decoder = is_decoder - self.vocab_size = vocab_size - self.d_model = d_model - self.d_kv = d_kv - self.d_ff = d_ff - self.num_layers = num_layers - # default = symmetry - self.num_decoder_layers = num_decoder_layers if num_decoder_layers is not None else self.num_layers - self.num_heads = num_heads - self.local_radius = local_radius - self.global_block_size = global_block_size - self.relative_attention_num_buckets = relative_attention_num_buckets - self.relative_attention_max_distance = relative_attention_max_distance - self.dropout_rate = dropout_rate - self.layer_norm_epsilon = layer_norm_epsilon - self.initializer_factor = initializer_factor - self.feed_forward_proj = feed_forward_proj - self.encoder_attention_type = encoder_attention_type - self.use_cache = use_cache - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings + vocab_size: int = 32128 + d_model: int = 512 + d_kv: int = 64 + d_ff: int = 2048 + num_layers: int = 6 + num_decoder_layers: int | None = None + num_heads: int = 8 + local_radius: int = 127 + global_block_size: int = 16 + relative_attention_num_buckets: int = 32 + relative_attention_max_distance: int = 128 + dropout_rate: float = 0.1 + layer_norm_epsilon: float = 1e-6 + initializer_factor: float = 1.0 + feed_forward_proj: str = "relu" + is_encoder_decoder: bool = True + encoder_attention_type: str = "local" + use_cache: bool = True + pad_token_id: int | None = 0 + eos_token_id: int | None = 1 + bos_token_id: int | None = None + is_decoder: bool = False + tie_word_embeddings: bool = True + def __post_init__(self, **kwargs): + self.num_decoder_layers = 
self.num_decoder_layers if self.num_decoder_layers is not None else self.num_layers act_info = self.feed_forward_proj.split("-") self.dense_act_fn = act_info[-1] self.is_gated_act = act_info[0] == "gated" + if self.feed_forward_proj == "gated-gelu": + self.dense_act_fn = "gelu_new" + + super().__post_init__(**kwargs) + + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + act_info = self.feed_forward_proj.split("-") if len(act_info) > 1 and act_info[0] != "gated" or len(act_info) > 2: raise ValueError( - f"`feed_forward_proj`: {feed_forward_proj} is not a valid activation function of the dense layer. " + f"`feed_forward_proj`: {self.feed_forward_proj} is not a valid activation function of the dense layer. " "Please make sure `feed_forward_proj` is of the format `gated-{ACT_FN}` or `{ACT_FN}`, e.g. " "'gated-gelu' or 'relu'" ) - # for backwards compatibility - if feed_forward_proj == "gated-gelu": - self.dense_act_fn = "gelu_new" - - super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) - __all__ = ["LongT5Config"] diff --git a/src/transformers/models/longt5/modeling_longt5.py b/src/transformers/models/longt5/modeling_longt5.py index 46cf7ff8a94f..b178e50dc352 100644 --- a/src/transformers/models/longt5/modeling_longt5.py +++ b/src/transformers/models/longt5/modeling_longt5.py @@ -1262,7 +1262,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if input_ids is not None and inputs_embeds is not None: err_msg_prefix = "decoder_" if self.is_decoder else "" @@ -1506,7 +1506,7 @@ def forward( >>> last_hidden_states = outputs.last_hidden_state ```""" use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict # Encode if needed (training, first prediction pass) if encoder_outputs is None: @@ -1671,7 +1671,7 @@ def forward( abstractthe aim of this article is to provide an overview of the literature on the role of dog ```""" use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict # Encode if needed (training, first prediction pass) if encoder_outputs is None: @@ -1805,7 +1805,7 @@ def forward( >>> outputs = model(input_ids=input_ids) >>> last_hidden_states = outputs.last_hidden_state ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict encoder_outputs = self.encoder( input_ids=input_ids, diff --git a/src/transformers/models/luke/configuration_luke.py b/src/transformers/models/luke/configuration_luke.py index f5a86411f4be..51c2dfaf1be4 100644 --- a/src/transformers/models/luke/configuration_luke.py +++ b/src/transformers/models/luke/configuration_luke.py @@ -13,14 +13,14 @@ # limitations under the License. 
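# --- illustrative sketch (not part of the diff) -----------------------------
# Approximates the feed_forward_proj handling added to LongT5Config above: the
# activation string is split in __post_init__ and checked in a separate
# validate_architecture() hook. In the PR that hook is invoked by the @strict
# machinery; it is called by hand here because this sketch relies only on the
# standard library, and the class name is hypothetical.
from __future__ import annotations

from dataclasses import dataclass


@dataclass
class _SketchT5Config:
    feed_forward_proj: str = "relu"

    def __post_init__(self):
        act_info = self.feed_forward_proj.split("-")
        self.dense_act_fn = act_info[-1]
        self.is_gated_act = act_info[0] == "gated"
        if self.feed_forward_proj == "gated-gelu":
            # Backwards compatibility, as in the diff.
            self.dense_act_fn = "gelu_new"

    def validate_architecture(self):
        act_info = self.feed_forward_proj.split("-")
        if len(act_info) > 1 and act_info[0] != "gated" or len(act_info) > 2:
            raise ValueError(
                f"`feed_forward_proj`: {self.feed_forward_proj} is not a valid activation; "
                "expected `{ACT_FN}` or `gated-{ACT_FN}`, e.g. 'relu' or 'gated-gelu'."
            )


_ok = _SketchT5Config("gated-gelu")
_ok.validate_architecture()  # passes; dense_act_fn resolves to "gelu_new"
assert _ok.is_gated_act and _ok.dense_act_fn == "gelu_new"
# -----------------------------------------------------------------------------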
"""LUKE configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="studio-ousia/luke-base") +@strict(accept_kwargs=True) class LukeConfig(PreTrainedConfig): r""" entity_vocab_size (`int`, *optional*, defaults to 500000): @@ -50,52 +50,26 @@ class LukeConfig(PreTrainedConfig): model_type = "luke" - def __init__( - self, - vocab_size=50267, - entity_vocab_size=500000, - hidden_size=768, - entity_emb_size=256, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=2, - initializer_range=0.02, - layer_norm_eps=1e-12, - use_entity_aware_attention=True, - classifier_dropout=None, - pad_token_id=1, - bos_token_id=0, - eos_token_id=2, - tie_word_embeddings=True, - **kwargs, - ): - super().__init__(**kwargs) - - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - self.vocab_size = vocab_size - self.entity_vocab_size = entity_vocab_size - self.hidden_size = hidden_size - self.entity_emb_size = entity_emb_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.use_entity_aware_attention = use_entity_aware_attention - self.classifier_dropout = classifier_dropout + vocab_size: int = 50267 + entity_vocab_size: int = 500000 + hidden_size: int = 768 + entity_emb_size: int = 256 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.1 + attention_probs_dropout_prob: float = 0.1 + max_position_embeddings: int = 512 + type_vocab_size: int = 2 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + use_entity_aware_attention: bool = True + classifier_dropout: float | int | None = None + pad_token_id: int | None = 1 + bos_token_id: int | None = 0 + eos_token_id: int | None = 2 + tie_word_embeddings: bool = True __all__ = ["LukeConfig"] diff --git a/src/transformers/models/luke/modeling_luke.py b/src/transformers/models/luke/modeling_luke.py index 41af976b6bc2..baa1b9b2c3ea 100644 --- a/src/transformers/models/luke/modeling_luke.py +++ b/src/transformers/models/luke/modeling_luke.py @@ -897,7 +897,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -1118,7 +1118,7 @@ def forward( loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` """ - 
return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.luke( input_ids=input_ids, @@ -1266,7 +1266,7 @@ def forward( >>> print("Predicted class:", model.config.id2label[predicted_class_idx]) Predicted class: person ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.luke( input_ids=input_ids, @@ -1398,7 +1398,7 @@ def forward( >>> print("Predicted class:", model.config.id2label[predicted_class_idx]) Predicted class: per:cities_of_residence ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.luke( input_ids=input_ids, @@ -1546,7 +1546,7 @@ def forward( Beyoncé PER Los Angeles LOC ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.luke( input_ids=input_ids, @@ -1668,7 +1668,7 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.luke( input_ids=input_ids, @@ -1795,7 +1795,7 @@ def forward( num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above) """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.luke( input_ids=input_ids, @@ -1893,7 +1893,7 @@ def forward( Indices of positions of each input entity in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.luke( input_ids=input_ids, @@ -2039,7 +2039,7 @@ def forward( num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above) """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None diff --git a/src/transformers/models/lw_detr/configuration_lw_detr.py b/src/transformers/models/lw_detr/configuration_lw_detr.py index aa5e5995bd41..ed0bac7a43d2 100644 --- a/src/transformers/models/lw_detr/configuration_lw_detr.py +++ b/src/transformers/models/lw_detr/configuration_lw_detr.py @@ -19,6 +19,8 @@ # limitations under the License. 
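# --- illustrative sketch (not part of the diff) -----------------------------
# The modeling-side change repeated throughout this diff: forward() now falls
# back to `config.return_dict` rather than `config.use_return_dict`. Minimal
# reproduction of that fallback; the helper and config names are hypothetical.
from __future__ import annotations

from dataclasses import dataclass


@dataclass
class _SketchConfig:
    return_dict: bool = True


def _resolve_return_dict(return_dict: bool | None, config: _SketchConfig) -> bool:
    # Same expression as in the updated forward() methods above.
    return return_dict if return_dict is not None else config.return_dict


assert _resolve_return_dict(None, _SketchConfig()) is True
assert _resolve_return_dict(False, _SketchConfig()) is False
# -----------------------------------------------------------------------------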
import math +from huggingface_hub.dataclasses import strict + from ...backbone_utils import BackboneConfigMixin, consolidate_backbone_kwargs_to_config from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring @@ -26,6 +28,7 @@ @auto_docstring(checkpoint="AnnaZhang/lwdetr_small_60e_coco") +@strict(accept_kwargs=True) class LwDetrViTConfig(BackboneConfigMixin, PreTrainedConfig): r""" pretrain_image_size (`int`, *optional*, defaults to 224): @@ -57,64 +60,49 @@ class LwDetrViTConfig(BackboneConfigMixin, PreTrainedConfig): model_type = "lw_detr_vit" - def __init__( - self, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - mlp_ratio=4, - hidden_act="gelu", - dropout_prob=0.0, - initializer_range=0.02, - layer_norm_eps=1e-6, - image_size=256, - pretrain_image_size=224, - patch_size=16, - num_channels=3, - qkv_bias=True, - window_block_indices=[], - use_absolute_position_embeddings=True, - out_features=None, - out_indices=None, - cae_init_values: float = 0.1, - num_windows=16, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.mlp_ratio = mlp_ratio - self.hidden_act = hidden_act - self.dropout_prob = dropout_prob - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.image_size = image_size - self.pretrain_image_size = pretrain_image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.qkv_bias = qkv_bias - self.window_block_indices = window_block_indices - self.use_absolute_position_embeddings = use_absolute_position_embeddings - + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + mlp_ratio: int = 4 + hidden_act: str = "gelu" + dropout_prob: float = 0.0 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-6 + + image_size: int | list[int] | tuple[int, int] = 256 + pretrain_image_size: int | list[int] | tuple[int, int] = 224 + patch_size: int | list[int] | tuple[int, int] = 16 + num_channels: int = 3 + qkv_bias: bool = True + window_block_indices: list[int] | tuple[int, ...] = () + use_absolute_position_embeddings: bool = True + _out_features: list[str] | None = None + _out_indices: list[int] | None = None + cae_init_values: float = 0.1 + num_windows: int = 16 + + def __post_init__(self, **kwargs): + self.num_windows_side = int(math.sqrt(self.num_windows)) self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, self.num_hidden_layers + 1)] - self.set_output_features_output_indices(out_indices=out_indices, out_features=out_features) + self.set_output_features_output_indices( + out_indices=kwargs.pop("out_indices", None), out_features=kwargs.pop("out_features", None) + ) + super().__post_init__(**kwargs) - self.cae_init_values = cae_init_values - if num_windows % math.sqrt(num_windows) != 0: + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + if self.num_windows % math.sqrt(self.num_windows) != 0: raise ValueError( - f"`num_windows` has to be a perfect square, where num_windows % math.sqrt(num_windows) != 0, but got {num_windows}." + f"`num_windows` has to be a perfect square, where num_windows % math.sqrt(num_windows) != 0, but got {self.num_windows}." 
) - if image_size / num_windows % math.sqrt(num_windows) != 0: + if self.image_size / self.num_windows % math.sqrt(self.num_windows) != 0: raise ValueError( - f"`image_size` has to be divisible by `num_windows`, where image_size / num_windows % math.sqrt(num_windows) != 0,but got {image_size} and {num_windows}." + f"`image_size` has to be divisible by `num_windows`, where image_size / num_windows % math.sqrt(num_windows) != 0,but got {self.image_size} and {self.num_windows}." ) - self.num_windows = num_windows - self.num_windows_side = int(math.sqrt(num_windows)) @auto_docstring(checkpoint="AnnaZhang/lwdetr_small_60e_coco") +@strict(accept_kwargs=True) class LwDetrConfig(PreTrainedConfig): r""" projector_scale_factors (`list[float]`, *optional*, defaults to `[]`): @@ -167,50 +155,41 @@ class LwDetrConfig(PreTrainedConfig): model_type = "lw_detr" sub_configs = {"backbone_config": AutoConfig} - def __init__( - self, - # backbone - backbone_config=None, - # projector - projector_scale_factors: list[float] = [], - hidden_expansion=0.5, - c2f_num_blocks=3, - activation_function="silu", - batch_norm_eps=1e-5, - # decoder - d_model=256, - dropout=0.0, - decoder_ffn_dim=2048, - decoder_n_points=4, - decoder_layers: int = 3, - decoder_self_attention_heads: int = 8, - decoder_cross_attention_heads: int = 16, - decoder_activation_function="relu", - # model - num_queries=300, - attention_bias=True, - attention_dropout=0.0, - activation_dropout=0.0, - group_detr: int = 13, - init_std=0.02, - disable_custom_kernels=True, - # loss - class_cost=2, - bbox_cost=5, - giou_cost=2, - mask_loss_coefficient=1, - dice_loss_coefficient=1, - bbox_loss_coefficient=5, - giou_loss_coefficient=2, - eos_coefficient=0.1, - focal_alpha=0.25, - auxiliary_loss=True, - **kwargs, - ): - self.batch_norm_eps = batch_norm_eps - - backbone_config, kwargs = consolidate_backbone_kwargs_to_config( - backbone_config=backbone_config, + backbone_config: dict | PreTrainedConfig | None = None + projector_scale_factors: list[float] | tuple[float, ...] 
= () + hidden_expansion: float = 0.5 + c2f_num_blocks: int = 3 + activation_function: str = "silu" + batch_norm_eps: float = 1e-5 + dropout: float | int = 0.0 + decoder_ffn_dim: int = 2048 + decoder_n_points: int = 4 + decoder_layers: int = 3 + decoder_self_attention_heads: int = 8 + decoder_cross_attention_heads: int = 16 + decoder_activation_function: str = "relu" + num_queries: int = 300 + attention_bias: bool = True + attention_dropout: float | int = 0.0 + activation_dropout: float | int = 0.0 + group_detr: int = 13 + init_std: float = 0.02 + disable_custom_kernels: bool = True + class_cost: int = 2 + bbox_cost: int = 5 + giou_cost: int = 2 + mask_loss_coefficient: int = 1 + dice_loss_coefficient: int = 1 + bbox_loss_coefficient: int = 5 + giou_loss_coefficient: int = 2 + eos_coefficient: float = 0.1 + focal_alpha: float = 0.25 + auxiliary_loss: bool = True + d_model: int = 256 + + def __post_init__(self, **kwargs): + self.backbone_config, kwargs = consolidate_backbone_kwargs_to_config( + backbone_config=self.backbone_config, default_config_type="lw_detr_vit", default_config_kwargs={ "image_size": 1024, @@ -222,48 +201,16 @@ def __init__( **kwargs, ) - self.backbone_config = backbone_config - # projector - self.projector_scale_factors = projector_scale_factors - for scale in projector_scale_factors: + self.projector_in_channels = [self.d_model] * len(self.projector_scale_factors) + self.projector_out_channels = self.d_model + self.num_feature_levels = len(self.projector_scale_factors) + super().__post_init__(**kwargs) + + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + for scale in self.projector_scale_factors: if scale not in [0.5, 1.0, 2.0]: raise ValueError(f"Unsupported scale factor: {scale}") - self.projector_in_channels = [d_model] * len(projector_scale_factors) - self.projector_out_channels = d_model - self.activation_function = activation_function - self.hidden_expansion = hidden_expansion - self.c2f_num_blocks = c2f_num_blocks - # decoder - self.d_model = d_model - self.dropout = dropout - self.num_queries = num_queries - self.decoder_ffn_dim = decoder_ffn_dim - self.num_feature_levels = len(self.projector_scale_factors) - self.decoder_n_points = decoder_n_points - self.decoder_layers = decoder_layers - self.decoder_activation_function = decoder_activation_function - self.decoder_self_attention_heads = decoder_self_attention_heads - self.decoder_cross_attention_heads = decoder_cross_attention_heads - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - # model - self.init_std = init_std - self.group_detr = group_detr - # Loss - self.auxiliary_loss = auxiliary_loss - # Hungarian matcher - self.class_cost = class_cost - self.bbox_cost = bbox_cost - self.giou_cost = giou_cost - # Loss coefficients - self.dice_loss_coefficient = dice_loss_coefficient - self.bbox_loss_coefficient = bbox_loss_coefficient - self.giou_loss_coefficient = giou_loss_coefficient - self.eos_coefficient = eos_coefficient - self.focal_alpha = focal_alpha - self.disable_custom_kernels = disable_custom_kernels - super().__init__(**kwargs) __all__ = ["LwDetrConfig", "LwDetrViTConfig"] diff --git a/src/transformers/models/lw_detr/modular_lw_detr.py b/src/transformers/models/lw_detr/modular_lw_detr.py index c3e5a37dd0d5..2a792b0ca66b 100644 --- a/src/transformers/models/lw_detr/modular_lw_detr.py +++ b/src/transformers/models/lw_detr/modular_lw_detr.py @@ -17,6 
+17,7 @@ from typing import Any import torch +from huggingface_hub.dataclasses import strict from torch import nn from ... import initialization as init @@ -27,7 +28,7 @@ from ...modeling_outputs import BackboneOutput, BaseModelOutput from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack -from ...utils import ModelOutput, TransformersKwargs, auto_docstring, logging +from ...utils import ModelOutput, TransformersKwargs, auto_docstring from ...utils.generic import can_return_tuple, merge_with_config_defaults from ...utils.output_capturing import capture_outputs from ..auto import AutoConfig @@ -52,10 +53,8 @@ ) -logger = logging.get_logger(__name__) - - @auto_docstring(checkpoint="AnnaZhang/lwdetr_small_60e_coco") +@strict(accept_kwargs=True) class LwDetrViTConfig(VitDetConfig): r""" pretrain_image_size (`int`, *optional*, defaults to 224): @@ -87,68 +86,33 @@ class LwDetrViTConfig(VitDetConfig): model_type = "lw_detr_vit" - def __init__( - self, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - mlp_ratio=4, - hidden_act="gelu", - dropout_prob=0.0, - initializer_range=0.02, - layer_norm_eps=1e-6, - image_size=256, - pretrain_image_size=224, - patch_size=16, - num_channels=3, - qkv_bias=True, - window_block_indices=[], - use_absolute_position_embeddings=True, - out_features=None, - out_indices=None, - cae_init_values: float = 0.1, - num_windows=16, - **kwargs, - ): - super().__init__( - hidden_size=hidden_size, - num_hidden_layers=num_hidden_layers, - num_attention_heads=num_attention_heads, - mlp_ratio=mlp_ratio, - hidden_act=hidden_act, - dropout_prob=dropout_prob, - initializer_range=initializer_range, - layer_norm_eps=layer_norm_eps, - image_size=image_size, - pretrain_image_size=pretrain_image_size, - patch_size=patch_size, - num_channels=num_channels, - qkv_bias=qkv_bias, - window_block_indices=window_block_indices, - use_absolute_position_embeddings=use_absolute_position_embeddings, - out_features=out_features, - out_indices=out_indices, - **kwargs, - ) - del self.residual_block_indices - del self.use_relative_position_embeddings - del self.window_size - del self.drop_path_rate + image_size: int | list[int] | tuple[int, int] = 256 + cae_init_values: float = 0.1 + num_windows: int = 16 + + residual_block_indices = AttributeError() + use_relative_position_embeddings = AttributeError() + window_size = AttributeError() + drop_path_rate = AttributeError() - self.cae_init_values = cae_init_values - if num_windows % math.sqrt(num_windows) != 0: + def __post_init__(self, **kwargs): + self.num_windows_side = int(math.sqrt(self.num_windows)) + super().__post_init__(**kwargs) + + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + if self.num_windows % math.sqrt(self.num_windows) != 0: raise ValueError( - f"`num_windows` has to be a perfect square, where num_windows % math.sqrt(num_windows) != 0, but got {num_windows}." + f"`num_windows` has to be a perfect square, where num_windows % math.sqrt(num_windows) != 0, but got {self.num_windows}." ) - if image_size / num_windows % math.sqrt(num_windows) != 0: + if self.image_size / self.num_windows % math.sqrt(self.num_windows) != 0: raise ValueError( - f"`image_size` has to be divisible by `num_windows`, where image_size / num_windows % math.sqrt(num_windows) != 0,but got {image_size} and {num_windows}." 
+ f"`image_size` has to be divisible by `num_windows`, where image_size / num_windows % math.sqrt(num_windows) != 0,but got {self.image_size} and {self.num_windows}." ) - self.num_windows = num_windows - self.num_windows_side = int(math.sqrt(num_windows)) @auto_docstring(checkpoint="AnnaZhang/lwdetr_small_60e_coco") +@strict(accept_kwargs=True) class LwDetrConfig(PreTrainedConfig): r""" projector_scale_factors (`list[float]`, *optional*, defaults to `[]`): @@ -201,50 +165,41 @@ class LwDetrConfig(PreTrainedConfig): model_type = "lw_detr" sub_configs = {"backbone_config": AutoConfig} - def __init__( - self, - # backbone - backbone_config=None, - # projector - projector_scale_factors: list[float] = [], - hidden_expansion=0.5, - c2f_num_blocks=3, - activation_function="silu", - batch_norm_eps=1e-5, - # decoder - d_model=256, - dropout=0.0, - decoder_ffn_dim=2048, - decoder_n_points=4, - decoder_layers: int = 3, - decoder_self_attention_heads: int = 8, - decoder_cross_attention_heads: int = 16, - decoder_activation_function="relu", - # model - num_queries=300, - attention_bias=True, - attention_dropout=0.0, - activation_dropout=0.0, - group_detr: int = 13, - init_std=0.02, - disable_custom_kernels=True, - # loss - class_cost=2, - bbox_cost=5, - giou_cost=2, - mask_loss_coefficient=1, - dice_loss_coefficient=1, - bbox_loss_coefficient=5, - giou_loss_coefficient=2, - eos_coefficient=0.1, - focal_alpha=0.25, - auxiliary_loss=True, - **kwargs, - ): - self.batch_norm_eps = batch_norm_eps - - backbone_config, kwargs = consolidate_backbone_kwargs_to_config( - backbone_config=backbone_config, + backbone_config: dict | PreTrainedConfig | None = None + projector_scale_factors: list[float] | tuple[float, ...] = () + hidden_expansion: float = 0.5 + c2f_num_blocks: int = 3 + activation_function: str = "silu" + batch_norm_eps: float = 1e-5 + dropout: float | int = 0.0 + decoder_ffn_dim: int = 2048 + decoder_n_points: int = 4 + decoder_layers: int = 3 + decoder_self_attention_heads: int = 8 + decoder_cross_attention_heads: int = 16 + decoder_activation_function: str = "relu" + num_queries: int = 300 + attention_bias: bool = True + attention_dropout: float | int = 0.0 + activation_dropout: float | int = 0.0 + group_detr: int = 13 + init_std: float = 0.02 + disable_custom_kernels: bool = True + class_cost: int = 2 + bbox_cost: int = 5 + giou_cost: int = 2 + mask_loss_coefficient: int = 1 + dice_loss_coefficient: int = 1 + bbox_loss_coefficient: int = 5 + giou_loss_coefficient: int = 2 + eos_coefficient: float = 0.1 + focal_alpha: float = 0.25 + auxiliary_loss: bool = True + d_model: int = 256 + + def __post_init__(self, **kwargs): + self.backbone_config, kwargs = consolidate_backbone_kwargs_to_config( + backbone_config=self.backbone_config, default_config_type="lw_detr_vit", default_config_kwargs={ "image_size": 1024, @@ -256,48 +211,16 @@ def __init__( **kwargs, ) - self.backbone_config = backbone_config - # projector - self.projector_scale_factors = projector_scale_factors - for scale in projector_scale_factors: + self.projector_in_channels = [self.d_model] * len(self.projector_scale_factors) + self.projector_out_channels = self.d_model + self.num_feature_levels = len(self.projector_scale_factors) + super().__post_init__(**kwargs) + + def validate_architecture(self): + """Part of `@strict`-powered validation. 
Validates the architecture of the config.""" + for scale in self.projector_scale_factors: if scale not in [0.5, 1.0, 2.0]: raise ValueError(f"Unsupported scale factor: {scale}") - self.projector_in_channels = [d_model] * len(projector_scale_factors) - self.projector_out_channels = d_model - self.activation_function = activation_function - self.hidden_expansion = hidden_expansion - self.c2f_num_blocks = c2f_num_blocks - # decoder - self.d_model = d_model - self.dropout = dropout - self.num_queries = num_queries - self.decoder_ffn_dim = decoder_ffn_dim - self.num_feature_levels = len(self.projector_scale_factors) - self.decoder_n_points = decoder_n_points - self.decoder_layers = decoder_layers - self.decoder_activation_function = decoder_activation_function - self.decoder_self_attention_heads = decoder_self_attention_heads - self.decoder_cross_attention_heads = decoder_cross_attention_heads - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - # model - self.init_std = init_std - self.group_detr = group_detr - # Loss - self.auxiliary_loss = auxiliary_loss - # Hungarian matcher - self.class_cost = class_cost - self.bbox_cost = bbox_cost - self.giou_cost = giou_cost - # Loss coefficients - self.dice_loss_coefficient = dice_loss_coefficient - self.bbox_loss_coefficient = bbox_loss_coefficient - self.giou_loss_coefficient = giou_loss_coefficient - self.eos_coefficient = eos_coefficient - self.focal_alpha = focal_alpha - self.disable_custom_kernels = disable_custom_kernels - super().__init__(**kwargs) class LwDetrViTSelfAttention(ViTSelfAttention): diff --git a/src/transformers/models/lxmert/configuration_lxmert.py b/src/transformers/models/lxmert/configuration_lxmert.py index 6b405838e5db..1d60d1bf38b9 100644 --- a/src/transformers/models/lxmert/configuration_lxmert.py +++ b/src/transformers/models/lxmert/configuration_lxmert.py @@ -13,14 +13,14 @@ # limitations under the License. 
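# --- illustrative sketch (not part of the diff) -----------------------------
# Standalone restatement of the window checks LwDetrViTConfig performs in
# validate_architecture() above: num_windows must be a perfect square and
# image_size must split evenly into that window grid. The function name is
# hypothetical; the float-modulo conditions are copied from the hunks above.
import math


def _check_window_layout(image_size: int, num_windows: int) -> int:
    if num_windows % math.sqrt(num_windows) != 0:
        raise ValueError(f"`num_windows` has to be a perfect square, but got {num_windows}.")
    if image_size / num_windows % math.sqrt(num_windows) != 0:
        raise ValueError(
            f"`image_size` ({image_size}) is not compatible with `num_windows` ({num_windows})."
        )
    # Corresponds to num_windows_side computed in __post_init__.
    return int(math.sqrt(num_windows))


assert _check_window_layout(image_size=256, num_windows=16) == 4
# -----------------------------------------------------------------------------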
"""LXMERT model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="unc-nlp/lxmert-base-uncased") +@strict(accept_kwargs=True) class LxmertConfig(PreTrainedConfig): r""" num_qa_labels (`int`, *optional*, defaults to 9500): @@ -69,72 +69,40 @@ class LxmertConfig(PreTrainedConfig): model_type = "lxmert" attribute_map = {} - def __init__( - self, - vocab_size=30522, - hidden_size=768, - num_attention_heads=12, - num_qa_labels=9500, - num_object_labels=1600, - num_attr_labels=400, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=2, - initializer_range=0.02, - l_layers=9, - x_layers=5, - r_layers=5, - visual_feat_dim=2048, - visual_pos_dim=4, - visual_loss_normalizer=6.67, - task_matched=True, - task_mask_lm=True, - task_obj_predict=True, - task_qa=True, - visual_obj_loss=True, - visual_attr_loss=True, - visual_feat_loss=True, - pad_token_id=None, - bos_token_id=None, - eos_token_id=None, - tie_word_embeddings=True, - **kwargs, - ): - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range - self.num_qa_labels = num_qa_labels - self.num_object_labels = num_object_labels - self.num_attr_labels = num_attr_labels - self.l_layers = l_layers - self.x_layers = x_layers - self.r_layers = r_layers - self.visual_feat_dim = visual_feat_dim - self.visual_pos_dim = visual_pos_dim - self.visual_loss_normalizer = visual_loss_normalizer - self.task_matched = task_matched - self.task_mask_lm = task_mask_lm - self.task_obj_predict = task_obj_predict - self.task_qa = task_qa - self.visual_obj_loss = visual_obj_loss - self.visual_attr_loss = visual_attr_loss - self.visual_feat_loss = visual_feat_loss - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - self.num_hidden_layers = {"vision": r_layers, "cross_encoder": x_layers, "language": l_layers} - super().__init__(**kwargs) + vocab_size: int = 30522 + hidden_size: int = 768 + num_attention_heads: int = 12 + num_qa_labels: int = 9500 + num_object_labels: int = 1600 + num_attr_labels: int = 400 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.1 + attention_probs_dropout_prob: float = 0.1 + max_position_embeddings: int = 512 + type_vocab_size: int = 2 + initializer_range: float = 0.02 + l_layers: int = 9 + x_layers: int = 5 + r_layers: int = 5 + visual_feat_dim: int = 2048 + visual_pos_dim: int = 4 + visual_loss_normalizer: float = 6.67 + task_matched: bool = True + task_mask_lm: bool = True + task_obj_predict: bool = True + task_qa: bool = True + visual_obj_loss: bool = True + visual_attr_loss: bool = True + visual_feat_loss: bool = True + pad_token_id: int | None = None + bos_token_id: int | None = None + eos_token_id: int | list[int] | None = None + 
tie_word_embeddings: bool = True + + def __post_init__(self, **kwargs): + self.num_hidden_layers = {"vision": self.r_layers, "cross_encoder": self.x_layers, "language": self.l_layers} + super().__post_init__(**kwargs) __all__ = ["LxmertConfig"] diff --git a/src/transformers/models/lxmert/modeling_lxmert.py b/src/transformers/models/lxmert/modeling_lxmert.py index 4a96f38451df..425d47978f0d 100644 --- a/src/transformers/models/lxmert/modeling_lxmert.py +++ b/src/transformers/models/lxmert/modeling_lxmert.py @@ -734,7 +734,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -1037,7 +1037,7 @@ def forward( a one hot representation hof the correct answer *optional* """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict device = input_ids.device if input_ids is not None else inputs_embeds.device lxmert_output = self.lxmert( @@ -1258,7 +1258,7 @@ def forward( labels (`Torch.Tensor` of shape `(batch_size)`, *optional*): A one-hot representation of the correct answer """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict lxmert_output = self.lxmert( input_ids=input_ids, diff --git a/src/transformers/models/m2m_100/configuration_m2m_100.py b/src/transformers/models/m2m_100/configuration_m2m_100.py index af35d2eaf13b..498c4a9a3562 100644 --- a/src/transformers/models/m2m_100/configuration_m2m_100.py +++ b/src/transformers/models/m2m_100/configuration_m2m_100.py @@ -13,14 +13,14 @@ # limitations under the License. 
"""M2M100 model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="facebook/m2m100_418M") +@strict(accept_kwargs=True) class M2M100Config(PreTrainedConfig): r""" Example: @@ -40,62 +40,36 @@ class M2M100Config(PreTrainedConfig): model_type = "m2m_100" keys_to_ignore_at_inference = ["past_key_values"] - attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"} - - def __init__( - self, - vocab_size=128112, - max_position_embeddings=1024, - encoder_layers=12, - encoder_ffn_dim=4096, - encoder_attention_heads=16, - decoder_layers=12, - decoder_ffn_dim=4096, - decoder_attention_heads=16, - encoder_layerdrop=0.05, - decoder_layerdrop=0.05, - use_cache=True, - is_encoder_decoder=True, - activation_function="relu", - d_model=1024, - dropout=0.1, - attention_dropout=0.1, - activation_dropout=0.0, - init_std=0.02, - decoder_start_token_id=2, - scale_embedding=True, - pad_token_id=1, - bos_token_id=0, - eos_token_id=2, - tie_word_embeddings=True, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.d_model = d_model - self.encoder_ffn_dim = encoder_ffn_dim - self.encoder_layers = encoder_layers - self.encoder_attention_heads = encoder_attention_heads - self.decoder_ffn_dim = decoder_ffn_dim - self.decoder_layers = decoder_layers - self.decoder_attention_heads = decoder_attention_heads - self.dropout = dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.activation_function = activation_function - self.init_std = init_std - self.encoder_layerdrop = encoder_layerdrop - self.decoder_layerdrop = decoder_layerdrop - self.use_cache = use_cache - self.num_hidden_layers = encoder_layers - self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True + attribute_map = { + "num_attention_heads": "encoder_attention_heads", + "hidden_size": "d_model", + "num_hidden_layers": "encoder_layers", + } - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.decoder_start_token_id = decoder_start_token_id - self.tie_word_embeddings = tie_word_embeddings - super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + vocab_size: int = 128112 + max_position_embeddings: int = 1024 + encoder_layers: int = 12 + encoder_ffn_dim: int = 4096 + encoder_attention_heads: int = 16 + decoder_layers: int = 12 + decoder_ffn_dim: int = 4096 + decoder_attention_heads: int = 16 + encoder_layerdrop: float | int = 0.05 + decoder_layerdrop: float | int = 0.05 + use_cache: bool = True + is_encoder_decoder: bool = True + activation_function: str = "relu" + d_model: int = 1024 + dropout: float | int = 0.1 + attention_dropout: float | int = 0.1 + activation_dropout: float | int = 0.0 + init_std: float = 0.02 + decoder_start_token_id: int | None = 2 + scale_embedding: int = True + pad_token_id: int | None = 1 + bos_token_id: int | None = 0 + eos_token_id: int | None = 2 + tie_word_embeddings: bool = True __all__ = ["M2M100Config"] diff --git a/src/transformers/models/mamba/configuration_mamba.py b/src/transformers/models/mamba/configuration_mamba.py index 68533fe90c0b..61bb91416b0f 100644 --- a/src/transformers/models/mamba/configuration_mamba.py +++ 
b/src/transformers/models/mamba/configuration_mamba.py @@ -15,14 +15,14 @@ import math -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="state-spaces/mamba-2.8b") +@strict(accept_kwargs=True) class MambaConfig(PreTrainedConfig): """ layer_norm_epsilon (`float`, *optional*, defaults to 1e-05): @@ -64,65 +64,39 @@ class MambaConfig(PreTrainedConfig): model_type = "mamba" - def __init__( - self, - vocab_size=50280, - hidden_size=768, - state_size=16, - num_hidden_layers=32, - layer_norm_epsilon=1e-5, - pad_token_id=0, - bos_token_id=0, - eos_token_id=0, - expand=2, - conv_kernel=4, - use_bias=False, - use_conv_bias=True, - hidden_act="silu", - initializer_range=0.1, - residual_in_fp32=True, - time_step_rank="auto", - time_step_scale=1.0, - time_step_min=0.001, - time_step_max=0.1, - time_step_init_scheme="random", - time_step_floor=1e-4, - rescale_prenorm_residual=False, - use_cache=True, - use_mambapy=False, - use_associative_scan=True, - tie_word_embeddings=True, - **kwargs, - ): - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.state_size = state_size - self.num_hidden_layers = num_hidden_layers - self.layer_norm_epsilon = layer_norm_epsilon - self.conv_kernel = conv_kernel - self.expand = expand - self.intermediate_size = int(expand * self.hidden_size) - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.use_bias = use_bias - self.use_conv_bias = use_conv_bias - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.time_step_rank = math.ceil(self.hidden_size / 16) if time_step_rank == "auto" else time_step_rank - self.time_step_scale = time_step_scale - self.time_step_min = time_step_min - self.time_step_max = time_step_max - self.time_step_init_scheme = time_step_init_scheme - self.time_step_floor = time_step_floor - self.rescale_prenorm_residual = rescale_prenorm_residual - self.residual_in_fp32 = residual_in_fp32 - self.use_cache = use_cache - self.use_mambapy = use_mambapy - self.use_associative_scan = use_associative_scan - self.tie_word_embeddings = tie_word_embeddings - - super().__init__(**kwargs) + vocab_size: int = 50280 + hidden_size: int = 768 + state_size: int = 16 + num_hidden_layers: int = 32 + layer_norm_epsilon: float = 1e-5 + pad_token_id: int | None = 0 + bos_token_id: int | None = 0 + eos_token_id: int | list[int] | None = 0 + expand: int = 2 + conv_kernel: int = 4 + use_bias: bool = False + use_conv_bias: bool = True + hidden_act: str = "silu" + initializer_range: float = 0.1 + residual_in_fp32: bool = True + time_step_rank: str | int = "auto" + time_step_scale: float = 1.0 + time_step_min: float = 0.001 + time_step_max: float = 0.1 + time_step_init_scheme: str = "random" + time_step_floor: float = 1e-4 + rescale_prenorm_residual: bool = False + use_cache: bool = True + use_mambapy: bool = False + use_associative_scan: bool = True + tie_word_embeddings: bool = True + + def __post_init__(self, **kwargs): + self.intermediate_size = int(self.expand * self.hidden_size) + self.time_step_rank = ( + math.ceil(self.hidden_size / 16) if self.time_step_rank == "auto" else self.time_step_rank + ) + super().__post_init__(**kwargs) __all__ = ["MambaConfig"] diff --git a/src/transformers/models/mamba/modeling_mamba.py 
b/src/transformers/models/mamba/modeling_mamba.py index 2cc267ee22d9..73268544ffa6 100644 --- a/src/transformers/models/mamba/modeling_mamba.py +++ b/src/transformers/models/mamba/modeling_mamba.py @@ -671,7 +671,7 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) use_cache = use_cache if use_cache is not None else (self.config.use_cache if not self.training else False) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if (input_ids is None) ^ (inputs_embeds is not None): # ^ is python for xor raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -836,7 +836,7 @@ def forward( use_cache (`bool`, *optional*): If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict mamba_outputs = self.backbone( input_ids, diff --git a/src/transformers/models/mamba2/configuration_mamba2.py b/src/transformers/models/mamba2/configuration_mamba2.py index 575d61520393..8f289a31caa3 100644 --- a/src/transformers/models/mamba2/configuration_mamba2.py +++ b/src/transformers/models/mamba2/configuration_mamba2.py @@ -15,14 +15,14 @@ import math -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="state-spaces/mamba2-2.8b") +@strict(accept_kwargs=True) class Mamba2Config(PreTrainedConfig): """ layer_norm_epsilon (`float`, *optional*, defaults to 1e-05): @@ -59,81 +59,49 @@ class Mamba2Config(PreTrainedConfig): model_type = "mamba2" - def __init__( - self, - num_heads=128, - head_dim=64, - vocab_size=32768, - hidden_size=4096, - state_size=128, - num_hidden_layers=64, - layer_norm_epsilon=1e-5, - pad_token_id=1, - bos_token_id=0, - eos_token_id=2, - expand=2, - conv_kernel=4, - n_groups=8, - use_bias=False, - use_conv_bias=True, - hidden_act="silu", - initializer_range=0.1, - residual_in_fp32=True, - time_step_rank="auto", - time_step_min=0.001, - time_step_max=0.1, - time_step_floor=1e-4, - time_step_limit=(0.0, float("inf")), - rescale_prenorm_residual=False, - use_cache=True, - rms_norm=True, - chunk_size=256, - tie_word_embeddings=False, - **kwargs, - ): - if (hidden_size * expand) != (num_heads * head_dim): + num_heads: int = 128 + head_dim: int = 64 + vocab_size: int = 32768 + hidden_size: int = 4096 + state_size: int = 128 + num_hidden_layers: int = 64 + layer_norm_epsilon: float = 1e-5 + pad_token_id: int | None = 1 + bos_token_id: int | None = 0 + eos_token_id: int | None = 2 + expand: int = 2 + conv_kernel: int = 4 + n_groups: int = 8 + use_bias: bool = False + use_conv_bias: bool = True + hidden_act: str = "silu" + initializer_range: float = 0.1 + residual_in_fp32: bool = True + time_step_rank: str | int = "auto" + time_step_min: float = 0.001 + time_step_max: float = 0.1 + time_step_floor: float = 1e-4 + time_step_limit: list[float] | tuple[float, ...] 
= (0.0, float("inf")) + rescale_prenorm_residual: bool = False + use_cache: bool = True + rms_norm: bool = True + chunk_size: int = 256 + tie_word_embeddings: bool = False + + def __post_init__(self, **kwargs): + self.time_step_rank = ( + math.ceil(self.hidden_size / 16) if self.time_step_rank == "auto" else self.time_step_rank + ) + super().__post_init__(**kwargs) + + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + if (self.hidden_size * self.expand) != (self.num_heads * self.head_dim): raise ValueError( "Inconsistent configuration: hidden_size * expand " - f"({hidden_size * expand}) must equal num_heads * head_dim " - f"({num_heads * head_dim})." + f"({self.hidden_size * self.expand}) must equal num_heads * head_dim " + f"({self.num_heads * self.head_dim})." ) - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.state_size = state_size - self.num_hidden_layers = num_hidden_layers - self.layer_norm_epsilon = layer_norm_epsilon - self.conv_kernel = conv_kernel - self.expand = expand - - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.use_bias = use_bias - self.use_conv_bias = use_conv_bias - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.time_step_rank = math.ceil(self.hidden_size / 16) if time_step_rank == "auto" else time_step_rank - self.time_step_min = time_step_min - self.time_step_max = time_step_max - self.time_step_floor = time_step_floor - self.rescale_prenorm_residual = rescale_prenorm_residual - self.residual_in_fp32 = residual_in_fp32 - self.use_cache = use_cache - self.n_groups = n_groups - self.num_heads = num_heads - self.head_dim = head_dim - self.rms_norm = rms_norm - self.state_size = state_size - self.chunk_size = chunk_size - self.time_step_limit = time_step_limit - self.tie_word_embeddings = tie_word_embeddings - - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.tie_word_embeddings = tie_word_embeddings - super().__init__(**kwargs) - __all__ = ["Mamba2Config"] diff --git a/src/transformers/models/mamba2/modeling_mamba2.py b/src/transformers/models/mamba2/modeling_mamba2.py index 2f3dbe1d1598..a6f7469a2928 100644 --- a/src/transformers/models/mamba2/modeling_mamba2.py +++ b/src/transformers/models/mamba2/modeling_mamba2.py @@ -874,7 +874,7 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) use_cache = use_cache if use_cache is not None else (self.config.use_cache if not self.training else False) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if (input_ids is None) ^ (inputs_embeds is not None): # ^ is python for xor raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -1024,7 +1024,7 @@ def forward( The position of the current input in the cache. This is used to ensure that the cache is correctly updated. If `cache_params` is passed, `cache_position` should also be passed. 
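Not part of the diff — a short sketch of the new `validate_architecture` hook on `Mamba2Config`, assuming the `@strict`-powered validation runs it at construction time as the docstring above suggests:

from transformers import Mamba2Config

# Consistent: hidden_size * expand == num_heads * head_dim (4096 * 2 == 128 * 64)
Mamba2Config(hidden_size=4096, expand=2, num_heads=128, head_dim=64)

# Inconsistent shapes should now be rejected up front
try:
    Mamba2Config(hidden_size=4096, expand=2, num_heads=100, head_dim=64)
except ValueError as err:
    print(err)  # "Inconsistent configuration: hidden_size * expand (8192) must equal ..."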
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict mamba2_outputs = self.backbone( input_ids, diff --git a/src/transformers/models/marian/configuration_marian.py b/src/transformers/models/marian/configuration_marian.py index 6c7f5829ad64..90e796fa4d3e 100644 --- a/src/transformers/models/marian/configuration_marian.py +++ b/src/transformers/models/marian/configuration_marian.py @@ -13,14 +13,14 @@ # limitations under the License. """Marian model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="Helsinki-NLP/opus-mt-en-de") +@strict(accept_kwargs=True) class MarianConfig(PreTrainedConfig): r""" share_encoder_decoder_embeddings (`bool`, *optional*, defaults to `True`): @@ -45,73 +45,44 @@ class MarianConfig(PreTrainedConfig): model_type = "marian" keys_to_ignore_at_inference = ["past_key_values"] - attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"} + attribute_map = { + "num_attention_heads": "encoder_attention_heads", + "hidden_size": "d_model", + "num_hidden_layers": "encoder_layers", + } + + vocab_size: int = 58101 + decoder_vocab_size: int | None = None + max_position_embeddings: int = 1024 + encoder_layers: int = 12 + encoder_ffn_dim: int = 4096 + encoder_attention_heads: int = 16 + decoder_layers: int = 12 + decoder_ffn_dim: int = 4096 + decoder_attention_heads: int = 16 + encoder_layerdrop: float | int = 0.0 + decoder_layerdrop: float | int = 0.0 + use_cache: bool = True + is_encoder_decoder: bool = True + activation_function: str = "gelu" + d_model: int = 1024 + dropout: float | int = 0.1 + attention_dropout: float | int = 0.0 + activation_dropout: float | int = 0.0 + init_std: float = 0.02 + decoder_start_token_id: int = 58100 + scale_embedding: bool = False + pad_token_id: int | None = 58100 + eos_token_id: int | None = 0 + bos_token_id: int | None = None + forced_eos_token_id: int | None = 0 + share_encoder_decoder_embeddings: bool = True + is_decoder: bool = False + tie_word_embeddings: bool = True - def __init__( - self, - vocab_size=58101, - decoder_vocab_size=None, - max_position_embeddings=1024, - encoder_layers=12, - encoder_ffn_dim=4096, - encoder_attention_heads=16, - decoder_layers=12, - decoder_ffn_dim=4096, - decoder_attention_heads=16, - encoder_layerdrop=0.0, - decoder_layerdrop=0.0, - use_cache=True, - is_encoder_decoder=True, - activation_function="gelu", - d_model=1024, - dropout=0.1, - attention_dropout=0.0, - activation_dropout=0.0, - init_std=0.02, - decoder_start_token_id=58100, - scale_embedding=False, - pad_token_id=58100, - eos_token_id=0, - bos_token_id=None, - forced_eos_token_id=0, - share_encoder_decoder_embeddings=True, - is_decoder=False, - tie_word_embeddings=True, - **kwargs, - ): - self.is_decoder = is_decoder - self.tie_word_embeddings = tie_word_embeddings - self.vocab_size = vocab_size - self.decoder_vocab_size = decoder_vocab_size or vocab_size - self.max_position_embeddings = max_position_embeddings - self.d_model = d_model - self.encoder_ffn_dim = encoder_ffn_dim - self.encoder_layers = encoder_layers - self.encoder_attention_heads = encoder_attention_heads - self.decoder_ffn_dim = decoder_ffn_dim - 
self.decoder_layers = decoder_layers - self.decoder_attention_heads = decoder_attention_heads - self.dropout = dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.activation_function = activation_function - self.init_std = init_std - self.encoder_layerdrop = encoder_layerdrop - self.decoder_layerdrop = decoder_layerdrop - self.use_cache = use_cache - self.num_hidden_layers = encoder_layers - self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True - self.share_encoder_decoder_embeddings = share_encoder_decoder_embeddings - self.pad_token_id = pad_token_id - self.eos_token_id = eos_token_id - self.bos_token_id = bos_token_id - self.decoder_start_token_id = decoder_start_token_id - self.tie_word_embeddings = share_encoder_decoder_embeddings - super().__init__( - is_encoder_decoder=is_encoder_decoder, - forced_eos_token_id=forced_eos_token_id, - **kwargs, - ) + def __post_init__(self, **kwargs): + self.decoder_vocab_size = self.decoder_vocab_size or self.vocab_size + super().__post_init__(**kwargs) __all__ = ["MarianConfig"] diff --git a/src/transformers/models/markuplm/configuration_markuplm.py b/src/transformers/models/markuplm/configuration_markuplm.py index 4811d344deda..fecd089f55de 100644 --- a/src/transformers/models/markuplm/configuration_markuplm.py +++ b/src/transformers/models/markuplm/configuration_markuplm.py @@ -13,14 +13,14 @@ # limitations under the License. """MarkupLM model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="microsoft/markuplm-base") +@strict(accept_kwargs=True) class MarkupLMConfig(PreTrainedConfig): r""" max_tree_id_unit_embeddings (`int`, *optional*, defaults to 1024): @@ -61,58 +61,29 @@ class MarkupLMConfig(PreTrainedConfig): model_type = "markuplm" - def __init__( - self, - vocab_size=30522, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=2, - initializer_range=0.02, - layer_norm_eps=1e-12, - pad_token_id=0, - bos_token_id=0, - eos_token_id=2, - max_xpath_tag_unit_embeddings=256, - max_xpath_subs_unit_embeddings=1024, - tag_pad_id=216, - subs_pad_id=1001, - xpath_unit_hidden_size=32, - max_depth=50, - use_cache=True, - classifier_dropout=None, - **kwargs, - ): - super().__init__(**kwargs) - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.use_cache = use_cache - self.classifier_dropout = classifier_dropout - # additional properties - self.max_depth = max_depth - self.max_xpath_tag_unit_embeddings = max_xpath_tag_unit_embeddings - 
self.max_xpath_subs_unit_embeddings = max_xpath_subs_unit_embeddings - self.tag_pad_id = tag_pad_id - self.subs_pad_id = subs_pad_id - self.xpath_unit_hidden_size = xpath_unit_hidden_size + vocab_size: int = 30522 + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.1 + attention_probs_dropout_prob: float = 0.1 + max_position_embeddings: int = 512 + type_vocab_size: int = 2 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + pad_token_id: int | None = 0 + bos_token_id: int | None = 0 + eos_token_id: int | None = 2 + max_xpath_tag_unit_embeddings: int = 256 + max_xpath_subs_unit_embeddings: int = 1024 + tag_pad_id: int = 216 + subs_pad_id: int = 1001 + xpath_unit_hidden_size: int = 32 + max_depth: int = 50 + use_cache: bool = True + classifier_dropout: float | int | None = None __all__ = ["MarkupLMConfig"] diff --git a/src/transformers/models/mask2former/configuration_mask2former.py b/src/transformers/models/mask2former/configuration_mask2former.py index 93efded95ce3..01e6a47317f0 100644 --- a/src/transformers/models/mask2former/configuration_mask2former.py +++ b/src/transformers/models/mask2former/configuration_mask2former.py @@ -13,6 +13,8 @@ # limitations under the License. """Mask2Former model configuration""" +from huggingface_hub.dataclasses import strict + from ...backbone_utils import consolidate_backbone_kwargs_to_config from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring, logging @@ -23,6 +25,7 @@ @auto_docstring(checkpoint="facebook/mask2former-swin-small-coco-instance") +@strict(accept_kwargs=True) class Mask2FormerConfig(PreTrainedConfig): r""" feature_size (`int`, *optional*, defaults to 256): @@ -76,42 +79,40 @@ class Mask2FormerConfig(PreTrainedConfig): model_type = "mask2former" sub_configs = {"backbone_config": AutoConfig} backbones_supported = ["swin"] - attribute_map = {"hidden_size": "hidden_dim"} - - def __init__( - self, - backbone_config: dict | PreTrainedConfig | None = None, - feature_size: int = 256, - mask_feature_size: int = 256, - hidden_dim: int = 256, - encoder_feedforward_dim: int = 1024, - activation_function: str = "relu", - encoder_layers: int = 6, - decoder_layers: int = 10, - num_attention_heads: int = 8, - dropout: float = 0.0, - dim_feedforward: int = 2048, - pre_norm: bool = False, - enforce_input_projection: bool = False, - common_stride: int = 4, - ignore_value: int = 255, - num_queries: int = 100, - no_object_weight: float = 0.1, - class_weight: float = 2.0, - mask_weight: float = 5.0, - dice_weight: float = 5.0, - train_num_points: int = 12544, - oversample_ratio: float = 3.0, - importance_sample_ratio: float = 0.75, - init_std: float = 0.02, - init_xavier_std: float = 1.0, - use_auxiliary_loss: bool = True, - feature_strides: list[int] = [4, 8, 16, 32], - output_auxiliary_logits: bool | None = None, - **kwargs, - ): - backbone_config, kwargs = consolidate_backbone_kwargs_to_config( - backbone_config=backbone_config, + attribute_map = {"hidden_size": "hidden_dim", "num_hidden_layers": "decoder_layers"} + + backbone_config: dict | PreTrainedConfig | None = None + feature_size: int = 256 + mask_feature_size: int = 256 + hidden_dim: int = 256 + encoder_feedforward_dim: int = 1024 + activation_function: str = "relu" + encoder_layers: int = 6 + decoder_layers: int = 10 + num_attention_heads: int = 8 + dropout: float | int = 0.0 + dim_feedforward: int = 2048 + pre_norm: bool = 
False + enforce_input_projection: bool = False + common_stride: int = 4 + ignore_value: int = 255 + num_queries: int = 100 + no_object_weight: float = 0.1 + class_weight: float = 2.0 + mask_weight: float = 5.0 + dice_weight: float = 5.0 + train_num_points: int = 12544 + oversample_ratio: float = 3.0 + importance_sample_ratio: float = 0.75 + init_std: float = 0.02 + init_xavier_std: float = 1.0 + use_auxiliary_loss: bool = True + feature_strides: list[int] | tuple[int, ...] = (4, 8, 16, 32) + output_auxiliary_logits: bool | None = None + + def __post_init__(self, **kwargs): + self.backbone_config, kwargs = consolidate_backbone_kwargs_to_config( + backbone_config=self.backbone_config, default_config_type="swin", default_config_kwargs={ "depths": [2, 2, 18, 2], @@ -121,44 +122,13 @@ def __init__( **kwargs, ) - # verify that the backbone is supported - if backbone_config.model_type not in self.backbones_supported: + if self.backbone_config.model_type not in self.backbones_supported: logger.warning_once( - f"Backbone {backbone_config.model_type} is not a supported model and may not be compatible with Mask2Former. " + f"Backbone {self.backbone_config.model_type} is not a supported model and may not be compatible with Mask2Former. " f"Supported model types: {','.join(self.backbones_supported)}" ) - self.backbone_config = backbone_config - self.feature_size = feature_size - self.mask_feature_size = mask_feature_size - self.hidden_dim = hidden_dim - self.encoder_feedforward_dim = encoder_feedforward_dim - self.activation_function = activation_function - self.encoder_layers = encoder_layers - self.decoder_layers = decoder_layers - self.num_attention_heads = num_attention_heads - self.dropout = dropout - self.dim_feedforward = dim_feedforward - self.pre_norm = pre_norm - self.enforce_input_projection = enforce_input_projection - self.common_stride = common_stride - self.ignore_value = ignore_value - self.num_queries = num_queries - self.no_object_weight = no_object_weight - self.class_weight = class_weight - self.mask_weight = mask_weight - self.dice_weight = dice_weight - self.train_num_points = train_num_points - self.oversample_ratio = oversample_ratio - self.importance_sample_ratio = importance_sample_ratio - self.init_std = init_std - self.init_xavier_std = init_xavier_std - self.use_auxiliary_loss = use_auxiliary_loss - self.feature_strides = feature_strides - self.output_auxiliary_logits = output_auxiliary_logits - self.num_hidden_layers = decoder_layers - - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["Mask2FormerConfig"] diff --git a/src/transformers/models/mask2former/modeling_mask2former.py b/src/transformers/models/mask2former/modeling_mask2former.py index 3d6a2acd0968..b20646bc2cb4 100644 --- a/src/transformers/models/mask2former/modeling_mask2former.py +++ b/src/transformers/models/mask2former/modeling_mask2former.py @@ -1166,7 +1166,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict hidden_states = inputs_embeds reference_points = self.get_reference_points(spatial_shapes_list, valid_ratios, device=inputs_embeds.device) @@ -1842,7 +1842,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if 
return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if inputs_embeds is not None: hidden_states = inputs_embeds @@ -2197,7 +2197,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict batch_size, _, height, width = pixel_values.shape @@ -2437,7 +2437,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.model( pixel_values=pixel_values, diff --git a/src/transformers/models/maskformer/configuration_maskformer.py b/src/transformers/models/maskformer/configuration_maskformer.py index 3b913ccfce13..69c65a435c7e 100644 --- a/src/transformers/models/maskformer/configuration_maskformer.py +++ b/src/transformers/models/maskformer/configuration_maskformer.py @@ -13,6 +13,8 @@ # limitations under the License. """MaskFormer model configuration""" +from huggingface_hub.dataclasses import strict + from ...backbone_utils import consolidate_backbone_kwargs_to_config from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring, logging @@ -24,6 +26,7 @@ @auto_docstring(checkpoint="facebook/maskformer-swin-base-ade") +@strict(accept_kwargs=True) class MaskFormerConfig(PreTrainedConfig): r""" cross_entropy_weight (`float`, *optional*, defaults to 1.0): @@ -67,24 +70,22 @@ class MaskFormerConfig(PreTrainedConfig): backbones_supported = ["resnet", "swin"] decoders_supported = ["detr"] - def __init__( - self, - fpn_feature_size: int = 256, - mask_feature_size: int = 256, - no_object_weight: float = 0.1, - use_auxiliary_loss: bool = False, - backbone_config: dict | PreTrainedConfig | None = None, - decoder_config: dict | None = None, - init_std: float = 0.02, - init_xavier_std: float = 1.0, - dice_weight: float = 1.0, - cross_entropy_weight: float = 1.0, - mask_weight: float = 20.0, - output_auxiliary_logits: bool | None = None, - **kwargs, - ): - backbone_config, kwargs = consolidate_backbone_kwargs_to_config( - backbone_config=backbone_config, + fpn_feature_size: int = 256 + mask_feature_size: int = 256 + no_object_weight: float = 0.1 + use_auxiliary_loss: bool = False + backbone_config: dict | PreTrainedConfig | None = None + decoder_config: dict | PreTrainedConfig | None = None + init_std: float = 0.02 + init_xavier_std: float = 1.0 + dice_weight: float = 1.0 + cross_entropy_weight: float = 1.0 + mask_weight: float = 20.0 + output_auxiliary_logits: bool | None = None + + def __post_init__(self, **kwargs): + self.backbone_config, kwargs = consolidate_backbone_kwargs_to_config( + backbone_config=self.backbone_config, default_config_type="swin", default_config_kwargs={ "depths": [2, 2, 18, 2], @@ -99,48 +100,34 @@ def __init__( ) # verify that the backbone is supported - if backbone_config is not None and backbone_config.model_type not in self.backbones_supported: + if self.backbone_config is not None and self.backbone_config.model_type not in self.backbones_supported: logger.warning_once( - f"Backbone {backbone_config.model_type} is not a supported model and may not be 
compatible with MaskFormer. " + f"Backbone {self.backbone_config.model_type} is not a supported model and may not be compatible with MaskFormer. " f"Supported model types: {','.join(self.backbones_supported)}" ) - if decoder_config is None: + if self.decoder_config is None: # fall back to https://huggingface.co/facebook/detr-resnet-50 - decoder_config = DetrConfig() + self.decoder_config = DetrConfig() else: # verify that the decoder is supported decoder_type = ( - decoder_config.pop("model_type") if isinstance(decoder_config, dict) else decoder_config.model_type + self.decoder_config.pop("model_type") + if isinstance(self.decoder_config, dict) + else self.decoder_config.model_type ) if decoder_type not in self.decoders_supported: raise ValueError( f"Transformer Decoder {decoder_type} not supported, please use one of" f" {','.join(self.decoders_supported)}" ) - if isinstance(decoder_config, dict): + if isinstance(self.decoder_config, dict): config_class = CONFIG_MAPPING[decoder_type] - decoder_config = config_class.from_dict(decoder_config) - - self.backbone_config = backbone_config - self.decoder_config = decoder_config - # main feature dimension for the model - self.fpn_feature_size = fpn_feature_size - self.mask_feature_size = mask_feature_size - # initializer - self.init_std = init_std - self.init_xavier_std = init_xavier_std - # Hungarian matcher && loss - self.cross_entropy_weight = cross_entropy_weight - self.dice_weight = dice_weight - self.mask_weight = mask_weight - self.use_auxiliary_loss = use_auxiliary_loss - self.no_object_weight = no_object_weight - self.output_auxiliary_logits = output_auxiliary_logits + self.decoder_config = config_class.from_dict(self.decoder_config) self.num_attention_heads = self.decoder_config.encoder_attention_heads self.num_hidden_layers = self.decoder_config.num_hidden_layers - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["MaskFormerConfig"] diff --git a/src/transformers/models/maskformer/configuration_maskformer_swin.py b/src/transformers/models/maskformer/configuration_maskformer_swin.py index f8596220b29c..3d935ceded04 100644 --- a/src/transformers/models/maskformer/configuration_maskformer_swin.py +++ b/src/transformers/models/maskformer/configuration_maskformer_swin.py @@ -13,15 +13,15 @@ # limitations under the License. 
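Not part of the diff — an illustrative sketch of the decoder handling that `MaskFormerConfig.__post_init__` carries over from the old `__init__` (behavior assumed from the code above):

from transformers import MaskFormerConfig

# With no decoder_config, the config falls back to a default DetrConfig
config = MaskFormerConfig()
print(config.decoder_config.model_type)  # "detr"

# A dict is promoted to the matching config class; unsupported decoder types raise
config = MaskFormerConfig(decoder_config={"model_type": "detr", "d_model": 256})
print(config.num_attention_heads)  # mirrors decoder_config.encoder_attention_heads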
"""MaskFormer Swin Transformer model configuration""" +from huggingface_hub.dataclasses import strict + from ...backbone_utils import BackboneConfigMixin from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - - -logger = logging.get_logger(__name__) +from ...utils import auto_docstring @auto_docstring(checkpoint="microsoft/swin-tiny-patch4-window7-224") +@strict(accept_kwargs=True) class MaskFormerSwinConfig(BackboneConfigMixin, PreTrainedConfig): r""" window_size (`int`, *optional*, defaults to 7): @@ -49,52 +49,34 @@ class MaskFormerSwinConfig(BackboneConfigMixin, PreTrainedConfig): "num_hidden_layers": "num_layers", } - def __init__( - self, - image_size=224, - patch_size=4, - num_channels=3, - embed_dim=96, - depths=[2, 2, 6, 2], - num_heads=[3, 6, 12, 24], - window_size=7, - mlp_ratio=4.0, - qkv_bias=True, - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - drop_path_rate=0.1, - hidden_act="gelu", - use_absolute_embeddings=False, - initializer_range=0.02, - layer_norm_eps=1e-5, - out_features=None, - out_indices=None, - **kwargs, - ): - super().__init__(**kwargs) - - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.embed_dim = embed_dim - self.depths = depths - self.num_layers = len(depths) - self.num_heads = num_heads - self.window_size = window_size - self.mlp_ratio = mlp_ratio - self.qkv_bias = qkv_bias - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.drop_path_rate = drop_path_rate - self.hidden_act = hidden_act - self.use_absolute_embeddings = use_absolute_embeddings - self.layer_norm_eps = layer_norm_eps - self.initializer_range = initializer_range + image_size: int | list[int] | tuple[int, int] = 224 + patch_size: int | list[int] | tuple[int, int] = 4 + num_channels: int = 3 + embed_dim: int = 96 + depths: list[int] | tuple[int, ...] = (2, 2, 6, 2) + num_heads: list[int] | tuple[int, ...] 
= (3, 6, 12, 24) + window_size: int = 7 + mlp_ratio: float = 4.0 + qkv_bias: bool = True + hidden_dropout_prob: float = 0.0 + attention_probs_dropout_prob: float = 0.0 + drop_path_rate: float = 0.1 + hidden_act: str = "gelu" + use_absolute_embeddings: bool = False + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-5 + _out_features: list[str] | None = None + _out_indices: list[int] | None = None + + def __post_init__(self, **kwargs): # we set the hidden_size attribute in order to make Swin work with VisionEncoderDecoderModel # this indicates the channel dimension after the last stage of the model - self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1)) - self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)] - self.set_output_features_output_indices(out_indices=out_indices, out_features=out_features) + self.hidden_size = int(self.embed_dim * 2 ** (len(self.depths) - 1)) + self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(self.depths) + 1)] + self.set_output_features_output_indices( + out_indices=kwargs.pop("out_indices", None), out_features=kwargs.pop("out_features", None) + ) + super().__post_init__(**kwargs) __all__ = ["MaskFormerSwinConfig"] diff --git a/src/transformers/models/maskformer/image_processing_maskformer_fast.py b/src/transformers/models/maskformer/image_processing_maskformer_fast.py index 33abc9b8f38b..bfd6ea61aeae 100644 --- a/src/transformers/models/maskformer/image_processing_maskformer_fast.py +++ b/src/transformers/models/maskformer/image_processing_maskformer_fast.py @@ -14,7 +14,7 @@ """Fast Image processor class for MaskFormer.""" import math -from typing import TYPE_CHECKING, Any, Optional, Union +from typing import Any, Optional, Union import torch import torchvision.transforms.v2.functional as tvF @@ -55,10 +55,6 @@ logger = logging.get_logger(__name__) -if TYPE_CHECKING: - pass - - def convert_segmentation_map_to_binary_masks_fast( segmentation_map: "torch.Tensor", instance_id_to_semantic_id: dict[int, int] | None = None, diff --git a/src/transformers/models/maskformer/modeling_maskformer.py b/src/transformers/models/maskformer/modeling_maskformer.py index cacb86b788ed..9e565eed356e 100644 --- a/src/transformers/models/maskformer/modeling_maskformer.py +++ b/src/transformers/models/maskformer/modeling_maskformer.py @@ -712,7 +712,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if inputs_embeds is not None: hidden_states = inputs_embeds @@ -1548,7 +1548,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict batch_size, _, height, width = pixel_values.shape @@ -1770,7 +1770,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict raw_outputs = self.model( pixel_values, diff --git 
a/src/transformers/models/maskformer/modeling_maskformer_swin.py b/src/transformers/models/maskformer/modeling_maskformer_swin.py index 8455ad0b282a..fc30dd865dc0 100644 --- a/src/transformers/models/maskformer/modeling_maskformer_swin.py +++ b/src/transformers/models/maskformer/modeling_maskformer_swin.py @@ -746,7 +746,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -821,7 +821,7 @@ def forward( return_dict: bool | None = None, **kwargs, ) -> BackboneOutput: - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) diff --git a/src/transformers/models/mbart/configuration_mbart.py b/src/transformers/models/mbart/configuration_mbart.py index 9527fbb25828..094cc06742f0 100644 --- a/src/transformers/models/mbart/configuration_mbart.py +++ b/src/transformers/models/mbart/configuration_mbart.py @@ -13,14 +13,14 @@ # limitations under the License. """MBART model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="facebook/mbart-large-cc25") +@strict(accept_kwargs=True) class MBartConfig(PreTrainedConfig): r""" Example: @@ -40,70 +40,39 @@ class MBartConfig(PreTrainedConfig): model_type = "mbart" keys_to_ignore_at_inference = ["past_key_values"] - attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"} + attribute_map = { + "num_attention_heads": "encoder_attention_heads", + "hidden_size": "d_model", + "num_hidden_layers": "encoder_layers", + } - def __init__( - self, - vocab_size=50265, - max_position_embeddings=1024, - encoder_layers=12, - encoder_ffn_dim=4096, - encoder_attention_heads=16, - decoder_layers=12, - decoder_ffn_dim=4096, - decoder_attention_heads=16, - encoder_layerdrop=0.0, - decoder_layerdrop=0.0, - use_cache=True, - is_encoder_decoder=True, - activation_function="gelu", - d_model=1024, - dropout=0.1, - attention_dropout=0.0, - activation_dropout=0.0, - init_std=0.02, - classifier_dropout=0.0, - scale_embedding=False, - pad_token_id=1, - bos_token_id=0, - eos_token_id=2, - decoder_start_token_id=None, - forced_eos_token_id=2, - is_decoder=False, - tie_word_embeddings=True, - **kwargs, - ): - self.is_decoder = is_decoder - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.d_model = d_model - self.encoder_ffn_dim = encoder_ffn_dim - self.encoder_layers = encoder_layers - self.encoder_attention_heads = encoder_attention_heads - self.decoder_ffn_dim = decoder_ffn_dim - self.decoder_layers = decoder_layers - self.decoder_attention_heads = decoder_attention_heads - self.dropout = dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.activation_function = activation_function - self.init_std = init_std - 
self.encoder_layerdrop = encoder_layerdrop - self.decoder_layerdrop = decoder_layerdrop - self.classifier_dropout = classifier_dropout - self.use_cache = use_cache - self.num_hidden_layers = encoder_layers - self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.decoder_start_token_id = decoder_start_token_id - self.tie_word_embeddings = tie_word_embeddings - super().__init__( - is_encoder_decoder=is_encoder_decoder, - forced_eos_token_id=forced_eos_token_id, - **kwargs, - ) + vocab_size: int = 50265 + max_position_embeddings: int = 1024 + encoder_layers: int = 12 + encoder_ffn_dim: int = 4096 + encoder_attention_heads: int = 16 + decoder_layers: int = 12 + decoder_ffn_dim: int = 4096 + decoder_attention_heads: int = 16 + encoder_layerdrop: float | int = 0.0 + decoder_layerdrop: float | int = 0.0 + use_cache: bool = True + is_encoder_decoder: bool = True + activation_function: str = "gelu" + d_model: int = 1024 + dropout: float | int = 0.1 + attention_dropout: float | int = 0.0 + activation_dropout: float | int = 0.0 + init_std: float = 0.02 + classifier_dropout: float | int = 0.0 + scale_embedding: bool = False + pad_token_id: int | None = 1 + bos_token_id: int | None = 0 + eos_token_id: int | None = 2 + decoder_start_token_id: int | None = None + forced_eos_token_id: int | None = 2 + is_decoder: bool = False + tie_word_embeddings: bool = True __all__ = ["MBartConfig"] diff --git a/src/transformers/models/mbart/modeling_mbart.py b/src/transformers/models/mbart/modeling_mbart.py index 3bca7b630915..921237259acf 100755 --- a/src/transformers/models/mbart/modeling_mbart.py +++ b/src/transformers/models/mbart/modeling_mbart.py @@ -991,7 +991,7 @@ def forward( ['nett', 'sehr', 'ganz', 'nicht', 'so'] ``` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if labels is not None: if use_cache: diff --git a/src/transformers/models/megatron_bert/configuration_megatron_bert.py b/src/transformers/models/megatron_bert/configuration_megatron_bert.py index 4b6d4de5b674..0264b69c2a27 100644 --- a/src/transformers/models/megatron_bert/configuration_megatron_bert.py +++ b/src/transformers/models/megatron_bert/configuration_megatron_bert.py @@ -13,14 +13,14 @@ # limitations under the License.
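Not part of the diff — a sketch of how the extended `attribute_map` on the Marian/MBart configs above is expected to resolve aliases (the aliasing itself lives in `PreTrainedConfig` and is assumed to keep working as before):

from transformers import MBartConfig

config = MBartConfig(encoder_layers=6, d_model=512)
# The new "num_hidden_layers": "encoder_layers" entry replaces the old
# explicit self.num_hidden_layers = encoder_layers assignment.
print(config.num_hidden_layers)  # 6   -> encoder_layers
print(config.hidden_size)        # 512 -> d_model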
"""MEGATRON_BERT model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="nvidia/megatron-bert-uncased-345m") +@strict(accept_kwargs=True) class MegatronBertConfig(PreTrainedConfig): r""" Examples: @@ -40,50 +40,25 @@ class MegatronBertConfig(PreTrainedConfig): model_type = "megatron-bert" - def __init__( - self, - vocab_size=29056, - hidden_size=1024, - num_hidden_layers=24, - num_attention_heads=16, - intermediate_size=4096, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=2, - initializer_range=0.02, - layer_norm_eps=1e-12, - pad_token_id=0, - bos_token_id=None, - eos_token_id=None, - use_cache=True, - is_decoder=False, - add_cross_attention=False, - tie_word_embeddings=True, - **kwargs, - ): - super().__init__(**kwargs) - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - - self.is_decoder = is_decoder - self.add_cross_attention = add_cross_attention - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.use_cache = use_cache + vocab_size: int = 29056 + hidden_size: int = 1024 + num_hidden_layers: int = 24 + num_attention_heads: int = 16 + intermediate_size: int = 4096 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.1 + attention_probs_dropout_prob: float = 0.1 + max_position_embeddings: int = 512 + type_vocab_size: int = 2 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + pad_token_id: int | None = 0 + bos_token_id: int | None = None + eos_token_id: int | list[int] | None = None + use_cache: bool = True + is_decoder: bool = False + add_cross_attention: bool = False + tie_word_embeddings: bool = True __all__ = ["MegatronBertConfig"] diff --git a/src/transformers/models/megatron_bert/modeling_megatron_bert.py b/src/transformers/models/megatron_bert/modeling_megatron_bert.py index 4326d2807be1..4ef7526bddd1 100755 --- a/src/transformers/models/megatron_bert/modeling_megatron_bert.py +++ b/src/transformers/models/megatron_bert/modeling_megatron_bert.py @@ -613,7 +613,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache @@ -764,7 +764,7 @@ def forward( >>> prediction_logits = outputs.prediction_logits >>> seq_relationship_logits = outputs.seq_relationship_logits ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict 
if return_dict is not None else self.config.return_dict outputs = self.bert( input_ids, @@ -870,7 +870,7 @@ def forward( >>> prediction_logits = outputs.logits ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if labels is not None: use_cache = False @@ -965,7 +965,7 @@ def forward( loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.bert( input_ids, @@ -1055,7 +1055,7 @@ def forward( >>> assert logits[0, 0] < logits[0, 1] # next sentence was random ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.bert( input_ids, @@ -1127,7 +1127,7 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.bert( input_ids, @@ -1235,7 +1235,7 @@ def forward( num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above) """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None @@ -1313,7 +1313,7 @@ def forward( labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.bert( input_ids, @@ -1375,7 +1375,7 @@ def forward( return_dict: bool | None = None, **kwargs, ) -> tuple | QuestionAnsweringModelOutput: - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.bert( input_ids, diff --git a/src/transformers/models/metaclip_2/configuration_metaclip_2.py b/src/transformers/models/metaclip_2/configuration_metaclip_2.py index 7737bc155202..6e7e62b6ffc4 100644 --- a/src/transformers/models/metaclip_2/configuration_metaclip_2.py +++ b/src/transformers/models/metaclip_2/configuration_metaclip_2.py @@ -4,6 +4,23 @@ # the file from the modular. If any change should be done, please apply the change to the # modular_metaclip_2.py file directly. One of our CI enforces this. # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring, logging @@ -12,6 +29,7 @@ @auto_docstring(checkpoint="facebook/metaclip-2-worldwide-huge-quickgelu") +@strict(accept_kwargs=True) class MetaClip2TextConfig(PreTrainedConfig): r""" Example: @@ -32,47 +50,36 @@ class MetaClip2TextConfig(PreTrainedConfig): model_type = "metaclip_2_text_model" base_config_key = "text_config" - def __init__( - self, - vocab_size=49408, - hidden_size=512, - intermediate_size=2048, - projection_dim=512, - num_hidden_layers=12, - num_attention_heads=8, - max_position_embeddings=77, - hidden_act="quick_gelu", - layer_norm_eps=1e-5, - attention_dropout=0.0, - initializer_range=0.02, - initializer_factor=1.0, - # This differs from `MetaClip2Tokenizer`'s default and from openai/metaclip_2 - # See https://github.com/huggingface/transformers/pull/24773#issuecomment-1632287538 - pad_token_id=1, - bos_token_id=49406, - eos_token_id=49407, - **kwargs, - ): - super().__init__(**kwargs) - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.projection_dim = projection_dim - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.max_position_embeddings = max_position_embeddings - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.initializer_factor = initializer_factor - self.attention_dropout = attention_dropout + vocab_size: int = 49408 + hidden_size: int = 512 + intermediate_size: int = 2048 + projection_dim: int | None = 512 + num_hidden_layers: int = 12 + num_attention_heads: int = 8 + max_position_embeddings: int = 77 + hidden_act: str = "quick_gelu" + layer_norm_eps: float | None = 1e-5 + attention_dropout: int | float | None = 0.0 + initializer_range: float = 0.02 + initializer_factor: float | None = 1.0 + + # This differs from `MetaClip2Tokenizer`'s default and from openai/metaclip_2 + # See https://github.com/huggingface/transformers/pull/24773#issuecomment-1632287538 + pad_token_id: int | None = 1 + bos_token_id: int | None = 49406 + eos_token_id: int | list[int] | None = 49407 + + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + if self.hidden_size % self.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({self.hidden_size}) is not a multiple of the number of attention " + f"heads ({self.num_attention_heads})." 
+ ) @auto_docstring(checkpoint="facebook/metaclip-2-worldwide-huge-quickgelu") +@strict(accept_kwargs=True) class MetaClip2VisionConfig(PreTrainedConfig): r""" Example: @@ -93,41 +100,31 @@ class MetaClip2VisionConfig(PreTrainedConfig): model_type = "metaclip_2_vision_model" base_config_key = "vision_config" - def __init__( - self, - hidden_size=768, - intermediate_size=3072, - projection_dim=512, - num_hidden_layers=12, - num_attention_heads=12, - num_channels=3, - image_size=224, - patch_size=32, - hidden_act="quick_gelu", - layer_norm_eps=1e-5, - attention_dropout=0.0, - initializer_range=0.02, - initializer_factor=1.0, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.projection_dim = projection_dim - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_channels = num_channels - self.patch_size = patch_size - self.image_size = image_size - self.initializer_range = initializer_range - self.initializer_factor = initializer_factor - self.attention_dropout = attention_dropout - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act + hidden_size: int = 768 + intermediate_size: int = 3072 + projection_dim: int | None = 512 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + num_channels: int | None = 3 + image_size: int | None = 224 + patch_size: int | None = 32 + hidden_act: str = "quick_gelu" + layer_norm_eps: float | None = 1e-5 + attention_dropout: int | float | None = 0.0 + initializer_range: float = 0.02 + initializer_factor: float | None = 1.0 + + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + if self.hidden_size % self.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({self.hidden_size}) is not a multiple of the number of attention " + f"heads ({self.num_attention_heads})." + ) @auto_docstring(checkpoint="facebook/metaclip-2-worldwide-huge-quickgelu") +@strict(accept_kwargs=True) class MetaClip2Config(PreTrainedConfig): r""" Example: @@ -157,22 +154,37 @@ class MetaClip2Config(PreTrainedConfig): model_type = "metaclip_2" sub_configs = {"text_config": MetaClip2TextConfig, "vision_config": MetaClip2VisionConfig} - def __init__( - self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs - ): - # If `_config_dict` exist, we use them for the backward compatibility. - # We pop out these 2 attributes before calling `super().__init__` to avoid them being saved (which causes a lot - # of confusion!). - text_config_dict = kwargs.pop("text_config_dict", None) - vision_config_dict = kwargs.pop("vision_config_dict", None) + text_config: dict | MetaClip2TextConfig | None = None + vision_config: dict | MetaClip2VisionConfig | None = None + projection_dim: int | None = 512 + logit_scale_init_value: float | int | None = 2.6592 + initializer_factor: float | None = 1.0 + + def __post_init__(self, **kwargs): + if self.text_config is None: + text_config = {} + logger.info("`text_config` is `None`. Initializing the `MetaClip2TextConfig` with default values.") + elif isinstance(self.text_config, MetaClip2TextConfig): + text_config = self.text_config.to_dict() + else: + text_config = self.text_config + + if self.vision_config is None: + vision_config = {} + logger.info("`vision_config` is `None`. 
initializing the `MetaClip2VisionConfig` with default values.") + elif isinstance(self.vision_config, MetaClip2VisionConfig): + vision_config = self.vision_config.to_dict() + else: + vision_config = self.vision_config + # For backward compatibility check keyword args # Instead of simply assigning `[text|vision]_config_dict` to `[text|vision]_config`, we use the values in # `[text|vision]_config_dict` to update the values in `[text|vision]_config`. The values should be same in most # cases, but we don't want to break anything regarding `_config_dict` that existed before commit `8827e1b2`. - if text_config_dict is not None: - if text_config is None: - text_config = {} + text_config_dict = kwargs.pop("text_config_dict", None) + vision_config_dict = kwargs.pop("vision_config_dict", None) + if text_config_dict is not None: # This is the complete result when using `text_config_dict`. _text_config_dict = MetaClip2TextConfig(**text_config_dict).to_dict() @@ -197,9 +209,6 @@ def __init__( text_config.update(_text_config_dict) if vision_config_dict is not None: - if vision_config is None: - vision_config = {} - # This is the complete result when using `vision_config_dict`. _vision_config_dict = MetaClip2VisionConfig(**vision_config_dict).to_dict() # convert keys to string instead of integer @@ -228,25 +237,11 @@ def __init__( # Update all values in `vision_config` with the ones in `_vision_config_dict`. vision_config.update(_vision_config_dict) - if text_config is None: - text_config = MetaClip2TextConfig() - logger.info("`text_config` is `None`. initializing the `MetaClip2TextConfig` with default values.") - elif isinstance(text_config, dict): - text_config = MetaClip2TextConfig(**text_config) - - if vision_config is None: - vision_config = MetaClip2VisionConfig() - logger.info("`vision_config` is `None`. initializing the `MetaClip2VisionConfig` with default values.") - elif isinstance(vision_config, dict): - vision_config = MetaClip2VisionConfig(**vision_config) - - self.text_config = text_config - self.vision_config = vision_config + # Finally we can convert back our unified text/vision configs to `PretrainedConfig` + self.text_config = MetaClip2TextConfig(**text_config) + self.vision_config = MetaClip2VisionConfig(**vision_config) - self.projection_dim = projection_dim - self.logit_scale_init_value = logit_scale_init_value - self.initializer_factor = 1.0 - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["MetaClip2Config", "MetaClip2TextConfig", "MetaClip2VisionConfig"] diff --git a/src/transformers/models/metaclip_2/modeling_metaclip_2.py b/src/transformers/models/metaclip_2/modeling_metaclip_2.py index b01d7eb52cea..16dd6de15098 100644 --- a/src/transformers/models/metaclip_2/modeling_metaclip_2.py +++ b/src/transformers/models/metaclip_2/modeling_metaclip_2.py @@ -4,6 +4,21 @@ # the file from the modular. If any change should be done, please apply the change to the # modular_metaclip_2.py file directly. One of our CI enforces this. # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
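Not part of the diff — a small sketch of the sub-config normalization now done in `MetaClip2Config.__post_init__` (imports taken from the file path above; behavior assumed from the code):

from transformers.models.metaclip_2.configuration_metaclip_2 import (
    MetaClip2Config,
    MetaClip2TextConfig,
)

# dicts (or None) are normalized into the proper sub-config classes at the end
# of __post_init__, after the legacy *_config_dict handling.
config = MetaClip2Config(text_config={"hidden_size": 512}, vision_config=None)
assert isinstance(config.text_config, MetaClip2TextConfig)
assert config.vision_config.hidden_size == 768  # default MetaClip2VisionConfig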
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + from collections.abc import Callable from dataclasses import dataclass from typing import Any @@ -177,11 +192,6 @@ def __init__(self, config: MetaClip2VisionConfig | MetaClip2TextConfig): self.embed_dim = config.hidden_size self.num_heads = config.num_attention_heads self.head_dim = self.embed_dim // self.num_heads - if self.head_dim * self.num_heads != self.embed_dim: - raise ValueError( - f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" - f" {self.num_heads})." - ) self.scale = self.head_dim**-0.5 self.dropout = config.attention_dropout self.is_causal = False @@ -351,6 +361,8 @@ def _init_weights(self, module): init.ones_(module.weight) if isinstance(module, nn.Linear) and module.bias is not None: init.zeros_(module.bias) + if hasattr(module, "logit_scale"): + init.constant_(module.logit_scale, self.config.logit_scale_init_value) class MetaClip2Encoder(nn.Module): diff --git a/src/transformers/models/metaclip_2/modular_metaclip_2.py b/src/transformers/models/metaclip_2/modular_metaclip_2.py index 91db885efa4f..32e2543c10ca 100644 --- a/src/transformers/models/metaclip_2/modular_metaclip_2.py +++ b/src/transformers/models/metaclip_2/modular_metaclip_2.py @@ -1,4 +1,20 @@ +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + import torch +from huggingface_hub.dataclasses import strict from torch import nn from ... 
import initialization as init @@ -31,6 +47,7 @@ @auto_docstring(checkpoint="facebook/metaclip-2-worldwide-huge-quickgelu") +@strict(accept_kwargs=True) class MetaClip2TextConfig(CLIPTextConfig): r""" Example: @@ -50,6 +67,7 @@ class MetaClip2TextConfig(CLIPTextConfig): @auto_docstring(checkpoint="facebook/metaclip-2-worldwide-huge-quickgelu") +@strict(accept_kwargs=True) class MetaClip2VisionConfig(CLIPVisionConfig): r""" Example: @@ -69,6 +87,7 @@ class MetaClip2VisionConfig(CLIPVisionConfig): @auto_docstring(checkpoint="facebook/metaclip-2-worldwide-huge-quickgelu") +@strict(accept_kwargs=True) class MetaClip2Config(CLIPConfig): r""" Example: @@ -174,6 +193,8 @@ def _init_weights(self, module): init.ones_(module.weight) if isinstance(module, nn.Linear) and module.bias is not None: init.zeros_(module.bias) + if hasattr(module, "logit_scale"): + init.constant_(module.logit_scale, self.config.logit_scale_init_value) class MetaClip2TextTransformer(CLIPTextTransformer): diff --git a/src/transformers/models/mgp_str/configuration_mgp_str.py b/src/transformers/models/mgp_str/configuration_mgp_str.py index 3addeffe0252..5cb5a102a2cb 100644 --- a/src/transformers/models/mgp_str/configuration_mgp_str.py +++ b/src/transformers/models/mgp_str/configuration_mgp_str.py @@ -13,14 +13,14 @@ # limitations under the License. """MGP-STR model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="alibaba-damo/mgp-str-base") +@strict(accept_kwargs=True) class MgpstrConfig(PreTrainedConfig): r""" max_token_length (`int`, *optional*, defaults to 27): @@ -57,50 +57,25 @@ class MgpstrConfig(PreTrainedConfig): model_type = "mgp-str" - def __init__( - self, - image_size=[32, 128], - patch_size=4, - num_channels=3, - max_token_length=27, - num_character_labels=38, - num_bpe_labels=50257, - num_wordpiece_labels=30522, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - mlp_ratio=4.0, - qkv_bias=True, - distilled=False, - layer_norm_eps=1e-5, - drop_rate=0.0, - attn_drop_rate=0.0, - drop_path_rate=0.0, - output_a3_attentions=False, - initializer_range=0.02, - **kwargs, - ): - super().__init__(**kwargs) - - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.max_token_length = max_token_length - self.num_character_labels = num_character_labels - self.num_bpe_labels = num_bpe_labels - self.num_wordpiece_labels = num_wordpiece_labels - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.mlp_ratio = mlp_ratio - self.distilled = distilled - self.layer_norm_eps = layer_norm_eps - self.drop_rate = drop_rate - self.qkv_bias = qkv_bias - self.attn_drop_rate = attn_drop_rate - self.drop_path_rate = drop_path_rate - self.output_a3_attentions = output_a3_attentions - self.initializer_range = initializer_range + image_size: list[int] | tuple[int, ...] 
= (32, 128) + patch_size: int | list[int] | tuple[int, int] = 4 + num_channels: int = 3 + max_token_length: int = 27 + num_character_labels: int = 38 + num_bpe_labels: int = 50257 + num_wordpiece_labels: int = 30522 + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + mlp_ratio: float | int = 4.0 + qkv_bias: bool = True + distilled: bool = False + layer_norm_eps: float = 1e-5 + drop_rate: float = 0.0 + attn_drop_rate: float = 0.0 + drop_path_rate: float = 0.0 + output_a3_attentions: bool = False + initializer_range: float = 0.02 __all__ = ["MgpstrConfig"] diff --git a/src/transformers/models/mgp_str/modeling_mgp_str.py b/src/transformers/models/mgp_str/modeling_mgp_str.py index d1d5ebcd631a..aac41fcc8cef 100644 --- a/src/transformers/models/mgp_str/modeling_mgp_str.py +++ b/src/transformers/models/mgp_str/modeling_mgp_str.py @@ -326,7 +326,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -422,7 +422,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict mgp_outputs = self.mgp_str( pixel_values, diff --git a/src/transformers/models/mimi/configuration_mimi.py b/src/transformers/models/mimi/configuration_mimi.py index 8614bb5da840..d6597032547c 100644 --- a/src/transformers/models/mimi/configuration_mimi.py +++ b/src/transformers/models/mimi/configuration_mimi.py @@ -16,16 +16,15 @@ import math import numpy as np +from huggingface_hub.dataclasses import strict from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters -from ...utils import auto_docstring, logging - - -logger = logging.get_logger(__name__) +from ...utils import auto_docstring @auto_docstring(checkpoint="kyutai/mimi") +@strict(accept_kwargs=True) class MimiConfig(PreTrainedConfig): r""" num_quantizers (`int`, *optional*, defaults to 32): @@ -86,101 +85,61 @@ class MimiConfig(PreTrainedConfig): model_type = "mimi" - def __init__( - self, - sampling_rate: int | None = 24_000, - frame_rate: int | None = None, - audio_channels: int | None = 1, - hidden_size: int | None = 512, - num_filters: int | None = 64, - num_residual_layers: int | None = 1, - upsampling_ratios: list[int] | None = None, - kernel_size: int | None = 7, - last_kernel_size: int | None = 3, - residual_kernel_size: int | None = 3, - dilation_growth_rate: int | None = 2, - use_causal_conv: bool | None = True, - pad_mode: str | None = "constant", - compress: int | None = 2, - trim_right_ratio: float | None = 1.0, - codebook_size: int | None = 2048, - codebook_dim: int | None = 256, - num_quantizers: int | None = 32, - use_conv_shortcut: bool | None = False, - vector_quantization_hidden_dimension: int | None = 256, - num_semantic_quantizers: int | None = 1, - upsample_groups: int | None = 512, - num_hidden_layers: int | None = 8, - intermediate_size: int | None = 2048, - num_attention_heads: int | None = 8, - num_key_value_heads: int | None = 8, - head_dim: int | None = None, - hidden_act: str | None = 
"gelu", - max_position_embeddings: int | None = 8000, - initializer_range: float | None = 0.02, - norm_eps: int | None = 1e-5, - use_cache: bool | None = False, - use_streaming: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - sliding_window: int | None = 250, - attention_dropout: float | None = 0.0, - layer_scale_initial_scale: float | None = 0.01, - attention_bias: bool | None = False, - tie_word_embeddings: bool | None = True, - **kwargs, - ): - self.sampling_rate = sampling_rate - self.audio_channels = audio_channels - self.hidden_size = hidden_size - self.num_filters = num_filters - self.num_residual_layers = num_residual_layers - self.upsampling_ratios = upsampling_ratios if upsampling_ratios else [8, 6, 5, 4] - self.kernel_size = kernel_size - self.last_kernel_size = last_kernel_size - self.residual_kernel_size = residual_kernel_size - self.dilation_growth_rate = dilation_growth_rate - self.use_causal_conv = use_causal_conv - self.pad_mode = pad_mode - self.compress = compress - self.trim_right_ratio = trim_right_ratio - self.codebook_size = codebook_size - self.codebook_dim = codebook_dim if codebook_dim is not None else hidden_size - self.num_quantizers = num_quantizers - self.use_conv_shortcut = use_conv_shortcut - self.vector_quantization_hidden_dimension = vector_quantization_hidden_dimension - self.upsample_groups = upsample_groups - self.num_hidden_layers = num_hidden_layers - self.intermediate_size = intermediate_size - self.num_attention_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.max_position_embeddings = max_position_embeddings - self.initializer_range = initializer_range - self.norm_eps = norm_eps - self.use_cache = use_cache - self.use_streaming = use_streaming - self.sliding_window = sliding_window - self.attention_dropout = attention_dropout - self.head_dim = head_dim or hidden_size // num_attention_heads - self.layer_scale_initial_scale = layer_scale_initial_scale - self.attention_bias = attention_bias - self.tie_word_embeddings = tie_word_embeddings - self.rope_parameters = rope_parameters - + sampling_rate: int = 24_000 + audio_channels: int = 1 + hidden_size: int = 512 + num_filters: int = 64 + num_residual_layers: int = 1 + upsampling_ratios: list[int] | None = None + kernel_size: int = 7 + last_kernel_size: int = 3 + residual_kernel_size: int = 3 + dilation_growth_rate: int = 2 + use_causal_conv: bool = True + pad_mode: str = "constant" + compress: int = 2 + trim_right_ratio: float = 1.0 + codebook_size: int = 2048 + codebook_dim: int = 256 + num_quantizers: int = 32 + use_conv_shortcut: bool = False + vector_quantization_hidden_dimension: int = 256 + num_semantic_quantizers: int = 1 + upsample_groups: int = 512 + num_hidden_layers: int = 8 + intermediate_size: int = 2048 + num_attention_heads: int = 8 + num_key_value_heads: int = 8 + head_dim: int | None = None + hidden_act: str = "gelu" + max_position_embeddings: int = 8000 + initializer_range: float = 0.02 + norm_eps: float = 1e-5 + use_cache: bool = False + use_streaming: bool = False + rope_parameters: RopeParameters | dict | None = None + sliding_window: int = 250 + attention_dropout: float | int = 0.0 + layer_scale_initial_scale: float = 0.01 + attention_bias: bool = False + tie_word_embeddings: bool = True + + def __post_init__(self, **kwargs): + self.upsampling_ratios = self.upsampling_ratios if self.upsampling_ratios else [8, 6, 5, 4] + self.codebook_dim = self.codebook_dim if 
self.codebook_dim is not None else self.hidden_size + self.head_dim = self.head_dim or self.hidden_size // self.num_attention_heads # Handle backward compatibility for frame_rate: # If frame_rate is explicitly provided, use it (backward compatibility) # Otherwise, compute it from other parameters (correctly) - if frame_rate is not None: - self._frame_rate = frame_rate - else: - self._frame_rate = None + self._frame_rate = kwargs.pop("frame_rate", None) + super().__post_init__(**kwargs) - if num_semantic_quantizers >= self.num_quantizers: + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + if self.num_semantic_quantizers >= self.num_quantizers: raise ValueError( - f"The number of semantic quantizers should be lower than the total number of quantizers {self.num_quantizers}, but is currently {num_semantic_quantizers}." + f"The number of semantic quantizers should be lower than the total number of quantizers {self.num_quantizers}, but is currently {self.num_semantic_quantizers}." ) - self.num_semantic_quantizers = num_semantic_quantizers - super().__init__(**kwargs) @property def encodec_frame_rate(self) -> int: diff --git a/src/transformers/models/mimi/modeling_mimi.py b/src/transformers/models/mimi/modeling_mimi.py index e9a18af8433b..7e7422c28409 100644 --- a/src/transformers/models/mimi/modeling_mimi.py +++ b/src/transformers/models/mimi/modeling_mimi.py @@ -1088,7 +1088,7 @@ def forward( ) use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if self.gradient_checkpointing and self.training and use_cache: logger.warning_once( diff --git a/src/transformers/models/minimax/configuration_minimax.py b/src/transformers/models/minimax/configuration_minimax.py index c54f7358d372..dcbaf4ca2554 100644 --- a/src/transformers/models/minimax/configuration_minimax.py +++ b/src/transformers/models/minimax/configuration_minimax.py @@ -18,12 +18,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
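
The Mimi hunk above trades the old `__init__` for annotated fields plus a `__post_init__` that resolves derived defaults (and pops the legacy `frame_rate` kwarg), with `validate_architecture` hooked into the `@strict`-powered validation. A rough sketch of the expected behavior, assuming the strict machinery type-checks the annotated fields and runs the `validate_*` hook after construction (illustrative, not a test shipped with this PR):

from transformers import MimiConfig

cfg = MimiConfig()
assert cfg.upsampling_ratios == [8, 6, 5, 4]                        # filled in __post_init__
assert cfg.head_dim == cfg.hidden_size // cfg.num_attention_heads   # 512 // 8 == 64

# validate_architecture() is expected to reject inconsistent settings, e.g.:
# MimiConfig(num_quantizers=4, num_semantic_quantizers=8)   # should raise ValueError
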
-from ...configuration_utils import PreTrainedConfig, layer_type_validation +from huggingface_hub.dataclasses import strict + +from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring @auto_docstring(checkpoint="MiniMaxAI/MiniMax-Text-01-hf") +@strict(accept_kwargs=True) class MiniMaxConfig(PreTrainedConfig): r""" block_size (`int`, *optional*, defaults to 256): @@ -72,93 +75,51 @@ class MiniMaxConfig(PreTrainedConfig): "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), "norm": (["hidden_states"], ["hidden_states"]), } - attribute_map = { - "num_experts": "num_local_experts", - } - - def __init__( - self, - vocab_size: int | None = 32000, - hidden_size: int | None = 4096, - intermediate_size: int | None = 14336, - num_hidden_layers: int | None = 32, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = 8, - head_dim: int | None = None, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 4096 * 32, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-5, - use_cache: bool | None = True, - pad_token_id: int | None = None, - bos_token_id: int | None = 1, - eos_token_id: int | None = 2, - tie_word_embeddings: bool | None = False, - sliding_window: int | None = None, - attention_dropout: float | None = 0.0, - num_experts_per_tok: int | None = 2, - num_local_experts: int | None = 8, - output_router_logits: bool | None = False, - router_aux_loss_coef: float | None = 0.001, - router_jitter_noise: float | None = 0.0, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - layer_types: list[str] | None = None, - block_size: int | None = 256, - full_attn_alpha_factor: int | None = 1, - full_attn_beta_factor: int | None = 1, - linear_attn_alpha_factor: int | None = 1, - linear_attn_beta_factor: int | None = 1, - mlp_alpha_factor: int | None = 1, - mlp_beta_factor: int | None = 1, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.sliding_window = sliding_window - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_dropout = attention_dropout - self.head_dim = head_dim - - self.num_experts_per_tok = num_experts_per_tok - self.num_local_experts = num_local_experts - self.output_router_logits = output_router_logits - self.router_aux_loss_coef = router_aux_loss_coef - self.router_jitter_noise = router_jitter_noise - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - - self.layer_types = layer_types - self.block_size = block_size - self.full_attn_alpha_factor = full_attn_alpha_factor - self.full_attn_beta_factor = full_attn_beta_factor - self.linear_attn_alpha_factor = linear_attn_alpha_factor - self.linear_attn_beta_factor = linear_attn_beta_factor - self.mlp_alpha_factor = mlp_alpha_factor - self.mlp_beta_factor = mlp_beta_factor + attribute_map = {"num_experts": "num_local_experts"} + + vocab_size: int = 32000 + hidden_size: int = 
4096 + intermediate_size: int = 14336 + num_hidden_layers: int = 32 + num_attention_heads: int = 32 + num_key_value_heads: int = 8 + head_dim: int | None = None + hidden_act: str = "silu" + max_position_embeddings: int = 4096 * 32 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-5 + use_cache: bool = True + pad_token_id: int | None = None + bos_token_id: int | None = 1 + eos_token_id: int | list[int] | None = 2 + tie_word_embeddings: bool = False + sliding_window: int | None = None + attention_dropout: float | int = 0.0 + num_experts_per_tok: int = 2 + num_local_experts: int = 8 + output_router_logits: bool = False + router_aux_loss_coef: float = 0.001 + router_jitter_noise: float = 0.0 + rope_parameters: RopeParameters | dict | None = None + layer_types: list[str] | None = None + block_size: int = 256 + full_attn_alpha_factor: int = 1 + full_attn_beta_factor: int = 1 + linear_attn_alpha_factor: int = 1 + linear_attn_beta_factor: int = 1 + mlp_alpha_factor: int = 1 + mlp_beta_factor: int = 1 + + def __post_init__(self, **kwargs): + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads if self.layer_types is None: self.layer_types = [ "full_attention" if bool((i + 1) % 2) else "linear_attention" for i in range(self.num_hidden_layers) ] - layer_type_validation(self.layer_types, self.num_hidden_layers) - self.rope_parameters = rope_parameters - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["MiniMaxConfig"] diff --git a/src/transformers/models/minimax/modular_minimax.py b/src/transformers/models/minimax/modular_minimax.py index 56871bcfa18f..0c9ad631062a 100644 --- a/src/transformers/models/minimax/modular_minimax.py +++ b/src/transformers/models/minimax/modular_minimax.py @@ -16,12 +16,13 @@ import torch import torch.nn.functional as F +from huggingface_hub.dataclasses import strict from torch import nn from ... 
import initialization as init from ...activations import ACT2FN from ...cache_utils import Cache, DynamicCache -from ...configuration_utils import PreTrainedConfig, layer_type_validation +from ...configuration_utils import PreTrainedConfig from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_layers import GradientCheckpointingLayer @@ -51,6 +52,7 @@ @auto_docstring(checkpoint="MiniMaxAI/MiniMax-Text-01-hf") +@strict(accept_kwargs=True) class MiniMaxConfig(PreTrainedConfig): r""" block_size (`int`, *optional*, defaults to 256): @@ -99,93 +101,51 @@ class MiniMaxConfig(PreTrainedConfig): "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), "norm": (["hidden_states"], ["hidden_states"]), } - attribute_map = { - "num_experts": "num_local_experts", - } - - def __init__( - self, - vocab_size: int | None = 32000, - hidden_size: int | None = 4096, - intermediate_size: int | None = 14336, - num_hidden_layers: int | None = 32, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = 8, - head_dim: int | None = None, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 4096 * 32, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-5, - use_cache: bool | None = True, - pad_token_id: int | None = None, - bos_token_id: int | None = 1, - eos_token_id: int | None = 2, - tie_word_embeddings: bool | None = False, - sliding_window: int | None = None, - attention_dropout: float | None = 0.0, - num_experts_per_tok: int | None = 2, - num_local_experts: int | None = 8, - output_router_logits: bool | None = False, - router_aux_loss_coef: float | None = 0.001, - router_jitter_noise: float | None = 0.0, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - layer_types: list[str] | None = None, - block_size: int | None = 256, - full_attn_alpha_factor: int | None = 1, - full_attn_beta_factor: int | None = 1, - linear_attn_alpha_factor: int | None = 1, - linear_attn_beta_factor: int | None = 1, - mlp_alpha_factor: int | None = 1, - mlp_beta_factor: int | None = 1, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.sliding_window = sliding_window - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_dropout = attention_dropout - self.head_dim = head_dim - - self.num_experts_per_tok = num_experts_per_tok - self.num_local_experts = num_local_experts - self.output_router_logits = output_router_logits - self.router_aux_loss_coef = router_aux_loss_coef - self.router_jitter_noise = router_jitter_noise - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - - self.layer_types = layer_types - self.block_size = block_size - self.full_attn_alpha_factor = full_attn_alpha_factor - self.full_attn_beta_factor = full_attn_beta_factor - self.linear_attn_alpha_factor = linear_attn_alpha_factor - self.linear_attn_beta_factor = 
linear_attn_beta_factor - self.mlp_alpha_factor = mlp_alpha_factor - self.mlp_beta_factor = mlp_beta_factor + attribute_map = {"num_experts": "num_local_experts"} + + vocab_size: int = 32000 + hidden_size: int = 4096 + intermediate_size: int = 14336 + num_hidden_layers: int = 32 + num_attention_heads: int = 32 + num_key_value_heads: int = 8 + head_dim: int | None = None + hidden_act: str = "silu" + max_position_embeddings: int = 4096 * 32 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-5 + use_cache: bool = True + pad_token_id: int | None = None + bos_token_id: int | None = 1 + eos_token_id: int | list[int] | None = 2 + tie_word_embeddings: bool = False + sliding_window: int | None = None + attention_dropout: float | int = 0.0 + num_experts_per_tok: int = 2 + num_local_experts: int = 8 + output_router_logits: bool = False + router_aux_loss_coef: float = 0.001 + router_jitter_noise: float = 0.0 + rope_parameters: RopeParameters | dict | None = None + layer_types: list[str] | None = None + block_size: int = 256 + full_attn_alpha_factor: int = 1 + full_attn_beta_factor: int = 1 + linear_attn_alpha_factor: int = 1 + linear_attn_beta_factor: int = 1 + mlp_alpha_factor: int = 1 + mlp_beta_factor: int = 1 + + def __post_init__(self, **kwargs): + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads if self.layer_types is None: self.layer_types = [ "full_attention" if bool((i + 1) % 2) else "linear_attention" for i in range(self.num_hidden_layers) ] - layer_type_validation(self.layer_types, self.num_hidden_layers) - self.rope_parameters = rope_parameters - super().__init__(**kwargs) + super().__post_init__(**kwargs) class MiniMaxRMSNorm(MixtralRMSNorm): diff --git a/src/transformers/models/minimax_m2/configuration_minimax_m2.py b/src/transformers/models/minimax_m2/configuration_minimax_m2.py index c0518c11dbac..e531f28d9418 100644 --- a/src/transformers/models/minimax_m2/configuration_minimax_m2.py +++ b/src/transformers/models/minimax_m2/configuration_minimax_m2.py @@ -19,12 +19,15 @@ # limitations under the License. 
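
The MiniMax hunks above move the alternating-attention default into `__post_init__`: 1-indexed odd layers stay `full_attention`, even layers become `linear_attention`. An illustrative check of that default and of the `attribute_map` alias (not part of the PR's tests):

from transformers import MiniMaxConfig

cfg = MiniMaxConfig(num_hidden_layers=4)
assert cfg.layer_types == ["full_attention", "linear_attention", "full_attention", "linear_attention"]
assert cfg.num_experts == cfg.num_local_experts == 8   # alias resolved through attribute_map
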
+from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring @auto_docstring(checkpoint="MiniMaxAI/MiniMax-Text-01-hf") +@strict(accept_kwargs=True) class MiniMaxM2Config(PreTrainedConfig): r""" Example: @@ -63,59 +66,29 @@ class MiniMaxM2Config(PreTrainedConfig): } default_theta = 5000000.0 - def __init__( - self, - vocab_size: int | None = 200064, - hidden_size: int | None = 3072, - intermediate_size: int | None = 1536, - num_hidden_layers: int | None = 62, - num_attention_heads: int | None = 48, - num_key_value_heads: int | None = 8, - head_dim: int | None = 128, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 196608, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-06, - use_cache: bool | None = True, - pad_token_id: int | None = None, - bos_token_id: int | None = 200034, - eos_token_id: int | None = 200020, - tie_word_embeddings: bool | None = False, - attention_dropout: float | None = 0.0, - num_experts_per_tok: int | None = 8, - num_local_experts: int | None = 256, - output_router_logits: bool | None = False, - router_aux_loss_coef: float | None = 0.001, - router_jitter_noise: float | None = 0.0, - rope_parameters: RopeParameters | dict[RopeParameters] | None = None, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_dropout = attention_dropout - self.head_dim = head_dim - self.rope_parameters = rope_parameters - - self.num_experts_per_tok = num_experts_per_tok - self.num_local_experts = num_local_experts - self.output_router_logits = output_router_logits - self.router_aux_loss_coef = router_aux_loss_coef - self.router_jitter_noise = router_jitter_noise - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - - super().__init__(**kwargs) + vocab_size: int = 200064 + hidden_size: int = 3072 + intermediate_size: int = 1536 + num_hidden_layers: int = 62 + num_attention_heads: int = 48 + num_key_value_heads: int = 8 + head_dim: int = 128 + hidden_act: str = "silu" + max_position_embeddings: int = 196608 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-06 + use_cache: bool = True + pad_token_id: int | None = None + bos_token_id: int | None = 200034 + eos_token_id: int | list[int] | None = 200020 + tie_word_embeddings: bool = False + attention_dropout: float | int = 0.0 + num_experts_per_tok: int = 8 + num_local_experts: int = 256 + output_router_logits: bool = False + router_aux_loss_coef: float = 0.001 + router_jitter_noise: float = 0.0 + rope_parameters: RopeParameters | dict | None = None __all__ = ["MiniMaxM2Config"] diff --git a/src/transformers/models/minimax_m2/modular_minimax_m2.py b/src/transformers/models/minimax_m2/modular_minimax_m2.py index 1623321e2ba6..8d18dbebcfb3 100644 --- a/src/transformers/models/minimax_m2/modular_minimax_m2.py +++ b/src/transformers/models/minimax_m2/modular_minimax_m2.py @@ -15,6 +15,7 @@ import torch import torch.nn.functional as F +from 
huggingface_hub.dataclasses import strict from torch import nn from ... import initialization as init @@ -45,6 +46,7 @@ @auto_docstring(checkpoint="MiniMaxAI/MiniMax-Text-01-hf") +@strict(accept_kwargs=True) class MiniMaxM2Config(PreTrainedConfig): r""" Example: @@ -83,59 +85,29 @@ class MiniMaxM2Config(PreTrainedConfig): } default_theta = 5000000.0 - def __init__( - self, - vocab_size: int | None = 200064, - hidden_size: int | None = 3072, - intermediate_size: int | None = 1536, - num_hidden_layers: int | None = 62, - num_attention_heads: int | None = 48, - num_key_value_heads: int | None = 8, - head_dim: int | None = 128, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 196608, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-06, - use_cache: bool | None = True, - pad_token_id: int | None = None, - bos_token_id: int | None = 200034, - eos_token_id: int | None = 200020, - tie_word_embeddings: bool | None = False, - attention_dropout: float | None = 0.0, - num_experts_per_tok: int | None = 8, - num_local_experts: int | None = 256, - output_router_logits: bool | None = False, - router_aux_loss_coef: float | None = 0.001, - router_jitter_noise: float | None = 0.0, - rope_parameters: RopeParameters | dict[RopeParameters] | None = None, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_dropout = attention_dropout - self.head_dim = head_dim - self.rope_parameters = rope_parameters - - self.num_experts_per_tok = num_experts_per_tok - self.num_local_experts = num_local_experts - self.output_router_logits = output_router_logits - self.router_aux_loss_coef = router_aux_loss_coef - self.router_jitter_noise = router_jitter_noise - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - - super().__init__(**kwargs) + vocab_size: int = 200064 + hidden_size: int = 3072 + intermediate_size: int = 1536 + num_hidden_layers: int = 62 + num_attention_heads: int = 48 + num_key_value_heads: int = 8 + head_dim: int = 128 + hidden_act: str = "silu" + max_position_embeddings: int = 196608 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-06 + use_cache: bool = True + pad_token_id: int | None = None + bos_token_id: int | None = 200034 + eos_token_id: int | list[int] | None = 200020 + tie_word_embeddings: bool = False + attention_dropout: float | int = 0.0 + num_experts_per_tok: int = 8 + num_local_experts: int = 256 + output_router_logits: bool = False + router_aux_loss_coef: float = 0.001 + router_jitter_noise: float = 0.0 + rope_parameters: RopeParameters | dict | None = None class MiniMaxM2TopKRouter(MixtralTopKRouter): diff --git a/src/transformers/models/ministral/configuration_ministral.py b/src/transformers/models/ministral/configuration_ministral.py index 1b30af15b85c..2b1275ef44bb 100644 --- a/src/transformers/models/ministral/configuration_ministral.py +++ b/src/transformers/models/ministral/configuration_ministral.py @@ -4,12 +4,30 @@ # the file from the modular. 
If any change should be done, please apply the change to the # modular_ministral.py file directly. One of our CI enforces this. # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# Copyright 2025 Mistral AI and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring @auto_docstring(checkpoint="mistralai/Ministral-8B-Instruct-2410") +@strict(accept_kwargs=True) class MinistralConfig(PreTrainedConfig): r""" Example: @@ -45,63 +63,38 @@ class MinistralConfig(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size: int | None = 32000, - hidden_size: int | None = 4096, - intermediate_size: int | None = 14336, - num_hidden_layers: int | None = 32, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = 8, - head_dim: int | None = None, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 4096 * 32, - initializer_range: float | None = 0.02, - rms_norm_eps: float | None = 1e-6, - use_cache: bool | None = True, - pad_token_id: int | None = None, - bos_token_id: int | None = 1, - eos_token_id: int | None = 2, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | None = None, - sliding_window: int | None = 4096, - attention_dropout: float | None = 0.0, - layer_types: list[str] | None = None, - **kwargs, - ): - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.sliding_window = sliding_window - self.head_dim = head_dim - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_dropout = attention_dropout - self.layer_types = layer_types + vocab_size: int = 32000 + hidden_size: int = 4096 + intermediate_size: int = 14336 + num_hidden_layers: int = 32 + num_attention_heads: int = 32 + num_key_value_heads: int = 8 + head_dim: int | None = None + hidden_act: str = "silu" + max_position_embeddings: int = 4096 * 32 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-6 + use_cache: bool = True + pad_token_id: int | None = None + bos_token_id: int | None = 1 + eos_token_id: int | list[int] | None = 2 + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + sliding_window: int | None = 4096 + 
attention_dropout: float | int = 0.0 + + layer_types: list[str] | None = None + + def __post_init__(self, **kwargs): + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads if self.layer_types is None: self.layer_types = [ "sliding_attention" if self.sliding_window is not None else "full_attention" - ] * num_hidden_layers - - self.rope_parameters = rope_parameters + ] * self.num_hidden_layers - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["MinistralConfig"] diff --git a/src/transformers/models/ministral/modeling_ministral.py b/src/transformers/models/ministral/modeling_ministral.py index d9856ca49694..57f39907610f 100644 --- a/src/transformers/models/ministral/modeling_ministral.py +++ b/src/transformers/models/ministral/modeling_ministral.py @@ -4,6 +4,21 @@ # the file from the modular. If any change should be done, please apply the change to the # modular_ministral.py file directly. One of our CI enforces this. # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# Copyright 2025 Mistral AI and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + from collections.abc import Callable from typing import Optional diff --git a/src/transformers/models/ministral/modular_ministral.py b/src/transformers/models/ministral/modular_ministral.py index 0c27cfb5316b..2e433d62e9d6 100644 --- a/src/transformers/models/ministral/modular_ministral.py +++ b/src/transformers/models/ministral/modular_ministral.py @@ -1,11 +1,26 @@ +# Copyright 2025 Mistral AI and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
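
In the Ministral hunks above, `layer_types` is now derived in `__post_init__` from whether `sliding_window` is set (its default stays 4096). A quick illustration of the resulting defaults (illustrative only):

from transformers import MinistralConfig

cfg = MinistralConfig(num_hidden_layers=2)                      # sliding_window defaults to 4096
assert cfg.layer_types == ["sliding_attention", "sliding_attention"]

cfg = MinistralConfig(num_hidden_layers=2, sliding_window=None)
assert cfg.layer_types == ["full_attention", "full_attention"]
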
+ + import torch +from huggingface_hub.dataclasses import strict from torch import nn from ...cache_utils import Cache, DynamicCache from ...configuration_utils import PreTrainedConfig from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask from ...modeling_outputs import BaseModelOutputWithPast -from ...modeling_rope_utils import RopeParameters from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring from ...utils.generic import merge_with_config_defaults @@ -27,7 +42,8 @@ @auto_docstring(checkpoint="mistralai/Ministral-8B-Instruct-2410") -class MinistralConfig(MistralConfig, PreTrainedConfig): +@strict(accept_kwargs=True) +class MinistralConfig(MistralConfig): r""" Example: @@ -46,63 +62,18 @@ class MinistralConfig(MistralConfig, PreTrainedConfig): model_type = "ministral" - def __init__( - self, - vocab_size: int | None = 32000, - hidden_size: int | None = 4096, - intermediate_size: int | None = 14336, - num_hidden_layers: int | None = 32, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = 8, - head_dim: int | None = None, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 4096 * 32, - initializer_range: float | None = 0.02, - rms_norm_eps: float | None = 1e-6, - use_cache: bool | None = True, - pad_token_id: int | None = None, - bos_token_id: int | None = 1, - eos_token_id: int | None = 2, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | None = None, - sliding_window: int | None = 4096, - attention_dropout: float | None = 0.0, - layer_types: list[str] | None = None, - **kwargs, - ): - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.sliding_window = sliding_window - self.head_dim = head_dim - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_dropout = attention_dropout - self.layer_types = layer_types + layer_types: list[str] | None = None + + def __post_init__(self, **kwargs): + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads if self.layer_types is None: self.layer_types = [ "sliding_attention" if self.sliding_window is not None else "full_attention" - ] * num_hidden_layers - - self.rope_parameters = rope_parameters + ] * self.num_hidden_layers - PreTrainedConfig.__init__(self, **kwargs) + PreTrainedConfig.__post_init__(self, **kwargs) class MinistralMLP(Qwen2MLP): diff --git a/src/transformers/models/ministral3/configuration_ministral3.py b/src/transformers/models/ministral3/configuration_ministral3.py index 6ffb51159bd7..06cc3aacc5d0 100644 --- a/src/transformers/models/ministral3/configuration_ministral3.py +++ b/src/transformers/models/ministral3/configuration_ministral3.py @@ -13,6 +13,8 @@ # limitations under the License. 
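
The modular `MinistralConfig` above now inherits from `MistralConfig` alone, and its `__post_init__` ends with an explicit `PreTrainedConfig.__post_init__(self, **kwargs)` call instead of `super().__post_init__(...)`, mirroring the old explicit `PreTrainedConfig.__init__` call and bypassing `MistralConfig`'s own post-init (which, per the mistral hunk further down, warns when `layer_types` is passed). A toy sketch of that dispatch pattern, with stand-in class names that are not part of the PR:

class Base:
    def __post_init__(self, **kwargs):
        print("base post-init")

class Parent(Base):
    def __post_init__(self, **kwargs):
        print("parent post-init")           # the step being bypassed
        super().__post_init__(**kwargs)

class Child(Parent):
    def __post_init__(self, **kwargs):
        Base.__post_init__(self, **kwargs)  # call the base directly, skipping Parent

Child().__post_init__()                     # prints only "base post-init"
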
"""Ministral model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring, logging @@ -22,6 +24,7 @@ @auto_docstring(checkpoint="mistralai/Ministral-3-8B-Base-2512") +@strict(accept_kwargs=True) class Ministral3Config(PreTrainedConfig): r""" Example: @@ -65,37 +68,36 @@ class Ministral3Config(PreTrainedConfig): "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), "norm": (["hidden_states"], ["hidden_states"]), } - - def __init__( - self, - vocab_size: int | None = 131072, - hidden_size: int | None = 4096, - intermediate_size: int | None = 14336, - num_hidden_layers: int | None = 34, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = 8, - head_dim: int | None = 128, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 262144, - initializer_range: float | None = 0.02, - rms_norm_eps: float | None = 1e-5, - use_cache: bool | None = True, - pad_token_id: int | None = 11, - bos_token_id: int | None = 1, - eos_token_id: int | None = 2, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - sliding_window: int | None = None, - attention_dropout: float | None = 0.0, - **kwargs, - ): - if rope_parameters is None: - rope_parameters = { + ignore_keys_at_rope_validation = {"llama_4_scaling_beta", "max_position_embeddings"} + + vocab_size: int = 131072 + hidden_size: int = 4096 + intermediate_size: int = 14336 + num_hidden_layers: int = 34 + num_attention_heads: int = 32 + num_key_value_heads: int = 8 + head_dim: int = 128 + hidden_act: str = "silu" + max_position_embeddings: int = 262144 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-5 + use_cache: bool = True + pad_token_id: int | None = 11 + bos_token_id: int | None = 1 + eos_token_id: int | list[int] | None = 2 + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + sliding_window: int | None = None + attention_dropout: float | int = 0.0 + + def __post_init__(self, **kwargs): + if self.rope_parameters is None: + self.rope_parameters = { "type": "yarn", "rope_theta": 1000000.0, "factor": 16.0, "original_max_position_embeddings": 16384, - "max_position_embeddings": max_position_embeddings, + "max_position_embeddings": self.max_position_embeddings, "beta_fast": 32.0, "beta_slow": 1.0, "mscale_all_dim": 1.0, @@ -103,41 +105,17 @@ def __init__( "llama_4_scaling_beta": 0.1, } - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.sliding_window = sliding_window - self.head_dim = head_dim if head_dim is not None else hidden_size // num_attention_heads - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_dropout = attention_dropout + self.head_dim = self.head_dim if self.head_dim is not None else self.hidden_size // self.num_attention_heads + + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads if "layer_types" in kwargs: 
logger.warning_once( "Detected Mistral model with layer_types. Consider using AutoModel or Ministral classes instead to enable alternating attention compatibility." ) - self.rope_parameters = rope_parameters - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - - super().__init__( - ignore_keys_at_rope_validation={"llama_4_scaling_beta", "max_position_embeddings"}, - **kwargs, - ) + super().__post_init__(**kwargs) __all__ = ["Ministral3Config"] diff --git a/src/transformers/models/mistral/configuration_mistral.py b/src/transformers/models/mistral/configuration_mistral.py index 7bc24445ff90..c7b2a8dfbdcb 100644 --- a/src/transformers/models/mistral/configuration_mistral.py +++ b/src/transformers/models/mistral/configuration_mistral.py @@ -13,6 +13,8 @@ # limitations under the License. """Mistral model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring, logging @@ -22,6 +24,7 @@ @auto_docstring(checkpoint="mistralai/Mistral-7B-v0.1") +@strict(accept_kwargs=True) class MistralConfig(PreTrainedConfig): r""" Example: @@ -57,61 +60,36 @@ class MistralConfig(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size: int | None = 32000, - hidden_size: int | None = 4096, - intermediate_size: int | None = 14336, - num_hidden_layers: int | None = 32, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = 8, - head_dim: int | None = None, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 4096 * 32, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-6, - use_cache: bool | None = True, - pad_token_id: int | None = None, - bos_token_id: int | None = 1, - eos_token_id: int | None = 2, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - sliding_window: int | None = 4096, - attention_dropout: float | None = 0.0, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.sliding_window = sliding_window - self.head_dim = head_dim if head_dim is not None else hidden_size // num_attention_heads - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_dropout = attention_dropout + vocab_size: int = 32000 + hidden_size: int = 4096 + intermediate_size: int = 14336 + num_hidden_layers: int = 32 + num_attention_heads: int = 32 + num_key_value_heads: int = 8 + head_dim: int | None = None + hidden_act: str = "silu" + max_position_embeddings: int = 4096 * 32 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-6 + use_cache: bool = True + pad_token_id: int | None = None + bos_token_id: int | None = 1 + eos_token_id: int | list[int] | None = 2 + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + sliding_window: int | None = 4096 + attention_dropout: float | 
int = 0.0 + + def __post_init__(self, **kwargs): + self.head_dim = self.head_dim if self.head_dim is not None else self.hidden_size // self.num_attention_heads + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads if "layer_types" in kwargs: logger.warning_once( "Detected Mistral model with layer_types. Consider using AutoModel or Ministral classes instead to enable alternating attention compatibility." ) - - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.rope_parameters = rope_parameters - - super().__init__(**kwargs) + return super().__post_init__(**kwargs) __all__ = ["MistralConfig"] diff --git a/src/transformers/models/mistral3/configuration_mistral3.py b/src/transformers/models/mistral3/configuration_mistral3.py index 5b087b658b2c..3eb2b6126389 100644 --- a/src/transformers/models/mistral3/configuration_mistral3.py +++ b/src/transformers/models/mistral3/configuration_mistral3.py @@ -13,12 +13,16 @@ # See the License for the specific language governing permissions and # limitations under the License. + +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring from ..auto import CONFIG_MAPPING, AutoConfig @auto_docstring(checkpoint="mistralai/Mistral-Small-3.1-24B-Instruct-2503") +@strict(accept_kwargs=True) class Mistral3Config(PreTrainedConfig): r""" Example: @@ -49,28 +53,21 @@ class Mistral3Config(PreTrainedConfig): sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} is_composition = True - def __init__( - self, - vision_config=None, - text_config=None, - image_token_index=10, - projector_hidden_act="gelu", - vision_feature_layer=-1, - multimodal_projector_bias=False, - spatial_merge_size=2, - tie_word_embeddings: bool = True, - **kwargs, - ): - self.image_token_index = image_token_index - self.projector_hidden_act = projector_hidden_act - - self.vision_feature_layer = vision_feature_layer - - if isinstance(vision_config, dict): - vision_config["model_type"] = vision_config.get("model_type", "pixtral") - vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) - elif vision_config is None: - vision_config = CONFIG_MAPPING["pixtral"]( + vision_config: dict | PreTrainedConfig | None = None + text_config: dict | PreTrainedConfig | None = None + image_token_index: int = 10 + projector_hidden_act: str = "gelu" + vision_feature_layer: int | list[int] = -1 + multimodal_projector_bias: bool = False + spatial_merge_size: int = 2 + tie_word_embeddings: bool = True + + def __post_init__(self, **kwargs): + if isinstance(self.vision_config, dict): + self.vision_config["model_type"] = self.vision_config.get("model_type", "pixtral") + self.vision_config = CONFIG_MAPPING[self.vision_config["model_type"]](**self.vision_config) + elif self.vision_config is None: + self.vision_config = CONFIG_MAPPING["pixtral"]( intermediate_size=4096, hidden_size=1024, patch_size=14, @@ -82,13 +79,11 @@ def __init__( hidden_act="gelu", ) - self.vision_config = vision_config - - if isinstance(text_config, dict): - text_config["model_type"] = text_config.get("model_type", "mistral") - text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) - elif text_config is None: - text_config = CONFIG_MAPPING["mistral"]( + if isinstance(self.text_config, dict): + self.text_config["model_type"] = self.text_config.get("model_type", "mistral") + 
self.text_config = CONFIG_MAPPING[self.text_config["model_type"]](**self.text_config) + elif self.text_config is None: + self.text_config = CONFIG_MAPPING["mistral"]( attention_dropout=0.0, head_dim=128, hidden_act="silu", @@ -107,12 +102,7 @@ def __init__( vocab_size=131072, ) - self.text_config = text_config - self.multimodal_projector_bias = multimodal_projector_bias - self.spatial_merge_size = spatial_merge_size - self.tie_word_embeddings = tie_word_embeddings - - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["Mistral3Config"] diff --git a/src/transformers/models/mistral3/modeling_mistral3.py b/src/transformers/models/mistral3/modeling_mistral3.py index 52cca1b5c43d..7484af287977 100644 --- a/src/transformers/models/mistral3/modeling_mistral3.py +++ b/src/transformers/models/mistral3/modeling_mistral3.py @@ -224,7 +224,7 @@ def get_image_features( self, pixel_values: torch.FloatTensor, image_sizes: torch.Tensor, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | list[int] | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple | BaseModelOutputWithPooling: @@ -290,7 +290,7 @@ def forward( position_ids: torch.LongTensor | None = None, past_key_values: Cache | None = None, inputs_embeds: torch.FloatTensor | None = None, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | list[int] | None = None, use_cache: bool | None = None, image_sizes: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], @@ -368,7 +368,7 @@ def get_image_features( self, pixel_values: torch.FloatTensor, image_sizes: torch.Tensor, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | list[int] | None = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple | BaseModelOutputWithPooling: return self.model.get_image_features( diff --git a/src/transformers/models/mistral3/modular_mistral3.py b/src/transformers/models/mistral3/modular_mistral3.py index 9ef463dc7a1f..cc125b7d217e 100644 --- a/src/transformers/models/mistral3/modular_mistral3.py +++ b/src/transformers/models/mistral3/modular_mistral3.py @@ -129,7 +129,7 @@ def get_image_features( self, pixel_values: torch.FloatTensor, image_sizes: torch.Tensor, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | list[int] | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple | BaseModelOutputWithPooling: @@ -171,7 +171,7 @@ def forward( position_ids: torch.LongTensor | None = None, past_key_values: Cache | None = None, inputs_embeds: torch.FloatTensor | None = None, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | list[int] | None = None, use_cache: bool | None = None, image_sizes: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], @@ -221,7 +221,7 @@ def get_image_features( self, pixel_values: torch.FloatTensor, image_sizes: torch.Tensor, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | list[int] | None = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple | BaseModelOutputWithPooling: return self.model.get_image_features( diff --git a/src/transformers/models/mixtral/configuration_mixtral.py b/src/transformers/models/mixtral/configuration_mixtral.py index fe59aa3b721c..c65e7f67439b 100644 --- a/src/transformers/models/mixtral/configuration_mixtral.py 
+++ b/src/transformers/models/mixtral/configuration_mixtral.py @@ -13,15 +13,15 @@ # limitations under the License. """Mixtral model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters -from ...utils import auto_docstring, logging - - -logger = logging.get_logger(__name__) +from ...utils import auto_docstring @auto_docstring(checkpoint="mistralai/Mixtral-8x7B-v0.1") +@strict(accept_kwargs=True) class MixtralConfig(PreTrainedConfig): r""" Example: @@ -56,70 +56,38 @@ class MixtralConfig(PreTrainedConfig): "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), "norm": (["hidden_states"], ["hidden_states"]), } - attribute_map = { - "num_experts": "num_local_experts", - } - - def __init__( - self, - vocab_size: int | None = 32000, - hidden_size: int | None = 4096, - intermediate_size: int | None = 14336, - num_hidden_layers: int | None = 32, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = 8, - head_dim: int | None = None, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 4096 * 32, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-5, - use_cache: bool | None = True, - pad_token_id: int | None = None, - bos_token_id: int | None = 1, - eos_token_id: int | None = 2, - tie_word_embeddings: bool | None = False, - sliding_window: int | None = None, - attention_dropout: float | None = 0.0, - num_experts_per_tok: int | None = 2, - num_local_experts: int | None = 8, - output_router_logits: bool | None = False, - router_aux_loss_coef: float | None = 0.001, - router_jitter_noise: float | None = 0.0, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.sliding_window = sliding_window - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_dropout = attention_dropout - self.head_dim = head_dim - - self.num_experts_per_tok = num_experts_per_tok - self.num_local_experts = num_local_experts - self.output_router_logits = output_router_logits - self.router_aux_loss_coef = router_aux_loss_coef - self.router_jitter_noise = router_jitter_noise - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.rope_parameters = rope_parameters - - super().__init__(**kwargs) + attribute_map = {"num_experts": "num_local_experts"} + + vocab_size: int = 32000 + hidden_size: int = 4096 + intermediate_size: int = 14336 + num_hidden_layers: int = 32 + num_attention_heads: int = 32 + num_key_value_heads: int = 8 + head_dim: int | None = None + hidden_act: str = "silu" + max_position_embeddings: int = 4096 * 32 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-5 + use_cache: bool = True + pad_token_id: int | None = None + bos_token_id: int | None = 1 + eos_token_id: int | list[int] | None = 2 + tie_word_embeddings: bool = False + sliding_window: int | None = 
None + attention_dropout: float | int = 0.0 + num_experts_per_tok: int = 2 + num_local_experts: int = 8 + output_router_logits: bool = False + router_aux_loss_coef: float = 0.001 + router_jitter_noise: float = 0.0 + rope_parameters: RopeParameters | dict | None = None + + def __post_init__(self, **kwargs): + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads + + super().__post_init__(**kwargs) __all__ = ["MixtralConfig"] diff --git a/src/transformers/models/mlcd/configuration_mlcd.py b/src/transformers/models/mlcd/configuration_mlcd.py index 88e7a7233b8c..598142ee984b 100644 --- a/src/transformers/models/mlcd/configuration_mlcd.py +++ b/src/transformers/models/mlcd/configuration_mlcd.py @@ -17,11 +17,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring @auto_docstring(checkpoint="DeepGlint-AI/mlcd-vit-bigG-patch14-336") +@strict(accept_kwargs=True) class MLCDVisionConfig(PreTrainedConfig): r""" num_key_value_groups (`int`, *optional*, defaults to 1): @@ -45,38 +48,19 @@ class MLCDVisionConfig(PreTrainedConfig): model_type = "mlcd_vision_model" base_config_key = "vision_config" - def __init__( - self, - hidden_size=1664, - intermediate_size=8192, - num_hidden_layers=48, - num_attention_heads=16, - num_key_value_groups=1, - num_channels=3, - image_size=336, - patch_size=14, - hidden_act="gelu", - layer_norm_eps=1e-5, - attention_dropout=0.0, - initializer_range=0.02, - initializer_factor=1.0, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_key_value_groups = num_key_value_groups - self.num_channels = num_channels - self.patch_size = patch_size - self.image_size = image_size - self.initializer_range = initializer_range - self.initializer_factor = initializer_factor - self.attention_dropout = attention_dropout - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act + hidden_size: int = 1664 + intermediate_size: int = 8192 + num_hidden_layers: int = 48 + num_attention_heads: int = 16 + num_key_value_groups: int = 1 + num_channels: int = 3 + image_size: int | list[int] | tuple[int, int] = 336 + patch_size: int | list[int] | tuple[int, int] = 14 + hidden_act: str = "gelu" + layer_norm_eps: float = 1e-5 + attention_dropout: float | int = 0.0 + initializer_range: float = 0.02 + initializer_factor: float = 1.0 __all__ = ["MLCDVisionConfig"] diff --git a/src/transformers/models/mlcd/modeling_mlcd.py b/src/transformers/models/mlcd/modeling_mlcd.py index 8d0ec26c23cf..efc0bb807d2d 100644 --- a/src/transformers/models/mlcd/modeling_mlcd.py +++ b/src/transformers/models/mlcd/modeling_mlcd.py @@ -240,11 +240,6 @@ def __init__(self, config: MLCDVisionConfig): self.embed_dim = config.hidden_size self.num_heads = config.num_attention_heads self.head_dim = self.embed_dim // self.num_heads - if self.head_dim * self.num_heads != self.embed_dim: - raise ValueError( - f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" - f" {self.num_heads})." 
- ) self.scale = self.head_dim**-0.5 self.dropout = config.attention_dropout self.is_causal = False diff --git a/src/transformers/models/mlcd/modular_mlcd.py b/src/transformers/models/mlcd/modular_mlcd.py index d5fbc7f31187..315dfd407059 100644 --- a/src/transformers/models/mlcd/modular_mlcd.py +++ b/src/transformers/models/mlcd/modular_mlcd.py @@ -15,6 +15,7 @@ import torch import torch.nn as nn +from huggingface_hub.dataclasses import strict from ... import initialization as init from ...configuration_utils import PreTrainedConfig @@ -39,6 +40,7 @@ @auto_docstring(checkpoint="DeepGlint-AI/mlcd-vit-bigG-patch14-336") +@strict(accept_kwargs=True) class MLCDVisionConfig(PreTrainedConfig): r""" num_key_value_groups (`int`, *optional*, defaults to 1): @@ -62,38 +64,19 @@ class MLCDVisionConfig(PreTrainedConfig): model_type = "mlcd_vision_model" base_config_key = "vision_config" - def __init__( - self, - hidden_size=1664, - intermediate_size=8192, - num_hidden_layers=48, - num_attention_heads=16, - num_key_value_groups=1, - num_channels=3, - image_size=336, - patch_size=14, - hidden_act="gelu", - layer_norm_eps=1e-5, - attention_dropout=0.0, - initializer_range=0.02, - initializer_factor=1.0, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_key_value_groups = num_key_value_groups - self.num_channels = num_channels - self.patch_size = patch_size - self.image_size = image_size - self.initializer_range = initializer_range - self.initializer_factor = initializer_factor - self.attention_dropout = attention_dropout - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act + hidden_size: int = 1664 + intermediate_size: int = 8192 + num_hidden_layers: int = 48 + num_attention_heads: int = 16 + num_key_value_groups: int = 1 + num_channels: int = 3 + image_size: int | list[int] | tuple[int, int] = 336 + patch_size: int | list[int] | tuple[int, int] = 14 + hidden_act: str = "gelu" + layer_norm_eps: float = 1e-5 + attention_dropout: float | int = 0.0 + initializer_range: float = 0.02 + initializer_factor: float = 1.0 class MLCDMLP(CLIPMLP): diff --git a/src/transformers/models/mllama/configuration_mllama.py b/src/transformers/models/mllama/configuration_mllama.py index a3e82ac9edae..553ed3711798 100644 --- a/src/transformers/models/mllama/configuration_mllama.py +++ b/src/transformers/models/mllama/configuration_mllama.py @@ -12,6 +12,8 @@ # limitations under the License. 
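# A minimal usage sketch (annotation, not part of this patch): with MLCDVisionConfig converted
# to annotated fields above, construction goes through the `@strict` dataclass machinery.
# Field names and defaults are taken from the diff; how `@strict` reports a bad type is an
# assumption and not shown here.
from transformers.models.mlcd.configuration_mlcd import MLCDVisionConfig

default_cfg = MLCDVisionConfig()                     # hidden_size=1664, patch_size=14, ...
tuple_cfg = MLCDVisionConfig(image_size=(336, 336))  # image_size now also admits an (H, W) tuple
assert default_cfg.hidden_size == 1664
assert tuple_cfg.image_size == (336, 336)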
"""Mllama model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring, logging @@ -20,6 +22,7 @@ @auto_docstring(checkpoint="meta-llama/Llama-3.2-11B-Vision") +@strict(accept_kwargs=True) class MllamaVisionConfig(PreTrainedConfig): r""" num_global_layers (`int`, *optional*, defaults to 8): @@ -53,50 +56,39 @@ class MllamaVisionConfig(PreTrainedConfig): model_type = "mllama_vision_model" base_config_key = "vision_config" - - def __init__( - self, - hidden_size: int = 1280, - hidden_act: str = "gelu", - num_hidden_layers: int = 32, - num_global_layers: int = 8, - num_attention_heads: int = 16, - num_channels: int = 3, - intermediate_size: int = 5120, - vision_output_dim: int = 7680, - image_size: int = 448, - patch_size: int = 14, - norm_eps: float = 1e-5, - max_num_tiles: int = 4, - intermediate_layers_indices: list[int] | None = None, - supported_aspect_ratios: list[list[int]] | None = None, - initializer_range: float = 0.02, - **kwargs, - ): - if supported_aspect_ratios is None: - if max_num_tiles != 4: - raise ValueError("max_num_tiles must be 4 for default supported aspect ratios") - supported_aspect_ratios = [[1, 1], [1, 2], [1, 3], [1, 4], [2, 1], [2, 2], [3, 1], [4, 1]] - - if intermediate_layers_indices is None: - intermediate_layers_indices = [3, 7, 15, 23, 30] - - self.hidden_size = hidden_size - self.hidden_act = hidden_act - self.num_hidden_layers = num_hidden_layers - self.num_channels = num_channels - self.intermediate_size = intermediate_size - self.image_size = image_size - self.vision_output_dim = vision_output_dim - self.patch_size = patch_size - self.intermediate_layers_indices = intermediate_layers_indices - self.num_global_layers = num_global_layers - self.max_num_tiles = max_num_tiles - self.norm_eps = norm_eps - self.attention_heads = num_attention_heads - self.supported_aspect_ratios = supported_aspect_ratios - self.initializer_range = initializer_range - super().__init__(**kwargs) + attribute_map = {"num_attention_heads": "attention_heads"} + + hidden_size: int = 1280 + hidden_act: str = "gelu" + num_hidden_layers: int = 32 + num_global_layers: int = 8 + attention_heads: int = 16 + num_channels: int = 3 + intermediate_size: int = 5120 + vision_output_dim: int = 7680 + image_size: int | list[int] | tuple[int, int] = 448 + patch_size: int | list[int] | tuple[int, int] = 14 + norm_eps: float = 1e-5 + max_num_tiles: int = 4 + intermediate_layers_indices: list[int] | None = None + supported_aspect_ratios: list[list[int]] | None = None + initializer_range: float = 0.02 + + def __post_init__(self, **kwargs): + if self.supported_aspect_ratios is None: + self.supported_aspect_ratios = [[1, 1], [1, 2], [1, 3], [1, 4], [2, 1], [2, 2], [3, 1], [4, 1]] + + if self.intermediate_layers_indices is None: + self.intermediate_layers_indices = [3, 7, 15, 23, 30] + super().__post_init__(**kwargs) + + def validate_architecture(self): + """Part of `@strict`-powered validation. 
Validates the architecture of the config.""" + if ( + self.supported_aspect_ratios == [[1, 1], [1, 2], [1, 3], [1, 4], [2, 1], [2, 2], [3, 1], [4, 1]] + and self.max_num_tiles != 4 + ): + raise ValueError("max_num_tiles must be 4 for default supported aspect ratios") @property def max_aspect_ratio_id(self) -> int: @@ -104,6 +96,7 @@ def max_aspect_ratio_id(self) -> int: @auto_docstring(checkpoint="meta-llama/Llama-3.2-11B-Vision") +@strict(accept_kwargs=True) class MllamaTextConfig(PreTrainedConfig): r""" cross_attention_layers (`list[int]`, *optional*): @@ -128,54 +121,33 @@ class MllamaTextConfig(PreTrainedConfig): base_config_key = "text_config" default_theta = 500000.0 - def __init__( - self, - vocab_size: int = 128256, - hidden_size: int = 4096, - hidden_act: str = "silu", - num_hidden_layers: int = 40, - num_attention_heads: int = 32, - num_key_value_heads: int = 8, - intermediate_size: int = 14_336, - rope_parameters: dict | None = None, - rms_norm_eps: float = 1e-5, - max_position_embeddings: int = 131_072, - initializer_range: float = 0.02, - use_cache: bool = True, - tie_word_embeddings: bool = False, - cross_attention_layers: list[int] | None = None, - dropout: float = 0, - bos_token_id: int = 128000, - eos_token_id: int = 128001, - pad_token_id: int | None = 128004, - **kwargs, - ): - if cross_attention_layers is None: - cross_attention_layers = [3, 8, 13, 18, 23, 28, 33, 38] - - self.vocab_size = vocab_size - self.num_hidden_layers = num_hidden_layers - self.cross_attention_layers = cross_attention_layers - self.hidden_size = hidden_size - self.num_attention_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads - self.initializer_range = initializer_range - self.use_cache = use_cache - self.rms_norm_eps = rms_norm_eps - self.intermediate_size = intermediate_size - self.dropout = dropout - self.hidden_act = hidden_act - self.max_position_embeddings = max_position_embeddings - self.rope_parameters = rope_parameters - - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - super().__init__(**kwargs) + vocab_size: int = 128256 + hidden_size: int = 4096 + hidden_act: str = "silu" + num_hidden_layers: int = 40 + num_attention_heads: int = 32 + num_key_value_heads: int = 8 + intermediate_size: int = 14_336 + rope_parameters: dict | None = None + rms_norm_eps: float = 1e-5 + max_position_embeddings: int = 131_072 + initializer_range: float = 0.02 + use_cache: bool = True + tie_word_embeddings: bool = False + cross_attention_layers: list[int] | None = None + dropout: float | int = 0.0 + bos_token_id: int = 128000 + eos_token_id: int | list[int] | None = 128001 + pad_token_id: int | None = 128004 + + def __post_init__(self, **kwargs): + if self.cross_attention_layers is None: + self.cross_attention_layers = [3, 8, 13, 18, 23, 28, 33, 38] + super().__post_init__(**kwargs) @auto_docstring(checkpoint="meta-llama/Llama-3.2-11B-Vision") +@strict(accept_kwargs=True) class MllamaConfig(PreTrainedConfig): r""" Example: @@ -205,32 +177,24 @@ class MllamaConfig(PreTrainedConfig): } sub_configs = {"text_config": MllamaTextConfig, "vision_config": MllamaVisionConfig} - def __init__( - self, - vision_config=None, - text_config=None, - image_token_index=128256, - **kwargs, - ): - if vision_config is None: + vision_config: dict | PreTrainedConfig | None = None + text_config: dict | PreTrainedConfig | None = None + image_token_index: int = 128256 + + def __post_init__(self, 
**kwargs): + if self.vision_config is None: self.vision_config = MllamaVisionConfig() logger.info("vision_config is None, using default mllama vision config") - elif isinstance(vision_config, dict): - self.vision_config = MllamaVisionConfig(**vision_config) - elif isinstance(vision_config, MllamaVisionConfig): - self.vision_config = vision_config - - self.image_token_index = image_token_index + elif isinstance(self.vision_config, dict): + self.vision_config = MllamaVisionConfig(**self.vision_config) - if text_config is None: + if self.text_config is None: self.text_config = MllamaTextConfig() logger.info("text_config is None, using default mllama text config") - elif isinstance(text_config, dict): - self.text_config = MllamaTextConfig(**text_config) - elif isinstance(text_config, MllamaTextConfig): - self.text_config = text_config + elif isinstance(self.text_config, dict): + self.text_config = MllamaTextConfig(**self.text_config) - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["MllamaConfig"] diff --git a/src/transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py b/src/transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py index b86feb00ab39..4f7734d8457f 100644 --- a/src/transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +++ b/src/transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py @@ -17,6 +17,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from huggingface_hub.dataclasses import strict + from ...backbone_utils import consolidate_backbone_kwargs_to_config from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring, logging @@ -27,6 +29,7 @@ @auto_docstring(checkpoint="openmmlab-community/mm_grounding_dino_tiny_o365v1_goldg_v3det") +@strict(accept_kwargs=True) class MMGroundingDinoConfig(PreTrainedConfig): r""" num_queries (`int`, *optional*, defaults to 900): @@ -85,110 +88,62 @@ class MMGroundingDinoConfig(PreTrainedConfig): "num_attention_heads": "encoder_attention_heads", } - def __init__( - self, - backbone_config=None, - text_config=None, - num_queries=900, - encoder_layers=6, - encoder_ffn_dim=2048, - encoder_attention_heads=8, - decoder_layers=6, - decoder_ffn_dim=2048, - decoder_attention_heads=8, - is_encoder_decoder=True, - activation_function="relu", - d_model=256, - dropout=0.1, - attention_dropout=0.0, - activation_dropout=0.0, - auxiliary_loss=False, - position_embedding_type="sine", - num_feature_levels=4, - encoder_n_points=4, - decoder_n_points=4, - two_stage=True, - class_cost=1.0, - bbox_cost=5.0, - giou_cost=2.0, - bbox_loss_coefficient=5.0, - giou_loss_coefficient=2.0, - focal_alpha=0.25, - disable_custom_kernels=False, - # other parameters - max_text_len=256, - text_enhancer_dropout=0.0, - fusion_droppath=0.1, - fusion_dropout=0.0, - embedding_init_target=True, - query_dim=4, - positional_embedding_temperature=20, - init_std=0.02, - layer_norm_eps=1e-5, - tie_word_embeddings=True, - **kwargs, - ): - backbone_config, kwargs = consolidate_backbone_kwargs_to_config( - backbone_config=backbone_config, + backbone_config: dict | PreTrainedConfig | None = None + text_config: dict | PreTrainedConfig | None = None + num_queries: int = 900 + encoder_layers: int = 6 + encoder_ffn_dim: int = 2048 + encoder_attention_heads: int = 8 + decoder_layers: int = 6 + decoder_ffn_dim: int = 2048 + decoder_attention_heads: 
int = 8 + is_encoder_decoder: bool = True + activation_function: str = "relu" + d_model: int = 256 + dropout: float | int = 0.1 + attention_dropout: float | int = 0.0 + activation_dropout: float | int = 0.0 + auxiliary_loss: bool = False + position_embedding_type: str = "sine" + num_feature_levels: int = 4 + encoder_n_points: int = 4 + decoder_n_points: int = 4 + two_stage: bool = True + class_cost: float = 1.0 + bbox_cost: float = 5.0 + giou_cost: float = 2.0 + bbox_loss_coefficient: float = 5.0 + giou_loss_coefficient: float = 2.0 + focal_alpha: float = 0.25 + disable_custom_kernels: bool = False + max_text_len: int = 256 + text_enhancer_dropout: float | int = 0.0 + fusion_droppath: float = 0.1 + fusion_dropout: float | int = 0.0 + embedding_init_target: bool = True + query_dim: int = 4 + positional_embedding_temperature: int = 20 + init_std: float = 0.02 + layer_norm_eps: float = 1e-5 + tie_word_embeddings: bool = True + + def __post_init__(self, **kwargs): + self.backbone_config, kwargs = consolidate_backbone_kwargs_to_config( + backbone_config=self.backbone_config, default_config_type="swin", default_config_kwargs={"out_indices": [2, 3, 4]}, **kwargs, ) - self.backbone_config = backbone_config - self.num_queries = num_queries - self.d_model = d_model - self.encoder_ffn_dim = encoder_ffn_dim - self.encoder_layers = encoder_layers - self.encoder_attention_heads = encoder_attention_heads - self.decoder_ffn_dim = decoder_ffn_dim - self.decoder_layers = decoder_layers - self.decoder_attention_heads = decoder_attention_heads - self.dropout = dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.activation_function = activation_function - self.auxiliary_loss = auxiliary_loss - self.position_embedding_type = position_embedding_type - # deformable attributes - self.num_feature_levels = num_feature_levels - self.encoder_n_points = encoder_n_points - self.decoder_n_points = decoder_n_points - self.two_stage = two_stage - # Hungarian matcher - self.class_cost = class_cost - self.bbox_cost = bbox_cost - self.giou_cost = giou_cost - # Loss coefficients - self.bbox_loss_coefficient = bbox_loss_coefficient - self.giou_loss_coefficient = giou_loss_coefficient - self.focal_alpha = focal_alpha - self.disable_custom_kernels = disable_custom_kernels # Text backbone - if isinstance(text_config, dict): - text_config["model_type"] = text_config.get("model_type", "bert") - text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) - elif text_config is None: + if isinstance(self.text_config, dict): + self.text_config["model_type"] = self.text_config.get("model_type", "bert") + self.text_config = CONFIG_MAPPING[self.text_config["model_type"]](**self.text_config) + elif self.text_config is None: logger.info("text_config is None. 
Initializing the text config with default values (`BertConfig`).") - text_config = CONFIG_MAPPING["bert"]() - - self.text_config = text_config - self.max_text_len = max_text_len - - # Text Enhancer - self.text_enhancer_dropout = text_enhancer_dropout - # Fusion - self.fusion_droppath = fusion_droppath - self.fusion_dropout = fusion_dropout - # Others - self.embedding_init_target = embedding_init_target - self.query_dim = query_dim - self.positional_embedding_temperature = positional_embedding_temperature - self.init_std = init_std - self.layer_norm_eps = layer_norm_eps - self.tie_word_embeddings = tie_word_embeddings - - super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + self.text_config = CONFIG_MAPPING["bert"]() + + super().__post_init__(**kwargs) __all__ = ["MMGroundingDinoConfig"] diff --git a/src/transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py b/src/transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py index 6acaccf70023..e037ce850fe8 100644 --- a/src/transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +++ b/src/transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py @@ -1197,7 +1197,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict reference_points = self.get_reference_points(spatial_shapes_list, valid_ratios, device=vision_features.device) @@ -1490,7 +1490,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if inputs_embeds is not None: hidden_states = inputs_embeds @@ -1971,7 +1971,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict text_self_attention_masks, position_ids = generate_masks_with_special_tokens_and_transfer_map(input_ids) @@ -2471,7 +2471,7 @@ def forward( Detected a cat with confidence 0.438 at location [12.27, 51.91, 316.86, 472.44] Detected a remote control with confidence 0.478 at location [38.57, 70.0, 176.78, 118.18] ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if attention_mask is None: attention_mask = torch.ones_like(input_ids) diff --git a/src/transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py b/src/transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py index 8f910c026525..42c3bd00e9e6 100644 --- a/src/transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +++ b/src/transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py @@ -14,6 +14,7 @@ import math import torch +from huggingface_hub.dataclasses import strict from torch import nn from ... 
import initialization as init @@ -40,6 +41,7 @@ @auto_docstring(checkpoint="openmmlab-community/mm_grounding_dino_tiny_o365v1_goldg_v3det") +@strict(accept_kwargs=True) class MMGroundingDinoConfig(PreTrainedConfig): r""" num_queries (`int`, *optional*, defaults to 900): @@ -98,110 +100,62 @@ class MMGroundingDinoConfig(PreTrainedConfig): "num_attention_heads": "encoder_attention_heads", } - def __init__( - self, - backbone_config=None, - text_config=None, - num_queries=900, - encoder_layers=6, - encoder_ffn_dim=2048, - encoder_attention_heads=8, - decoder_layers=6, - decoder_ffn_dim=2048, - decoder_attention_heads=8, - is_encoder_decoder=True, - activation_function="relu", - d_model=256, - dropout=0.1, - attention_dropout=0.0, - activation_dropout=0.0, - auxiliary_loss=False, - position_embedding_type="sine", - num_feature_levels=4, - encoder_n_points=4, - decoder_n_points=4, - two_stage=True, - class_cost=1.0, - bbox_cost=5.0, - giou_cost=2.0, - bbox_loss_coefficient=5.0, - giou_loss_coefficient=2.0, - focal_alpha=0.25, - disable_custom_kernels=False, - # other parameters - max_text_len=256, - text_enhancer_dropout=0.0, - fusion_droppath=0.1, - fusion_dropout=0.0, - embedding_init_target=True, - query_dim=4, - positional_embedding_temperature=20, - init_std=0.02, - layer_norm_eps=1e-5, - tie_word_embeddings=True, - **kwargs, - ): - backbone_config, kwargs = consolidate_backbone_kwargs_to_config( - backbone_config=backbone_config, + backbone_config: dict | PreTrainedConfig | None = None + text_config: dict | PreTrainedConfig | None = None + num_queries: int = 900 + encoder_layers: int = 6 + encoder_ffn_dim: int = 2048 + encoder_attention_heads: int = 8 + decoder_layers: int = 6 + decoder_ffn_dim: int = 2048 + decoder_attention_heads: int = 8 + is_encoder_decoder: bool = True + activation_function: str = "relu" + d_model: int = 256 + dropout: float | int = 0.1 + attention_dropout: float | int = 0.0 + activation_dropout: float | int = 0.0 + auxiliary_loss: bool = False + position_embedding_type: str = "sine" + num_feature_levels: int = 4 + encoder_n_points: int = 4 + decoder_n_points: int = 4 + two_stage: bool = True + class_cost: float = 1.0 + bbox_cost: float = 5.0 + giou_cost: float = 2.0 + bbox_loss_coefficient: float = 5.0 + giou_loss_coefficient: float = 2.0 + focal_alpha: float = 0.25 + disable_custom_kernels: bool = False + max_text_len: int = 256 + text_enhancer_dropout: float | int = 0.0 + fusion_droppath: float = 0.1 + fusion_dropout: float | int = 0.0 + embedding_init_target: bool = True + query_dim: int = 4 + positional_embedding_temperature: int = 20 + init_std: float = 0.02 + layer_norm_eps: float = 1e-5 + tie_word_embeddings: bool = True + + def __post_init__(self, **kwargs): + self.backbone_config, kwargs = consolidate_backbone_kwargs_to_config( + backbone_config=self.backbone_config, default_config_type="swin", default_config_kwargs={"out_indices": [2, 3, 4]}, **kwargs, ) - self.backbone_config = backbone_config - self.num_queries = num_queries - self.d_model = d_model - self.encoder_ffn_dim = encoder_ffn_dim - self.encoder_layers = encoder_layers - self.encoder_attention_heads = encoder_attention_heads - self.decoder_ffn_dim = decoder_ffn_dim - self.decoder_layers = decoder_layers - self.decoder_attention_heads = decoder_attention_heads - self.dropout = dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.activation_function = activation_function - self.auxiliary_loss = auxiliary_loss - self.position_embedding_type = 
position_embedding_type - # deformable attributes - self.num_feature_levels = num_feature_levels - self.encoder_n_points = encoder_n_points - self.decoder_n_points = decoder_n_points - self.two_stage = two_stage - # Hungarian matcher - self.class_cost = class_cost - self.bbox_cost = bbox_cost - self.giou_cost = giou_cost - # Loss coefficients - self.bbox_loss_coefficient = bbox_loss_coefficient - self.giou_loss_coefficient = giou_loss_coefficient - self.focal_alpha = focal_alpha - self.disable_custom_kernels = disable_custom_kernels # Text backbone - if isinstance(text_config, dict): - text_config["model_type"] = text_config.get("model_type", "bert") - text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) - elif text_config is None: + if isinstance(self.text_config, dict): + self.text_config["model_type"] = self.text_config.get("model_type", "bert") + self.text_config = CONFIG_MAPPING[self.text_config["model_type"]](**self.text_config) + elif self.text_config is None: logger.info("text_config is None. Initializing the text config with default values (`BertConfig`).") - text_config = CONFIG_MAPPING["bert"]() - - self.text_config = text_config - self.max_text_len = max_text_len - - # Text Enhancer - self.text_enhancer_dropout = text_enhancer_dropout - # Fusion - self.fusion_droppath = fusion_droppath - self.fusion_dropout = fusion_dropout - # Others - self.embedding_init_target = embedding_init_target - self.query_dim = query_dim - self.positional_embedding_temperature = positional_embedding_temperature - self.init_std = init_std - self.layer_norm_eps = layer_norm_eps - self.tie_word_embeddings = tie_word_embeddings - - super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + self.text_config = CONFIG_MAPPING["bert"]() + + super().__post_init__(**kwargs) class MMGroundingDinoContrastiveEmbedding(GroundingDinoContrastiveEmbedding): diff --git a/src/transformers/models/mobilebert/configuration_mobilebert.py b/src/transformers/models/mobilebert/configuration_mobilebert.py index 8336c9ed6f4d..ca582d7f5d67 100644 --- a/src/transformers/models/mobilebert/configuration_mobilebert.py +++ b/src/transformers/models/mobilebert/configuration_mobilebert.py @@ -13,14 +13,14 @@ # limitations under the License. 
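# Sketch of the recurring migration pattern in this patch (illustrative only; `MyConfig` is a
# hypothetical name): defaults move from `__init__` arguments to annotated class attributes,
# and derived or backward-compatibility logic moves into `__post_init__`, which forwards any
# leftover kwargs to the parent config.
from huggingface_hub.dataclasses import strict

from transformers.configuration_utils import PreTrainedConfig


@strict(accept_kwargs=True)
class MyConfig(PreTrainedConfig):
    model_type = "my_model"

    hidden_size: int = 512
    num_attention_heads: int = 8
    num_key_value_heads: int | None = None

    def __post_init__(self, **kwargs):
        # Same backward-compatibility default as MixtralConfig earlier in this diff.
        if self.num_key_value_heads is None:
            self.num_key_value_heads = self.num_attention_heads
        super().__post_init__(**kwargs)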
"""MobileBERT model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="google/mobilebert-uncased") +@strict(accept_kwargs=True) class MobileBertConfig(PreTrainedConfig): r""" embedding_size (`int`, *optional*, defaults to 128): @@ -58,66 +58,37 @@ class MobileBertConfig(PreTrainedConfig): model_type = "mobilebert" - def __init__( - self, - vocab_size=30522, - hidden_size=512, - num_hidden_layers=24, - num_attention_heads=4, - intermediate_size=512, - hidden_act="relu", - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=2, - initializer_range=0.02, - layer_norm_eps=1e-12, - pad_token_id=0, - embedding_size=128, - trigram_input=True, - use_bottleneck=True, - intra_bottleneck_size=128, - use_bottleneck_attention=False, - key_query_shared_bottleneck=True, - num_feedforward_networks=4, - normalization_type="no_norm", - classifier_activation=True, - classifier_dropout=None, - tie_word_embeddings=True, - **kwargs, - ): - super().__init__(**kwargs) - self.pad_token_id = pad_token_id - self.tie_word_embeddings = tie_word_embeddings - - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.embedding_size = embedding_size - self.trigram_input = trigram_input - self.use_bottleneck = use_bottleneck - self.intra_bottleneck_size = intra_bottleneck_size - self.use_bottleneck_attention = use_bottleneck_attention - self.key_query_shared_bottleneck = key_query_shared_bottleneck - self.num_feedforward_networks = num_feedforward_networks - self.normalization_type = normalization_type - self.classifier_activation = classifier_activation - + vocab_size: int = 30522 + hidden_size: int = 512 + num_hidden_layers: int = 24 + num_attention_heads: int = 4 + intermediate_size: int = 512 + hidden_act: str = "relu" + hidden_dropout_prob: float = 0.0 + attention_probs_dropout_prob: float = 0.1 + max_position_embeddings: int = 512 + type_vocab_size: int = 2 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + pad_token_id: int | None = 0 + embedding_size: int = 128 + trigram_input: bool = True + use_bottleneck: bool = True + intra_bottleneck_size: int = 128 + use_bottleneck_attention: bool = False + key_query_shared_bottleneck: bool = True + num_feedforward_networks: int = 4 + normalization_type: str = "no_norm" + classifier_activation: bool = True + classifier_dropout: float | int | None = None + tie_word_embeddings: bool = True + + def __post_init__(self, **kwargs): if self.use_bottleneck: - self.true_hidden_size = intra_bottleneck_size + self.true_hidden_size = self.intra_bottleneck_size else: - self.true_hidden_size = hidden_size - - self.classifier_dropout = classifier_dropout + self.true_hidden_size = self.hidden_size + super().__post_init__(**kwargs) __all__ = ["MobileBertConfig"] diff --git 
a/src/transformers/models/mobilenet_v1/configuration_mobilenet_v1.py b/src/transformers/models/mobilenet_v1/configuration_mobilenet_v1.py index 507e35f0aebe..3e64b4ec7d38 100644 --- a/src/transformers/models/mobilenet_v1/configuration_mobilenet_v1.py +++ b/src/transformers/models/mobilenet_v1/configuration_mobilenet_v1.py @@ -13,14 +13,14 @@ # limitations under the License. """MobileNetV1 model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="google/mobilenet_v1_1.0_224") +@strict(accept_kwargs=True) class MobileNetV1Config(PreTrainedConfig): r""" min_depth (`int`, *optional*, defaults to 8): @@ -45,33 +45,20 @@ class MobileNetV1Config(PreTrainedConfig): model_type = "mobilenet_v1" - def __init__( - self, - num_channels=3, - image_size=224, - depth_multiplier=1.0, - min_depth=8, - hidden_act="relu6", - tf_padding=True, - classifier_dropout_prob=0.999, - initializer_range=0.02, - layer_norm_eps=0.001, - **kwargs, - ): - super().__init__(**kwargs) - - if depth_multiplier <= 0: + num_channels: int = 3 + image_size: int | list[int] | tuple[int, int] = 224 + depth_multiplier: float = 1.0 + min_depth: int = 8 + hidden_act: str = "relu6" + tf_padding: bool = True + classifier_dropout_prob: float = 0.999 + initializer_range: float = 0.02 + layer_norm_eps: float = 0.001 + + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + if self.depth_multiplier <= 0: raise ValueError("depth_multiplier must be greater than zero.") - self.num_channels = num_channels - self.image_size = image_size - self.depth_multiplier = depth_multiplier - self.min_depth = min_depth - self.hidden_act = hidden_act - self.tf_padding = tf_padding - self.classifier_dropout_prob = classifier_dropout_prob - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - __all__ = ["MobileNetV1Config"] diff --git a/src/transformers/models/mobilenet_v1/modeling_mobilenet_v1.py b/src/transformers/models/mobilenet_v1/modeling_mobilenet_v1.py index 69ec4785f5d7..8524248f7796 100755 --- a/src/transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +++ b/src/transformers/models/mobilenet_v1/modeling_mobilenet_v1.py @@ -197,7 +197,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -266,7 +266,7 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.mobilenet_v1(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict) diff --git a/src/transformers/models/mobilenet_v2/configuration_mobilenet_v2.py b/src/transformers/models/mobilenet_v2/configuration_mobilenet_v2.py index 2e7b9f641ff2..7eb7439fb24c 100644 --- a/src/transformers/models/mobilenet_v2/configuration_mobilenet_v2.py +++ b/src/transformers/models/mobilenet_v2/configuration_mobilenet_v2.py @@ -13,14 +13,14 @@ # limitations under the License. """MobileNetV2 model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="google/mobilenet_v2_1.0_224") +@strict(accept_kwargs=True) class MobileNetV2Config(PreTrainedConfig): r""" depth_divisible_by (`int`, *optional*, defaults to 8): @@ -59,45 +59,26 @@ class MobileNetV2Config(PreTrainedConfig): model_type = "mobilenet_v2" - def __init__( - self, - num_channels=3, - image_size=224, - depth_multiplier=1.0, - depth_divisible_by=8, - min_depth=8, - expand_ratio=6.0, - output_stride=32, - first_layer_is_expansion=True, - finegrained_output=True, - hidden_act="relu6", - tf_padding=True, - classifier_dropout_prob=0.8, - initializer_range=0.02, - layer_norm_eps=0.001, - semantic_loss_ignore_index=255, - **kwargs, - ): - super().__init__(**kwargs) - - if depth_multiplier <= 0: + num_channels: int = 3 + image_size: int | list[int] | tuple[int, int] = 224 + depth_multiplier: float = 1.0 + depth_divisible_by: int = 8 + min_depth: int = 8 + expand_ratio: float | int = 6.0 + output_stride: int = 32 + first_layer_is_expansion: bool = True + finegrained_output: bool = True + hidden_act: str = "relu6" + tf_padding: bool = True + classifier_dropout_prob: float = 0.8 + initializer_range: float = 0.02 + layer_norm_eps: float = 0.001 + semantic_loss_ignore_index: int = 255 + + def validate_architecture(self): + """Part of `@strict`-powered validation. 
Validates the architecture of the config.""" + if self.depth_multiplier <= 0: raise ValueError("depth_multiplier must be greater than zero.") - self.num_channels = num_channels - self.image_size = image_size - self.depth_multiplier = depth_multiplier - self.depth_divisible_by = depth_divisible_by - self.min_depth = min_depth - self.expand_ratio = expand_ratio - self.output_stride = output_stride - self.first_layer_is_expansion = first_layer_is_expansion - self.finegrained_output = finegrained_output - self.hidden_act = hidden_act - self.tf_padding = tf_padding - self.classifier_dropout_prob = classifier_dropout_prob - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.semantic_loss_ignore_index = semantic_loss_ignore_index - __all__ = ["MobileNetV2Config"] diff --git a/src/transformers/models/mobilenet_v2/modeling_mobilenet_v2.py b/src/transformers/models/mobilenet_v2/modeling_mobilenet_v2.py index 7648658c3050..a9b8d92cb589 100755 --- a/src/transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +++ b/src/transformers/models/mobilenet_v2/modeling_mobilenet_v2.py @@ -333,7 +333,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -402,7 +402,7 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.mobilenet_v2(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict) @@ -556,7 +556,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if labels is not None and self.config.num_labels == 1: raise ValueError("The number of labels should be greater than one") diff --git a/src/transformers/models/mobilevit/configuration_mobilevit.py b/src/transformers/models/mobilevit/configuration_mobilevit.py index 6e2081bbd8e1..16678ca6fee9 100644 --- a/src/transformers/models/mobilevit/configuration_mobilevit.py +++ b/src/transformers/models/mobilevit/configuration_mobilevit.py @@ -13,14 +13,14 @@ # limitations under the License. 
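# Illustrative sketch (not part of this patch): value checks such as the `depth_multiplier`
# guard now live in `validate_architecture`, which the docstrings above describe as part of
# the `@strict`-powered validation. Assuming that validation runs when the config is built,
# an invalid value is still rejected with the same message as before.
from transformers.models.mobilenet_v2.configuration_mobilenet_v2 import MobileNetV2Config

try:
    MobileNetV2Config(depth_multiplier=0.0)
except ValueError as err:
    print(err)  # depth_multiplier must be greater than zero.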
"""MobileViT model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="google/mobilenet_v2_1.0_224") +@strict(accept_kwargs=True) class MobileViTConfig(PreTrainedConfig): r""" neck_hidden_sizes (`list[int]`, *optional*, defaults to `[16, 32, 64, 96, 128, 160, 640]`): @@ -49,56 +49,27 @@ class MobileViTConfig(PreTrainedConfig): model_type = "mobilevit" - def __init__( - self, - num_channels=3, - image_size=256, - patch_size=2, - hidden_sizes=[144, 192, 240], - neck_hidden_sizes=[16, 32, 64, 96, 128, 160, 640], - num_attention_heads=4, - mlp_ratio=2.0, - expand_ratio=4.0, - hidden_act="silu", - conv_kernel_size=3, - output_stride=32, - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.0, - classifier_dropout_prob=0.1, - initializer_range=0.02, - layer_norm_eps=1e-5, - qkv_bias=True, - aspp_out_channels=256, - atrous_rates=[6, 12, 18], - aspp_dropout_prob=0.1, - semantic_loss_ignore_index=255, - **kwargs, - ): - super().__init__(**kwargs) - - self.num_channels = num_channels - self.image_size = image_size - self.patch_size = patch_size - self.hidden_sizes = hidden_sizes - self.neck_hidden_sizes = neck_hidden_sizes - self.num_attention_heads = num_attention_heads - self.mlp_ratio = mlp_ratio - self.expand_ratio = expand_ratio - self.hidden_act = hidden_act - self.conv_kernel_size = conv_kernel_size - self.output_stride = output_stride - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.classifier_dropout_prob = classifier_dropout_prob - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.qkv_bias = qkv_bias - - # decode head attributes for semantic segmentation - self.aspp_out_channels = aspp_out_channels - self.atrous_rates = atrous_rates - self.aspp_dropout_prob = aspp_dropout_prob - self.semantic_loss_ignore_index = semantic_loss_ignore_index + num_channels: int = 3 + image_size: int | list[int] | tuple[int, int] = 256 + patch_size: int | list[int] | tuple[int, int] = 2 + hidden_sizes: list[int] | tuple[int, ...] = (144, 192, 240) + neck_hidden_sizes: list[int] | tuple[int, ...] = (16, 32, 64, 96, 128, 160, 640) + num_attention_heads: int = 4 + mlp_ratio: float = 2.0 + expand_ratio: float = 4.0 + hidden_act: str = "silu" + conv_kernel_size: int = 3 + output_stride: int = 32 + hidden_dropout_prob: float = 0.1 + attention_probs_dropout_prob: float = 0.0 + classifier_dropout_prob: float = 0.1 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-5 + qkv_bias: bool = True + aspp_out_channels: int = 256 + atrous_rates: list[int] | tuple[int, ...] 
= (6, 12, 18) + aspp_dropout_prob: float = 0.1 + semantic_loss_ignore_index: int = 255 __all__ = ["MobileViTConfig"] diff --git a/src/transformers/models/mobilevit/modeling_mobilevit.py b/src/transformers/models/mobilevit/modeling_mobilevit.py index bdb4953da3e0..aff43a5daa7a 100755 --- a/src/transformers/models/mobilevit/modeling_mobilevit.py +++ b/src/transformers/models/mobilevit/modeling_mobilevit.py @@ -666,7 +666,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -736,7 +736,7 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.mobilevit(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict) @@ -927,7 +927,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if labels is not None and self.config.num_labels == 1: raise ValueError("The number of labels should be greater than one") diff --git a/src/transformers/models/mobilevitv2/configuration_mobilevitv2.py b/src/transformers/models/mobilevitv2/configuration_mobilevitv2.py index 044960e34409..9e10a38e04d2 100644 --- a/src/transformers/models/mobilevitv2/configuration_mobilevitv2.py +++ b/src/transformers/models/mobilevitv2/configuration_mobilevitv2.py @@ -13,14 +13,14 @@ # limitations under the License. 
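# Illustrative note (not part of this patch): list defaults such as `[144, 192, 240]` become
# tuples in the converted configs above, most likely because dataclass-style fields cannot
# carry mutable defaults. The standalone example below shows plain dataclasses rejecting a
# list default, while an equivalent tuple is accepted.
from dataclasses import dataclass

try:

    @dataclass
    class Broken:
        hidden_sizes: list[int] = [144, 192, 240]  # mutable default

except ValueError as err:
    print(err)  # mutable default <class 'list'> for field hidden_sizes is not allowed


@dataclass
class Fixed:
    hidden_sizes: tuple[int, ...] = (144, 192, 240)  # immutable default is fine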
"""MobileViTV2 model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="apple/mobilevitv2-1.0") +@strict(accept_kwargs=True) class MobileViTV2Config(PreTrainedConfig): r""" aspp_out_channels (`int`, *optional*, defaults to 512): @@ -57,54 +57,26 @@ class MobileViTV2Config(PreTrainedConfig): model_type = "mobilevitv2" - def __init__( - self, - num_channels=3, - image_size=256, - patch_size=2, - expand_ratio=2.0, - hidden_act="swish", - conv_kernel_size=3, - output_stride=32, - classifier_dropout_prob=0.1, - initializer_range=0.02, - layer_norm_eps=1e-5, - aspp_out_channels=512, - atrous_rates=[6, 12, 18], - aspp_dropout_prob=0.1, - semantic_loss_ignore_index=255, - n_attn_blocks=[2, 4, 3], - base_attn_unit_dims=[128, 192, 256], - width_multiplier=1.0, - ffn_multiplier=2, - attn_dropout=0.0, - ffn_dropout=0.0, - **kwargs, - ): - super().__init__(**kwargs) - - self.num_channels = num_channels - self.image_size = image_size - self.patch_size = patch_size - self.expand_ratio = expand_ratio - self.hidden_act = hidden_act - self.conv_kernel_size = conv_kernel_size - self.output_stride = output_stride - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.n_attn_blocks = n_attn_blocks - self.base_attn_unit_dims = base_attn_unit_dims - self.width_multiplier = width_multiplier - self.ffn_multiplier = ffn_multiplier - self.ffn_dropout = ffn_dropout - self.attn_dropout = attn_dropout - self.classifier_dropout_prob = classifier_dropout_prob - - # decode head attributes for semantic segmentation - self.aspp_out_channels = aspp_out_channels - self.atrous_rates = atrous_rates - self.aspp_dropout_prob = aspp_dropout_prob - self.semantic_loss_ignore_index = semantic_loss_ignore_index + num_channels: int = 3 + image_size: int | list[int] | tuple[int, int] = 256 + patch_size: int | list[int] | tuple[int, int] = 2 + expand_ratio: float = 2.0 + hidden_act: str = "swish" + conv_kernel_size: int = 3 + output_stride: int = 32 + classifier_dropout_prob: float = 0.1 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-5 + aspp_out_channels: int = 512 + atrous_rates: list[int] | tuple[int, ...] = (6, 12, 18) + aspp_dropout_prob: float = 0.1 + semantic_loss_ignore_index: int = 255 + n_attn_blocks: list[int] | tuple[int, ...] = (2, 4, 3) + base_attn_unit_dims: list[int] | tuple[int, ...] 
= (128, 192, 256) + width_multiplier: float = 1.0 + ffn_multiplier: int = 2 + attn_dropout: float | int = 0.0 + ffn_dropout: float | int = 0.0 __all__ = ["MobileViTV2Config"] diff --git a/src/transformers/models/mobilevitv2/modeling_mobilevitv2.py b/src/transformers/models/mobilevitv2/modeling_mobilevitv2.py index bda5646de0e0..2e28695ae1f2 100644 --- a/src/transformers/models/mobilevitv2/modeling_mobilevitv2.py +++ b/src/transformers/models/mobilevitv2/modeling_mobilevitv2.py @@ -629,7 +629,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -701,7 +701,7 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.mobilevitv2(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict) @@ -895,7 +895,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if labels is not None and self.config.num_labels == 1: raise ValueError("The number of labels should be greater than one") diff --git a/src/transformers/models/modernbert/configuration_modernbert.py b/src/transformers/models/modernbert/configuration_modernbert.py index 9ee390420946..35ed83d8c40e 100644 --- a/src/transformers/models/modernbert/configuration_modernbert.py +++ b/src/transformers/models/modernbert/configuration_modernbert.py @@ -21,15 +21,14 @@ from typing import Literal -from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters -from ...utils import auto_docstring, logging +from huggingface_hub.dataclasses import strict - -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="answerdotai/ModernBERT-base") +@strict(accept_kwargs=True) class ModernBertConfig(PreTrainedConfig): r""" initializer_cutoff_factor (`float`, *optional*, defaults to 2.0): @@ -84,107 +83,52 @@ class ModernBertConfig(PreTrainedConfig): keys_to_ignore_at_inference = ["past_key_values"] default_theta = {"global": 160_000.0, "local": 10_000.0} - def __setattr__(self, name, value): - if name == "reference_compile" and value is not None: - logger.warning_once( - "The `reference_compile` argument is deprecated and will be removed in `transformers v5.2.0`" - "Use `torch.compile()` directly on the model instead." 
- ) - value = None - super().__setattr__(name, value) - - def __init__( - self, - vocab_size: int | None = 50368, - hidden_size: int | None = 768, - intermediate_size: int | None = 1152, - num_hidden_layers: int | None = 22, - num_attention_heads: int | None = 12, - hidden_activation: str | None = "gelu", - max_position_embeddings: int | None = 8192, - initializer_range: float | None = 0.02, - initializer_cutoff_factor: float | None = 2.0, - norm_eps: float | None = 1e-5, - norm_bias: bool | None = False, - pad_token_id: int | None = 50283, - eos_token_id: int | None = 50282, - bos_token_id: int | None = 50281, - cls_token_id: int | None = 50281, - sep_token_id: int | None = 50282, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - layer_types: list[str] | None = None, - rope_parameters: dict[Literal["full_attention", "sliding_attention"], RopeParameters] | None = None, - local_attention: int | None = 128, - embedding_dropout: float | None = 0.0, - mlp_bias: bool | None = False, - mlp_dropout: float | None = 0.0, - decoder_bias: bool | None = True, - classifier_pooling: Literal["cls", "mean"] = "cls", - classifier_dropout: float | None = 0.0, - classifier_bias: bool | None = False, - classifier_activation: str | None = "gelu", - deterministic_flash_attn: bool | None = False, - sparse_prediction: bool | None = False, - sparse_pred_ignore_index: int | None = -100, - reference_compile: bool | None = None, # Deprecated - tie_word_embeddings: bool | None = True, - **kwargs, - ): - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.cls_token_id = cls_token_id - self.sep_token_id = sep_token_id - self.tie_word_embeddings = tie_word_embeddings - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.initializer_range = initializer_range - self.initializer_cutoff_factor = initializer_cutoff_factor - self.norm_eps = norm_eps - self.norm_bias = norm_bias - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.hidden_activation = hidden_activation - self.local_attention = local_attention - self.embedding_dropout = embedding_dropout - self.mlp_bias = mlp_bias - self.mlp_dropout = mlp_dropout - self.decoder_bias = decoder_bias - self.classifier_pooling = classifier_pooling - self.classifier_dropout = classifier_dropout - self.classifier_bias = classifier_bias - self.classifier_activation = classifier_activation - self.deterministic_flash_attn = deterministic_flash_attn - self.sparse_prediction = sparse_prediction - self.sparse_pred_ignore_index = sparse_pred_ignore_index - self.reference_compile = reference_compile - - if self.classifier_pooling not in ["cls", "mean"]: - raise ValueError( - f'Invalid value for `classifier_pooling`, should be either "cls" or "mean", but is {self.classifier_pooling}.' 
- ) - - self.layer_types = layer_types - + vocab_size: int = 50368 + hidden_size: int = 768 + intermediate_size: int = 1152 + num_hidden_layers: int = 22 + num_attention_heads: int = 12 + hidden_activation: str = "gelu" + max_position_embeddings: int = 8192 + initializer_range: float = 0.02 + initializer_cutoff_factor: float = 2.0 + norm_eps: float = 1e-5 + norm_bias: bool = False + pad_token_id: int | None = 50283 + eos_token_id: int | list[int] | None = 50282 + bos_token_id: int | None = 50281 + cls_token_id: int | None = 50281 + sep_token_id: int | None = 50282 + attention_bias: bool = False + attention_dropout: float | int = 0.0 + layer_types: list[str] | None = None + rope_parameters: dict[Literal["full_attention", "sliding_attention"], dict] | None = None + local_attention: int = 128 + embedding_dropout: float | int = 0.0 + mlp_bias: bool = False + mlp_dropout: float | int = 0.0 + decoder_bias: bool = True + classifier_pooling: Literal["cls", "mean"] = "cls" + classifier_dropout: float | int = 0.0 + classifier_bias: bool = False + classifier_activation: str = "gelu" + deterministic_flash_attn: bool = False + sparse_prediction: bool = False + sparse_pred_ignore_index: int = -100 + tie_word_embeddings: bool = True + + def __post_init__(self, **kwargs): # BC -> the pattern used to be a simple int, and it's still present in configs on the Hub - self.global_attn_every_n_layers = kwargs.get("global_attn_every_n_layers", 3) - + global_attn_every_n_layers = kwargs.get("global_attn_every_n_layers", 3) if self.layer_types is None: self.layer_types = [ - "sliding_attention" if bool(i % self.global_attn_every_n_layers) else "full_attention" + "sliding_attention" if bool(i % global_attn_every_n_layers) else "full_attention" for i in range(self.num_hidden_layers) ] - layer_type_validation(self.layer_types, self.num_hidden_layers) - self.rope_parameters = rope_parameters - super().__init__(**kwargs) + super().__post_init__(**kwargs) - def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation=None, **kwargs): + def convert_rope_params_to_dict(self, **kwargs): rope_scaling = kwargs.pop("rope_scaling", None) # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` @@ -212,7 +156,6 @@ def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation=None, **kwa # Standardize and validate the correctness of rotary position embeddings parameters self.standardize_rope_params() - self.validate_rope(ignore_keys=ignore_keys_at_rope_validation) return kwargs def to_dict(self): diff --git a/src/transformers/models/modernbert/modular_modernbert.py b/src/transformers/models/modernbert/modular_modernbert.py index 8f74ffda1aac..b3d6587c5a5c 100644 --- a/src/transformers/models/modernbert/modular_modernbert.py +++ b/src/transformers/models/modernbert/modular_modernbert.py @@ -17,12 +17,13 @@ from typing import Literal, Optional import torch +from huggingface_hub.dataclasses import strict from torch import nn from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ... 
import initialization as init from ...activations import ACT2FN -from ...configuration_utils import PreTrainedConfig, layer_type_validation +from ...configuration_utils import PreTrainedConfig from ...integrations import use_kernel_func_from_hub, use_kernelized_func from ...masking_utils import create_bidirectional_mask, create_bidirectional_sliding_window_mask from ...modeling_layers import GradientCheckpointingLayer @@ -34,7 +35,7 @@ SequenceClassifierOutput, TokenClassifierOutput, ) -from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, RopeParameters +from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring, logging @@ -48,6 +49,7 @@ @auto_docstring(checkpoint="answerdotai/ModernBERT-base") +@strict(accept_kwargs=True) class ModernBertConfig(PreTrainedConfig): r""" initializer_cutoff_factor (`float`, *optional*, defaults to 2.0): @@ -102,107 +104,52 @@ class ModernBertConfig(PreTrainedConfig): keys_to_ignore_at_inference = ["past_key_values"] default_theta = {"global": 160_000.0, "local": 10_000.0} - def __setattr__(self, name, value): - if name == "reference_compile" and value is not None: - logger.warning_once( - "The `reference_compile` argument is deprecated and will be removed in `transformers v5.2.0`" - "Use `torch.compile()` directly on the model instead." - ) - value = None - super().__setattr__(name, value) - - def __init__( - self, - vocab_size: int | None = 50368, - hidden_size: int | None = 768, - intermediate_size: int | None = 1152, - num_hidden_layers: int | None = 22, - num_attention_heads: int | None = 12, - hidden_activation: str | None = "gelu", - max_position_embeddings: int | None = 8192, - initializer_range: float | None = 0.02, - initializer_cutoff_factor: float | None = 2.0, - norm_eps: float | None = 1e-5, - norm_bias: bool | None = False, - pad_token_id: int | None = 50283, - eos_token_id: int | None = 50282, - bos_token_id: int | None = 50281, - cls_token_id: int | None = 50281, - sep_token_id: int | None = 50282, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - layer_types: list[str] | None = None, - rope_parameters: dict[Literal["full_attention", "sliding_attention"], RopeParameters] | None = None, - local_attention: int | None = 128, - embedding_dropout: float | None = 0.0, - mlp_bias: bool | None = False, - mlp_dropout: float | None = 0.0, - decoder_bias: bool | None = True, - classifier_pooling: Literal["cls", "mean"] = "cls", - classifier_dropout: float | None = 0.0, - classifier_bias: bool | None = False, - classifier_activation: str | None = "gelu", - deterministic_flash_attn: bool | None = False, - sparse_prediction: bool | None = False, - sparse_pred_ignore_index: int | None = -100, - reference_compile: bool | None = None, # Deprecated - tie_word_embeddings: bool | None = True, - **kwargs, - ): - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.cls_token_id = cls_token_id - self.sep_token_id = sep_token_id - self.tie_word_embeddings = tie_word_embeddings - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.initializer_range = initializer_range - self.initializer_cutoff_factor = 
initializer_cutoff_factor - self.norm_eps = norm_eps - self.norm_bias = norm_bias - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.hidden_activation = hidden_activation - self.local_attention = local_attention - self.embedding_dropout = embedding_dropout - self.mlp_bias = mlp_bias - self.mlp_dropout = mlp_dropout - self.decoder_bias = decoder_bias - self.classifier_pooling = classifier_pooling - self.classifier_dropout = classifier_dropout - self.classifier_bias = classifier_bias - self.classifier_activation = classifier_activation - self.deterministic_flash_attn = deterministic_flash_attn - self.sparse_prediction = sparse_prediction - self.sparse_pred_ignore_index = sparse_pred_ignore_index - self.reference_compile = reference_compile - - if self.classifier_pooling not in ["cls", "mean"]: - raise ValueError( - f'Invalid value for `classifier_pooling`, should be either "cls" or "mean", but is {self.classifier_pooling}.' - ) - - self.layer_types = layer_types - + vocab_size: int = 50368 + hidden_size: int = 768 + intermediate_size: int = 1152 + num_hidden_layers: int = 22 + num_attention_heads: int = 12 + hidden_activation: str = "gelu" + max_position_embeddings: int = 8192 + initializer_range: float = 0.02 + initializer_cutoff_factor: float = 2.0 + norm_eps: float = 1e-5 + norm_bias: bool = False + pad_token_id: int | None = 50283 + eos_token_id: int | list[int] | None = 50282 + bos_token_id: int | None = 50281 + cls_token_id: int | None = 50281 + sep_token_id: int | None = 50282 + attention_bias: bool = False + attention_dropout: float | int = 0.0 + layer_types: list[str] | None = None + rope_parameters: dict[Literal["full_attention", "sliding_attention"], dict] | None = None + local_attention: int = 128 + embedding_dropout: float | int = 0.0 + mlp_bias: bool = False + mlp_dropout: float | int = 0.0 + decoder_bias: bool = True + classifier_pooling: Literal["cls", "mean"] = "cls" + classifier_dropout: float | int = 0.0 + classifier_bias: bool = False + classifier_activation: str = "gelu" + deterministic_flash_attn: bool = False + sparse_prediction: bool = False + sparse_pred_ignore_index: int = -100 + tie_word_embeddings: bool = True + + def __post_init__(self, **kwargs): # BC -> the pattern used to be a simple int, and it's still present in configs on the Hub - self.global_attn_every_n_layers = kwargs.get("global_attn_every_n_layers", 3) - + global_attn_every_n_layers = kwargs.get("global_attn_every_n_layers", 3) if self.layer_types is None: self.layer_types = [ - "sliding_attention" if bool(i % self.global_attn_every_n_layers) else "full_attention" + "sliding_attention" if bool(i % global_attn_every_n_layers) else "full_attention" for i in range(self.num_hidden_layers) ] - layer_type_validation(self.layer_types, self.num_hidden_layers) - self.rope_parameters = rope_parameters - super().__init__(**kwargs) + super().__post_init__(**kwargs) - def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation=None, **kwargs): + def convert_rope_params_to_dict(self, **kwargs): rope_scaling = kwargs.pop("rope_scaling", None) # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. 
If we find `rope_parameters` @@ -230,7 +177,6 @@ def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation=None, **kwa # Standardize and validate the correctness of rotary position embeddings parameters self.standardize_rope_params() - self.validate_rope(ignore_keys=ignore_keys_at_rope_validation) return kwargs def to_dict(self): diff --git a/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py b/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py index 4ecc76e360fe..bc6460c7637c 100644 --- a/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +++ b/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py @@ -20,12 +20,14 @@ # limitations under the License. from typing import Literal +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring @auto_docstring(checkpoint="blab-jhu/test-32m-dec") +@strict(accept_kwargs=True) class ModernBertDecoderConfig(PreTrainedConfig): r""" initializer_cutoff_factor (`float`, *optional*, defaults to 2.0): @@ -69,89 +71,53 @@ class ModernBertDecoderConfig(PreTrainedConfig): keys_to_ignore_at_inference = ["past_key_values"] default_theta = {"global": 160_000.0, "local": 10_000.0} - def __init__( - self, - vocab_size: int | None = 50368, - hidden_size: int | None = 768, - intermediate_size: int | None = 1152, - num_hidden_layers: int | None = 22, - num_attention_heads: int | None = 12, - hidden_activation: str | None = "gelu", - max_position_embeddings: int | None = 8192, - initializer_range: float | None = 0.02, - initializer_cutoff_factor: float | None = 2.0, - norm_eps: int | None = 1e-5, - norm_bias: bool | None = False, - pad_token_id: int | None = 50283, - eos_token_id: int | None = 50282, - bos_token_id: int | None = 50281, - cls_token_id: int | None = 50281, - sep_token_id: int | None = 50282, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - embedding_dropout: float | None = 0.0, - mlp_bias: bool | None = False, - mlp_dropout: float | None = 0.0, - decoder_bias: bool | None = True, - classifier_dropout: float | None = 0.0, - classifier_bias: bool | None = False, - classifier_activation: str | None = "gelu", - use_cache: bool | None = True, - local_attention: int | None = 128, - global_attn_every_n_layers: int | None = 3, - layer_types: list[str] | None = None, - tie_word_embeddings: bool | None = True, - rope_parameters: dict[Literal["full_attention", "sliding_attention"], RopeParameters] | None = None, - **kwargs, - ): - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.cls_token_id = cls_token_id - self.sep_token_id = sep_token_id - self.tie_word_embeddings = tie_word_embeddings - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.initializer_range = initializer_range - self.initializer_cutoff_factor = initializer_cutoff_factor - self.norm_eps = norm_eps - self.norm_bias = norm_bias - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.hidden_activation = hidden_activation - self.embedding_dropout = embedding_dropout - self.mlp_bias = mlp_bias - self.mlp_dropout = 
mlp_dropout - self.decoder_bias = decoder_bias - self.classifier_dropout = classifier_dropout - self.classifier_bias = classifier_bias - self.classifier_activation = classifier_activation - self.use_cache = use_cache - self.global_attn_every_n_layers = global_attn_every_n_layers - # for consistency with ModernBert - self.reference_compile = False - - # Set up layer_types for standardized layer type detection - self.layer_types = layer_types + vocab_size: int = 50368 + hidden_size: int = 768 + intermediate_size: int = 1152 + num_hidden_layers: int = 22 + num_attention_heads: int = 12 + hidden_activation: str = "gelu" + max_position_embeddings: int = 8192 + initializer_range: float = 0.02 + initializer_cutoff_factor: float = 2.0 + norm_eps: float = 1e-5 + norm_bias: bool = False + pad_token_id: int = 50283 + eos_token_id: int | list[int] | None = 50282 + bos_token_id: int = 50281 + cls_token_id: int = 50281 + sep_token_id: int = 50282 + attention_bias: bool = False + attention_dropout: float | int = 0.0 + embedding_dropout: float | int = 0.0 + mlp_bias: bool = False + mlp_dropout: float | int = 0.0 + decoder_bias: bool = True + classifier_dropout: float | int = 0.0 + classifier_bias: bool = False + classifier_activation: str = "gelu" + use_cache: bool = True + local_attention: int | None = 128 + layer_types: list[str] | None = None + tie_word_embeddings: bool = True + rope_parameters: dict[Literal["full_attention", "sliding_attention"], dict] | None = None + + def __post_init__(self, **kwargs): + # BC -> the pattern used to be a simple int, and it's still present in configs on the Hub + global_attn_every_n_layers = kwargs.get("global_attn_every_n_layers", 3) if self.layer_types is None: - # Create layer_types based on the alternating pattern self.layer_types = [] - for layer_id in range(num_hidden_layers): + for layer_id in range(self.num_hidden_layers): if layer_id % global_attn_every_n_layers != 0: self.layer_types.append("sliding_attention") else: self.layer_types.append("full_attention") # NOTE: sliding window numbers matches ModernBERT but is only half of it - self.sliding_window = local_attention // 2 if local_attention else -1 - self.rope_parameters = rope_parameters - super().__init__(**kwargs) + self.sliding_window = self.local_attention // 2 if self.local_attention else -1 + super().__post_init__(**kwargs) - def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation=None, **kwargs): + def convert_rope_params_to_dict(self, **kwargs): rope_scaling = kwargs.pop("rope_scaling", None) # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. 
If we find `rope_parameters` @@ -179,7 +145,6 @@ def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation=None, **kwa # Standardize and validate the correctness of rotary position embeddings parameters self.standardize_rope_params() - self.validate_rope(ignore_keys=ignore_keys_at_rope_validation) return kwargs diff --git a/src/transformers/models/modernbert_decoder/modeling_modernbert_decoder.py b/src/transformers/models/modernbert_decoder/modeling_modernbert_decoder.py index 1cb956151c70..47294e277aca 100644 --- a/src/transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +++ b/src/transformers/models/modernbert_decoder/modeling_modernbert_decoder.py @@ -209,7 +209,7 @@ def eager_attention_forward( key: torch.Tensor, value: torch.Tensor, attention_mask: torch.Tensor | None, - dropout: float = 0.0, + dropout: float | int = 0.0, scaling: float | None = None, sliding_window: int | None = None, **kwargs, diff --git a/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py b/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py index f0c439847c9a..8c3ace76b059 100644 --- a/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py +++ b/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py @@ -17,6 +17,7 @@ from typing import Literal import torch +from huggingface_hub.dataclasses import strict from torch import nn from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss @@ -27,7 +28,7 @@ from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast -from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, RopeParameters +from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging @@ -47,6 +48,7 @@ @auto_docstring(checkpoint="blab-jhu/test-32m-dec") +@strict(accept_kwargs=True) class ModernBertDecoderConfig(PreTrainedConfig): r""" initializer_cutoff_factor (`float`, *optional*, defaults to 2.0): @@ -90,89 +92,53 @@ class ModernBertDecoderConfig(PreTrainedConfig): keys_to_ignore_at_inference = ["past_key_values"] default_theta = {"global": 160_000.0, "local": 10_000.0} - def __init__( - self, - vocab_size: int | None = 50368, - hidden_size: int | None = 768, - intermediate_size: int | None = 1152, - num_hidden_layers: int | None = 22, - num_attention_heads: int | None = 12, - hidden_activation: str | None = "gelu", - max_position_embeddings: int | None = 8192, - initializer_range: float | None = 0.02, - initializer_cutoff_factor: float | None = 2.0, - norm_eps: int | None = 1e-5, - norm_bias: bool | None = False, - pad_token_id: int | None = 50283, - eos_token_id: int | None = 50282, - bos_token_id: int | None = 50281, - cls_token_id: int | None = 50281, - sep_token_id: int | None = 50282, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - embedding_dropout: float | None = 0.0, - mlp_bias: bool | None = False, - mlp_dropout: float | None = 0.0, - decoder_bias: bool | None = True, - classifier_dropout: float | None = 0.0, - classifier_bias: bool | None = False, - classifier_activation: str | None = "gelu", - use_cache: bool | None = True, - local_attention: int | None = 128, - global_attn_every_n_layers: 
int | None = 3, - layer_types: list[str] | None = None, - tie_word_embeddings: bool | None = True, - rope_parameters: dict[Literal["full_attention", "sliding_attention"], RopeParameters] | None = None, - **kwargs, - ): - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.cls_token_id = cls_token_id - self.sep_token_id = sep_token_id - self.tie_word_embeddings = tie_word_embeddings - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.initializer_range = initializer_range - self.initializer_cutoff_factor = initializer_cutoff_factor - self.norm_eps = norm_eps - self.norm_bias = norm_bias - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.hidden_activation = hidden_activation - self.embedding_dropout = embedding_dropout - self.mlp_bias = mlp_bias - self.mlp_dropout = mlp_dropout - self.decoder_bias = decoder_bias - self.classifier_dropout = classifier_dropout - self.classifier_bias = classifier_bias - self.classifier_activation = classifier_activation - self.use_cache = use_cache - self.global_attn_every_n_layers = global_attn_every_n_layers - # for consistency with ModernBert - self.reference_compile = False - - # Set up layer_types for standardized layer type detection - self.layer_types = layer_types + vocab_size: int = 50368 + hidden_size: int = 768 + intermediate_size: int = 1152 + num_hidden_layers: int = 22 + num_attention_heads: int = 12 + hidden_activation: str = "gelu" + max_position_embeddings: int = 8192 + initializer_range: float = 0.02 + initializer_cutoff_factor: float = 2.0 + norm_eps: float = 1e-5 + norm_bias: bool = False + pad_token_id: int = 50283 + eos_token_id: int | list[int] | None = 50282 + bos_token_id: int = 50281 + cls_token_id: int = 50281 + sep_token_id: int = 50282 + attention_bias: bool = False + attention_dropout: float | int = 0.0 + embedding_dropout: float | int = 0.0 + mlp_bias: bool = False + mlp_dropout: float | int = 0.0 + decoder_bias: bool = True + classifier_dropout: float | int = 0.0 + classifier_bias: bool = False + classifier_activation: str = "gelu" + use_cache: bool = True + local_attention: int | None = 128 + layer_types: list[str] | None = None + tie_word_embeddings: bool = True + rope_parameters: dict[Literal["full_attention", "sliding_attention"], dict] | None = None + + def __post_init__(self, **kwargs): + # BC -> the pattern used to be a simple int, and it's still present in configs on the Hub + global_attn_every_n_layers = kwargs.get("global_attn_every_n_layers", 3) if self.layer_types is None: - # Create layer_types based on the alternating pattern self.layer_types = [] - for layer_id in range(num_hidden_layers): + for layer_id in range(self.num_hidden_layers): if layer_id % global_attn_every_n_layers != 0: self.layer_types.append("sliding_attention") else: self.layer_types.append("full_attention") # NOTE: sliding window numbers matches ModernBERT but is only half of it - self.sliding_window = local_attention // 2 if local_attention else -1 - self.rope_parameters = rope_parameters - super().__init__(**kwargs) + self.sliding_window = self.local_attention // 2 if self.local_attention else -1 + super().__post_init__(**kwargs) - def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation=None, **kwargs): + def 
convert_rope_params_to_dict(self, **kwargs): rope_scaling = kwargs.pop("rope_scaling", None) # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` @@ -200,7 +166,6 @@ def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation=None, **kwa # Standardize and validate the correctness of rotary position embeddings parameters self.standardize_rope_params() - self.validate_rope(ignore_keys=ignore_keys_at_rope_validation) return kwargs @@ -222,7 +187,7 @@ def eager_attention_forward( key: torch.Tensor, value: torch.Tensor, attention_mask: torch.Tensor | None, - dropout: float = 0.0, + dropout: float | int = 0.0, scaling: float | None = None, sliding_window: int | None = None, **kwargs, diff --git a/src/transformers/models/modernvbert/configuration_modernvbert.py b/src/transformers/models/modernvbert/configuration_modernvbert.py index a64f0ac75ce4..c78b630e3347 100755 --- a/src/transformers/models/modernvbert/configuration_modernvbert.py +++ b/src/transformers/models/modernvbert/configuration_modernvbert.py @@ -18,15 +18,18 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Any, Literal +from typing import Literal -from ...configuration_utils import PretrainedConfig +from huggingface_hub.dataclasses import strict + +from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring from ..auto import CONFIG_MAPPING, AutoConfig @auto_docstring(checkpoint="ModernVBERT/modernvbert") -class ModernVBertConfig(PretrainedConfig): +@strict(accept_kwargs=True) +class ModernVBertConfig(PreTrainedConfig): r""" pixel_shuffle_factor (`int | None`, *optional*, defaults to 4): Scale factor used by any pixel-shuffle / upsampling operations in the vision head. initializer_cutoff_factor (`float | None`, *optional*, defaults to 2.0): The cutoff factor for the truncated_normal_initializer for initializing all weight matrices. @@ -51,46 +54,30 @@ class ModernVBertConfig(PretrainedConfig): ```""" model_type = "modernvbert" - sub_configs: dict[str, Any] = {"text_config": AutoConfig, "vision_config": AutoConfig} - - def __init__( - self, - text_config=None, - vision_config=None, - image_token_id: int | None = 50407, - pixel_shuffle_factor: int | None = 4, - initializer_range: float | None = 0.02, - initializer_cutoff_factor: float | None = 2.0, - classifier_pooling: Literal["cls", "mean"] = "cls", - classifier_dropout: float | None = 0.0, - classifier_bias: bool | None = False, - **kwargs, - ): - if classifier_pooling not in ["cls", "mean"]: - raise ValueError( - f'Invalid value for `classifier_pooling`, should be either "cls" or "mean", but is {classifier_pooling}.' 
- ) - - if text_config is None: - text_config = CONFIG_MAPPING["modernbert"]() - elif isinstance(text_config, dict): - text_config = CONFIG_MAPPING["modernbert"](**text_config) - self.text_config = text_config - - if vision_config is None: - vision_config = CONFIG_MAPPING["siglip_vision_model"]() - elif isinstance(vision_config, dict): - vision_config = CONFIG_MAPPING["siglip_vision_model"](**vision_config) - self.vision_config = vision_config - - self.pixel_shuffle_factor = pixel_shuffle_factor - self.initializer_range = initializer_range - self.initializer_cutoff_factor = initializer_cutoff_factor - self.classifier_pooling = classifier_pooling - self.classifier_dropout = classifier_dropout - self.classifier_bias = classifier_bias - - super().__init__(image_token_id=image_token_id, **kwargs) + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} + + text_config: PreTrainedConfig | dict | None = None + vision_config: PreTrainedConfig | dict | None = None + image_token_id: int = 50407 + pixel_shuffle_factor: int = 4 + initializer_range: float = 0.02 + initializer_cutoff_factor: float = 2.0 + classifier_pooling: Literal["cls", "mean"] = "cls" + classifier_dropout: float = 0.0 + classifier_bias: bool = False + + def __post_init__(self, **kwargs): + if self.text_config is None: + self.text_config = CONFIG_MAPPING["modernbert"]() + elif isinstance(self.text_config, dict): + self.text_config = CONFIG_MAPPING["modernbert"](**self.text_config) + + if self.vision_config is None: + self.vision_config = CONFIG_MAPPING["siglip_vision_model"]() + elif isinstance(self.vision_config, dict): + self.vision_config = CONFIG_MAPPING["siglip_vision_model"](**self.vision_config) + + super().__post_init__(**kwargs) __all__ = ["ModernVBertConfig"] diff --git a/src/transformers/models/modernvbert/modular_modernvbert.py b/src/transformers/models/modernvbert/modular_modernvbert.py index 54344c75eeca..b421a82619f7 100755 --- a/src/transformers/models/modernvbert/modular_modernvbert.py +++ b/src/transformers/models/modernvbert/modular_modernvbert.py @@ -14,14 +14,15 @@ import math from dataclasses import dataclass -from typing import Any, Literal +from typing import Literal import torch import torch.nn as nn +from huggingface_hub.dataclasses import strict from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ... import initialization as init -from ...configuration_utils import PretrainedConfig +from ...configuration_utils import PreTrainedConfig from ...modeling_outputs import ( BaseModelOutput, MaskedLMOutput, @@ -41,7 +42,8 @@ @auto_docstring(checkpoint="ModernVBERT/modernvbert") -class ModernVBertConfig(PretrainedConfig): +@strict(accept_kwargs=True) +class ModernVBertConfig(PreTrainedConfig): r""" pixel_shuffle_factor (`int | None`, *optional*, defaults to 4): Scale factor used by any pixel-shuffle / upsampling operations in the vision head. initializer_cutoff_factor (`float | None`, *optional*, defaults to 2.0): The cutoff factor for the truncated_normal_initializer for initializing all weight matrices. 
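
The ModernVBert conversion above follows the same recipe as the ModernBERT hunks earlier in this patch: the hand-written `__init__` becomes class-level dataclass fields, `@strict(accept_kwargs=True)` takes over extra-keyword handling, and sub-config resolution moves into `__post_init__`. As a rough usage sketch (not part of the patch; the field values are illustrative and the import path follows the configuration file shown just above), the converted config is expected to behave like this:

from transformers.models.modernvbert.configuration_modernvbert import ModernVBertConfig

# Dict-valued sub-configs are resolved in __post_init__ via CONFIG_MAPPING,
# so a plain dict still ends up as a ModernBertConfig instance.
config = ModernVBertConfig(
    text_config={"hidden_size": 384, "num_hidden_layers": 6},
    classifier_pooling="mean",
)
print(type(config.text_config).__name__)    # ModernBertConfig
print(type(config.vision_config).__name__)  # SiglipVisionConfig (default when None)

Note that the explicit `classifier_pooling` check that used to raise a `ValueError` is gone; under `@strict` the `Literal["cls", "mean"]` annotation is expected to carry that validation instead.
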
@@ -66,46 +68,30 @@ class ModernVBertConfig(PretrainedConfig): ```""" model_type = "modernvbert" - sub_configs: dict[str, Any] = {"text_config": AutoConfig, "vision_config": AutoConfig} - - def __init__( - self, - text_config=None, - vision_config=None, - image_token_id: int | None = 50407, - pixel_shuffle_factor: int | None = 4, - initializer_range: float | None = 0.02, - initializer_cutoff_factor: float | None = 2.0, - classifier_pooling: Literal["cls", "mean"] = "cls", - classifier_dropout: float | None = 0.0, - classifier_bias: bool | None = False, - **kwargs, - ): - if classifier_pooling not in ["cls", "mean"]: - raise ValueError( - f'Invalid value for `classifier_pooling`, should be either "cls" or "mean", but is {classifier_pooling}.' - ) - - if text_config is None: - text_config = CONFIG_MAPPING["modernbert"]() - elif isinstance(text_config, dict): - text_config = CONFIG_MAPPING["modernbert"](**text_config) - self.text_config = text_config - - if vision_config is None: - vision_config = CONFIG_MAPPING["siglip_vision_model"]() - elif isinstance(vision_config, dict): - vision_config = CONFIG_MAPPING["siglip_vision_model"](**vision_config) - self.vision_config = vision_config - - self.pixel_shuffle_factor = pixel_shuffle_factor - self.initializer_range = initializer_range - self.initializer_cutoff_factor = initializer_cutoff_factor - self.classifier_pooling = classifier_pooling - self.classifier_dropout = classifier_dropout - self.classifier_bias = classifier_bias - - super().__init__(image_token_id=image_token_id, **kwargs) + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} + + text_config: PreTrainedConfig | dict | None = None + vision_config: PreTrainedConfig | dict | None = None + image_token_id: int = 50407 + pixel_shuffle_factor: int = 4 + initializer_range: float = 0.02 + initializer_cutoff_factor: float = 2.0 + classifier_pooling: Literal["cls", "mean"] = "cls" + classifier_dropout: float = 0.0 + classifier_bias: bool = False + + def __post_init__(self, **kwargs): + if self.text_config is None: + self.text_config = CONFIG_MAPPING["modernbert"]() + elif isinstance(self.text_config, dict): + self.text_config = CONFIG_MAPPING["modernbert"](**self.text_config) + + if self.vision_config is None: + self.vision_config = CONFIG_MAPPING["siglip_vision_model"]() + elif isinstance(self.vision_config, dict): + self.vision_config = CONFIG_MAPPING["siglip_vision_model"](**self.vision_config) + + super().__post_init__(**kwargs) @dataclass diff --git a/src/transformers/models/moonshine/configuration_moonshine.py b/src/transformers/models/moonshine/configuration_moonshine.py index e88e36156557..07e4f441494c 100644 --- a/src/transformers/models/moonshine/configuration_moonshine.py +++ b/src/transformers/models/moonshine/configuration_moonshine.py @@ -18,12 +18,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring @auto_docstring(checkpoint="UsefulSensors/moonshine-tiny") +@strict(accept_kwargs=True) class MoonshineConfig(PreTrainedConfig): r""" encoder_hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): @@ -74,70 +77,40 @@ class MoonshineConfig(PreTrainedConfig): "hidden_act": "decoder_hidden_act", } - def __init__( - self, - vocab_size: int | None = 32768, - hidden_size: int | None = 288, - intermediate_size: int | None = 1152, - encoder_num_hidden_layers: int | None = 6, - decoder_num_hidden_layers: int | None = 6, - encoder_num_attention_heads: int | None = 8, - decoder_num_attention_heads: int | None = 8, - encoder_num_key_value_heads: int | None = None, - decoder_num_key_value_heads: int | None = None, - pad_head_dim_to_multiple_of: int | None = None, - encoder_hidden_act: str | None = "gelu", - decoder_hidden_act: str | None = "silu", - max_position_embeddings: int | None = 512, - initializer_range: float | None = 0.02, - decoder_start_token_id: int | None = 1, - use_cache: bool | None = True, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - is_encoder_decoder: bool | None = True, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - bos_token_id: int | None = 1, - eos_token_id: int | None = 2, - pad_token_id: int | None = None, - tie_word_embeddings: bool | None = True, - **kwargs, - ): - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.encoder_num_hidden_layers = encoder_num_hidden_layers - self.decoder_num_hidden_layers = decoder_num_hidden_layers - self.encoder_num_attention_heads = encoder_num_attention_heads - self.decoder_num_attention_heads = decoder_num_attention_heads - - if encoder_num_key_value_heads is None: - encoder_num_key_value_heads = encoder_num_attention_heads - self.encoder_num_key_value_heads = encoder_num_key_value_heads - - if decoder_num_key_value_heads is None: - decoder_num_key_value_heads = decoder_num_attention_heads - self.decoder_num_key_value_heads = decoder_num_key_value_heads - - self.pad_head_dim_to_multiple_of = pad_head_dim_to_multiple_of - - self.encoder_hidden_act = encoder_hidden_act - self.decoder_hidden_act = decoder_hidden_act - self.max_position_embeddings = max_position_embeddings - self.initializer_range = initializer_range - self.decoder_start_token_id = decoder_start_token_id - self.use_cache = use_cache - self.is_encoder_decoder = is_encoder_decoder - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.decoder_start_token_id = decoder_start_token_id - self.tie_word_embeddings = tie_word_embeddings - self.rope_parameters = rope_parameters - kwargs.setdefault("partial_rotary_factor", 0.9) # assign default for BC + vocab_size: int = 32768 + hidden_size: int = 288 + intermediate_size: int = 1152 + encoder_num_hidden_layers: int = 6 + decoder_num_hidden_layers: int = 6 + encoder_num_attention_heads: int = 8 + decoder_num_attention_heads: int = 8 + encoder_num_key_value_heads: int | None = None + decoder_num_key_value_heads: int | None = None + pad_head_dim_to_multiple_of: int | None = None + encoder_hidden_act: str = "gelu" + decoder_hidden_act: str = "silu" + max_position_embeddings: int 
= 512 + initializer_range: float = 0.02 + decoder_start_token_id: int = 1 + use_cache: bool = True + rope_parameters: RopeParameters | dict | None = None + is_encoder_decoder: bool = True + attention_bias: bool = False + attention_dropout: float | int = 0.0 + bos_token_id: int | None = 1 + eos_token_id: int | list[int] | None = 2 + pad_token_id: int | None = None + tie_word_embeddings: bool = True + + def __post_init__(self, **kwargs): + if self.encoder_num_key_value_heads is None: + self.encoder_num_key_value_heads = self.encoder_num_attention_heads + + if self.decoder_num_key_value_heads is None: + self.decoder_num_key_value_heads = self.decoder_num_attention_heads - super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + kwargs.setdefault("partial_rotary_factor", 0.9) # assign default for BC + super().__post_init__(**kwargs) __all__ = ["MoonshineConfig"] diff --git a/src/transformers/models/moonshine/modular_moonshine.py b/src/transformers/models/moonshine/modular_moonshine.py index 4e0f7186c43c..0165a1fe769f 100644 --- a/src/transformers/models/moonshine/modular_moonshine.py +++ b/src/transformers/models/moonshine/modular_moonshine.py @@ -17,6 +17,7 @@ import torch import torch.nn as nn +from huggingface_hub.dataclasses import strict from ...activations import ACT2FN from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache @@ -47,6 +48,7 @@ @auto_docstring(checkpoint="UsefulSensors/moonshine-tiny") +@strict(accept_kwargs=True) class MoonshineConfig(PreTrainedConfig): r""" encoder_hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): @@ -97,70 +99,40 @@ class MoonshineConfig(PreTrainedConfig): "hidden_act": "decoder_hidden_act", } - def __init__( - self, - vocab_size: int | None = 32768, - hidden_size: int | None = 288, - intermediate_size: int | None = 1152, - encoder_num_hidden_layers: int | None = 6, - decoder_num_hidden_layers: int | None = 6, - encoder_num_attention_heads: int | None = 8, - decoder_num_attention_heads: int | None = 8, - encoder_num_key_value_heads: int | None = None, - decoder_num_key_value_heads: int | None = None, - pad_head_dim_to_multiple_of: int | None = None, - encoder_hidden_act: str | None = "gelu", - decoder_hidden_act: str | None = "silu", - max_position_embeddings: int | None = 512, - initializer_range: float | None = 0.02, - decoder_start_token_id: int | None = 1, - use_cache: bool | None = True, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - is_encoder_decoder: bool | None = True, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - bos_token_id: int | None = 1, - eos_token_id: int | None = 2, - pad_token_id: int | None = None, - tie_word_embeddings: bool | None = True, - **kwargs, - ): - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.encoder_num_hidden_layers = encoder_num_hidden_layers - self.decoder_num_hidden_layers = decoder_num_hidden_layers - self.encoder_num_attention_heads = encoder_num_attention_heads - self.decoder_num_attention_heads = decoder_num_attention_heads - - if encoder_num_key_value_heads is None: - encoder_num_key_value_heads = encoder_num_attention_heads - self.encoder_num_key_value_heads = encoder_num_key_value_heads - - if decoder_num_key_value_heads is None: - decoder_num_key_value_heads = decoder_num_attention_heads - self.decoder_num_key_value_heads = decoder_num_key_value_heads - - self.pad_head_dim_to_multiple_of = pad_head_dim_to_multiple_of - - 
self.encoder_hidden_act = encoder_hidden_act - self.decoder_hidden_act = decoder_hidden_act - self.max_position_embeddings = max_position_embeddings - self.initializer_range = initializer_range - self.decoder_start_token_id = decoder_start_token_id - self.use_cache = use_cache - self.is_encoder_decoder = is_encoder_decoder - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.decoder_start_token_id = decoder_start_token_id - self.tie_word_embeddings = tie_word_embeddings - self.rope_parameters = rope_parameters - kwargs.setdefault("partial_rotary_factor", 0.9) # assign default for BC + vocab_size: int = 32768 + hidden_size: int = 288 + intermediate_size: int = 1152 + encoder_num_hidden_layers: int = 6 + decoder_num_hidden_layers: int = 6 + encoder_num_attention_heads: int = 8 + decoder_num_attention_heads: int = 8 + encoder_num_key_value_heads: int | None = None + decoder_num_key_value_heads: int | None = None + pad_head_dim_to_multiple_of: int | None = None + encoder_hidden_act: str = "gelu" + decoder_hidden_act: str = "silu" + max_position_embeddings: int = 512 + initializer_range: float = 0.02 + decoder_start_token_id: int = 1 + use_cache: bool = True + rope_parameters: RopeParameters | dict | None = None + is_encoder_decoder: bool = True + attention_bias: bool = False + attention_dropout: float | int = 0.0 + bos_token_id: int | None = 1 + eos_token_id: int | list[int] | None = 2 + pad_token_id: int | None = None + tie_word_embeddings: bool = True + + def __post_init__(self, **kwargs): + if self.encoder_num_key_value_heads is None: + self.encoder_num_key_value_heads = self.encoder_num_attention_heads + + if self.decoder_num_key_value_heads is None: + self.decoder_num_key_value_heads = self.decoder_num_attention_heads - super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + kwargs.setdefault("partial_rotary_factor", 0.9) # assign default for BC + super().__post_init__(**kwargs) @dataclass diff --git a/src/transformers/models/moonshine_streaming/configuration_moonshine_streaming.py b/src/transformers/models/moonshine_streaming/configuration_moonshine_streaming.py index 24f61ddd129b..4df02f3d0189 100644 --- a/src/transformers/models/moonshine_streaming/configuration_moonshine_streaming.py +++ b/src/transformers/models/moonshine_streaming/configuration_moonshine_streaming.py @@ -13,6 +13,8 @@ # limitations under the License. 
+from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring @@ -20,6 +22,7 @@ @auto_docstring(checkpoint="UsefulSensors/moonshine-streaming-tiny") +@strict(accept_kwargs=True) class MoonshineStreamingEncoderConfig(PreTrainedConfig): r""" sample_rate (`int`, *optional*, defaults to 16000): @@ -45,41 +48,36 @@ class MoonshineStreamingEncoderConfig(PreTrainedConfig): model_type = "moonshine_streaming_encoder" - def __init__( - self, - hidden_size: int | None = 320, - intermediate_size: int | None = 1280, - hidden_act: str | None = "gelu", - num_hidden_layers: int | None = 6, - num_attention_heads: int | None = 8, - num_key_value_heads: int | None = 8, - max_position_embeddings: int | None = 4096, - attention_dropout: float | None = 0.0, - attention_bias: bool | None = False, - sample_rate: int = 16000, - frame_ms: float = 5.0, - sliding_windows: list[tuple[int, int]] = [(16, 4), (16, 4), (16, 0), (16, 0), (16, 4), (16, 4)], - head_dim: int | None = None, - **kwargs, - ): - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads - self.max_position_embeddings = max_position_embeddings - self.attention_dropout = attention_dropout - self.attention_bias = attention_bias - self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads - self.sample_rate = sample_rate - self.frame_ms = frame_ms - self.sliding_windows = [list(window) for window in sliding_windows] - - super().__init__(**kwargs) + hidden_size: int = 320 + intermediate_size: int = 1280 + hidden_act: str = "gelu" + num_hidden_layers: int = 6 + num_attention_heads: int = 8 + num_key_value_heads: int = 8 + max_position_embeddings: int = 4096 + attention_dropout: float | int = 0.0 + attention_bias: bool = False + sample_rate: int = 16000 + frame_ms: float = 5.0 + sliding_windows: tuple[tuple[int, int], ...] 
| list[list[int, int]] = ( + (16, 4), + (16, 4), + (16, 0), + (16, 0), + (16, 4), + (16, 4), + ) + head_dim: int | None = None + + def __post_init__(self, **kwargs): + self.head_dim = self.head_dim if self.head_dim is not None else self.hidden_size // self.num_attention_heads + self.sliding_windows = [list(window) for window in self.sliding_windows] + + super().__post_init__(**kwargs) @auto_docstring(checkpoint="UsefulSensors/moonshine-streaming-tiny") +@strict(accept_kwargs=True) class MoonshineStreamingConfig(PreTrainedConfig): r""" pad_head_dim_to_multiple_of (`int`, *optional*): @@ -102,66 +100,42 @@ class MoonshineStreamingConfig(PreTrainedConfig): sub_configs = {"encoder_config": MoonshineStreamingEncoderConfig} keys_to_ignore_at_inference = ["past_key_values"] - def __init__( - self, - encoder_config: MoonshineStreamingEncoderConfig = None, - vocab_size: int = 32768, - hidden_size: int | None = 320, - intermediate_size: int | None = 1280, - num_hidden_layers: int | None = 6, - num_attention_heads: int | None = 8, - hidden_act: str | None = "silu", - max_position_embeddings: int = 4096, - use_cache: bool | None = True, - pad_token_id: int = 0, - bos_token_id: int = 1, - eos_token_id: int = 2, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = { - "rope_type": "default", - "rope_theta": 10000.0, - "partial_rotary_factor": 0.8, - }, - attention_bias: bool = False, - attention_dropout: float = 0.0, - decoder_start_token_id: int | None = None, - head_dim: int | None = None, - pad_head_dim_to_multiple_of: int | None = None, - tie_word_embeddings: bool = False, - is_encoder_decoder: bool = True, - **kwargs, - ): - if isinstance(encoder_config, dict): - encoder_config["model_type"] = encoder_config.get("model_type", "moonshine_streaming_encoder") - encoder_config = CONFIG_MAPPING[encoder_config["model_type"]](**encoder_config) - elif encoder_config is None: - encoder_config = CONFIG_MAPPING["moonshine_streaming_encoder"]() - - self.encoder_config = encoder_config - - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_key_value_heads = num_attention_heads - self.hidden_act = hidden_act - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads - self.rope_parameters = rope_parameters - self.pad_head_dim_to_multiple_of = pad_head_dim_to_multiple_of - - super().__init__( - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - pad_token_id=pad_token_id, - decoder_start_token_id=decoder_start_token_id, - tie_word_embeddings=tie_word_embeddings, - is_encoder_decoder=is_encoder_decoder, - **kwargs, - ) + encoder_config: dict | MoonshineStreamingEncoderConfig | None = None + vocab_size: int = 32768 + hidden_size: int = 320 + intermediate_size: int = 1280 + num_hidden_layers: int = 6 + num_attention_heads: int = 8 + hidden_act: str = "silu" + max_position_embeddings: int = 4096 + use_cache: bool = True + pad_token_id: int | None = 0 + bos_token_id: int | None = 1 + eos_token_id: int | list[int] | None = 2 + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + attention_dropout: float | int = 0.0 + decoder_start_token_id: int | None = None + head_dim: int | None = None + 
pad_head_dim_to_multiple_of: int | None = None + tie_word_embeddings: bool = False + is_encoder_decoder: bool = True + + def __post_init__(self, **kwargs): + if isinstance(self.encoder_config, dict): + self.encoder_config["model_type"] = self.encoder_config.get("model_type", "moonshine_streaming_encoder") + self.encoder_config = CONFIG_MAPPING[self.encoder_config["model_type"]](**self.encoder_config) + elif self.encoder_config is None: + self.encoder_config = CONFIG_MAPPING["moonshine_streaming_encoder"]() + + if self.rope_parameters is None: + self.rope_parameters = { + "rope_type": "default", + "rope_theta": 10000.0, + "partial_rotary_factor": 0.8, + } + self.head_dim = self.head_dim if self.head_dim is not None else self.hidden_size // self.num_attention_heads + super().__post_init__(**kwargs) __all__ = ["MoonshineStreamingConfig", "MoonshineStreamingEncoderConfig"] diff --git a/src/transformers/models/moshi/configuration_moshi.py b/src/transformers/models/moshi/configuration_moshi.py index 1b21494f58a3..9151abb0eb8f 100644 --- a/src/transformers/models/moshi/configuration_moshi.py +++ b/src/transformers/models/moshi/configuration_moshi.py @@ -13,16 +13,16 @@ # limitations under the License. """Moshi model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters -from ...utils import auto_docstring, logging +from ...utils import auto_docstring from ..auto.configuration_auto import AutoConfig -logger = logging.get_logger(__name__) - - @auto_docstring(checkpoint="kmhf/hf-moshiko") +@strict(accept_kwargs=True) class MoshiDepthConfig(PreTrainedConfig): r""" input_size (`int`, *optional*, defaults to 4096): @@ -53,61 +53,49 @@ class MoshiDepthConfig(PreTrainedConfig): model_type = "moshi_depth" keys_to_ignore_at_inference = ["past_key_values"] - def __init__( - self, - vocab_size=32000, - hidden_size=1024, - input_size=4096, - num_hidden_layers=6, - num_attention_heads=16, - num_key_value_heads=None, - audio_vocab_size=2048, - max_position_embeddings=9, - hidden_act="silu", - head_dim=None, - initializer_range=0.02, - use_cache=True, - sliding_window=8, - attention_dropout=0.0, - ffn_dim=5632, - rms_norm_eps=1e-8, - num_codebooks=8, - tie_word_embeddings=False, - pad_token_id=None, - bos_token_id=None, - eos_token_id=None, - **kwargs, - ): - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.input_size = input_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads if num_key_value_heads is not None else num_attention_heads - self.max_position_embeddings = max_position_embeddings - self.hidden_act = hidden_act - self.head_dim = head_dim or hidden_size // num_attention_heads - self.initializer_range = initializer_range - self.use_cache = use_cache - self.sliding_window = sliding_window - self.attention_dropout = attention_dropout - if ffn_dim % 2 == 1: - raise ValueError(f"`ffn_dim={ffn_dim}` must be even.") - self.ffn_dim = ffn_dim - self.rms_norm_eps = rms_norm_eps - self.num_codebooks = num_codebooks - self.audio_vocab_size = audio_vocab_size - - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - super().__init__(**kwargs) + vocab_size: int = 32000 + hidden_size: int = 1024 + input_size: int = 4096 + num_hidden_layers: int = 6 + num_attention_heads: int = 16 + 
num_key_value_heads: int | None = None + audio_vocab_size: int = 2048 + max_position_embeddings: int = 9 + hidden_act: str = "silu" + head_dim: int | None = None + initializer_range: float = 0.02 + use_cache: bool = True + sliding_window: int = 8 + attention_dropout: float | int = 0.0 + ffn_dim: int = 5632 + rms_norm_eps: float = 1e-8 + num_codebooks: int = 8 + tie_word_embeddings: bool = False + pad_token_id: int | None = None + bos_token_id: int | None = None + eos_token_id: int | list[int] | None = None + + def __post_init__(self, **kwargs): + self.num_key_value_heads = ( + self.num_key_value_heads if self.num_key_value_heads is not None else self.num_attention_heads + ) + self.head_dim = self.head_dim or self.hidden_size // self.num_attention_heads + super().__post_init__(**kwargs) + + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + if self.ffn_dim % 2 == 1: + raise ValueError(f"`ffn_dim={self.ffn_dim}` must be even.") @auto_docstring(checkpoint="kmhf/hf-moshiko") +@strict(accept_kwargs=True) class MoshiConfig(PreTrainedConfig): """ + depth_decoder_config (`PreTrainedConfig | dict`, *optional*): + Configuration for the depth decoder. + audio_encoder_config (`PreTrainedConfig | dict`, *optional*): + Configuration for the audio encoder. audio_vocab_size (`int`, *optional*): Vocabulary size of the audio part of model. Defines the number of different tokens that can be represented by the `audio_codes` passed when calling the Moshi models. @@ -142,81 +130,69 @@ class MoshiConfig(PreTrainedConfig): keys_to_ignore_at_inference = ["past_key_values"] sub_configs = {"audio_encoder_config": AutoConfig, "depth_decoder_config": MoshiDepthConfig} - def __init__( - self, - vocab_size: int | None = 32000, - hidden_size: int | None = 4096, - num_hidden_layers: int | None = 32, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = None, - audio_vocab_size: int | None = None, - max_position_embeddings: int | None = 3000, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - hidden_act: str | None = "silu", - head_dim: int | None = None, - initializer_range: float | None = 0.02, - use_cache: bool | None = True, - sliding_window: int | None = 3000, - attention_dropout: float | None = 0.0, - ffn_dim: int | None = 22528, - rms_norm_eps: int | None = 1e-8, - num_codebooks: int | None = 8, - tie_word_embeddings: bool | None = False, - pad_token_id: int | None = None, - bos_token_id: int | None = None, - eos_token_id: int | None = None, - **kwargs, - ): - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads if num_key_value_heads is not None else num_attention_heads - self.max_position_embeddings = max_position_embeddings - self.hidden_act = hidden_act - self.head_dim = head_dim or hidden_size // num_attention_heads - self.initializer_range = initializer_range - self.use_cache = use_cache - self.sliding_window = sliding_window - self.attention_dropout = attention_dropout - if ffn_dim % 2 == 1: - raise ValueError(f"`ffn_dim={ffn_dim}` must be even.") - self.ffn_dim = ffn_dim - self.rms_norm_eps = rms_norm_eps - self.num_codebooks = num_codebooks - self.rope_parameters = rope_parameters - - audio_encoder_config = kwargs.pop("audio_encoder_config", {}) - audio_encoder_model_type = audio_encoder_config.pop("model_type", "mimi") - - 
self.audio_encoder_config = AutoConfig.for_model(audio_encoder_model_type, **audio_encoder_config) + vocab_size: int = 32000 + hidden_size: int = 4096 + num_hidden_layers: int = 32 + num_attention_heads: int = 32 + num_key_value_heads: int | None = None + audio_vocab_size: int | None = None + max_position_embeddings: int = 3000 + rope_parameters: RopeParameters | dict | None = None + hidden_act: str = "silu" + head_dim: int | None = None + initializer_range: float = 0.02 + use_cache: bool = True + sliding_window: int = 3000 + attention_dropout: float | int = 0.0 + ffn_dim: int = 22528 + rms_norm_eps: float = 1e-8 + num_codebooks: int = 8 + tie_word_embeddings: bool = False + pad_token_id: int | None = None + bos_token_id: int | None = None + eos_token_id: int | list[int] | None = None + audio_encoder_config: dict | PreTrainedConfig | None = None + depth_decoder_config: dict | PreTrainedConfig | None = None + + def __post_init__(self, **kwargs): + self.num_key_value_heads = ( + self.num_key_value_heads if self.num_key_value_heads is not None else self.num_attention_heads + ) + self.head_dim = self.head_dim or self.hidden_size // self.num_attention_heads - if self.num_codebooks > self.audio_encoder_config.num_codebooks: - raise ValueError( - f"`num_codebooks={num_codebooks}` is greater than the maximum number of codebooks that the audio encoder can deal with ({self.audio_encoder_config.num_codebooks}). Please lower it." - ) + if isinstance(self.audio_encoder_config, dict): + audio_encoder_model_type = self.audio_encoder_config.pop("model_type", "mimi") + self.audio_encoder_config = AutoConfig.for_model(audio_encoder_model_type, **self.audio_encoder_config) + elif self.audio_encoder_config is None: + self.audio_encoder_config = AutoConfig.for_model("mimi") self.audio_vocab_size = ( - self.audio_encoder_config.codebook_size if audio_vocab_size is None else audio_vocab_size + self.audio_encoder_config.codebook_size if self.audio_vocab_size is None else self.audio_vocab_size ) - depth_decoder_config = kwargs.pop("depth_decoder_config", {}) - depth_decoder_config.update( - { - "audio_vocab_size": self.audio_vocab_size, - "input_size": hidden_size, - "vocab_size": vocab_size, - "num_codebooks": num_codebooks, - } - ) + if isinstance(self.depth_decoder_config, dict): + self.depth_decoder_config.update( + { + "audio_vocab_size": self.audio_vocab_size, + "input_size": self.hidden_size, + "vocab_size": self.vocab_size, + "num_codebooks": self.num_codebooks, + } + ) + self.depth_decoder_config = MoshiDepthConfig(**self.depth_decoder_config) + elif self.depth_decoder_config is None: + self.depth_decoder_config = MoshiDepthConfig() + super().__post_init__(**kwargs) - self.depth_decoder_config = MoshiDepthConfig(**depth_decoder_config) + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + if self.ffn_dim % 2 == 1: + raise ValueError(f"`ffn_dim={self.ffn_dim}` must be even.") - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - super().__init__(**kwargs) + if self.num_codebooks > self.audio_encoder_config.num_codebooks: + raise ValueError( + f"`num_codebooks={self.num_codebooks}` is greater than the maximum number of codebooks that the audio encoder can deal with ({self.audio_encoder_config.num_codebooks}). Please lower it." 
+ ) @property def sampling_rate(self): diff --git a/src/transformers/models/moshi/modeling_moshi.py b/src/transformers/models/moshi/modeling_moshi.py index f1262f0e502e..71ab65b40fe9 100644 --- a/src/transformers/models/moshi/modeling_moshi.py +++ b/src/transformers/models/moshi/modeling_moshi.py @@ -932,7 +932,7 @@ def forward( ) use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if self.gradient_checkpointing and self.training and use_cache: logger.warning_once( @@ -1072,7 +1072,7 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if self.gradient_checkpointing and self.training and use_cache: logger.warning_once( @@ -1211,7 +1211,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) outputs = self.model( @@ -1354,7 +1354,7 @@ def forward( >>> logits.shape # (bsz, seq_len, text_vocab_size) torch.Size([1, 1, 32000]) ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict kwargs_audio_encoder = { argument[len("audio_encoder_")]: value diff --git a/src/transformers/models/mpnet/configuration_mpnet.py b/src/transformers/models/mpnet/configuration_mpnet.py index e2762395dc47..c4c34d61837e 100644 --- a/src/transformers/models/mpnet/configuration_mpnet.py +++ b/src/transformers/models/mpnet/configuration_mpnet.py @@ -14,14 +14,14 @@ # limitations under the License. 
"""MPNet model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="microsoft/mpnet-base") +@strict(accept_kwargs=True) class MPNetConfig(PreTrainedConfig): r""" relative_attention_num_buckets (`int`, *optional*, defaults to 32): @@ -44,44 +44,22 @@ class MPNetConfig(PreTrainedConfig): model_type = "mpnet" - def __init__( - self, - vocab_size=30527, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - initializer_range=0.02, - layer_norm_eps=1e-12, - relative_attention_num_buckets=32, - pad_token_id=1, - bos_token_id=0, - eos_token_id=2, - tie_word_embeddings=True, - **kwargs, - ): - super().__init__(**kwargs) - - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.relative_attention_num_buckets = relative_attention_num_buckets + vocab_size: int = 30527 + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.1 + attention_probs_dropout_prob: float = 0.1 + max_position_embeddings: int = 512 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + relative_attention_num_buckets: int = 32 + pad_token_id: int | None = 1 + bos_token_id: int | None = 0 + eos_token_id: int | None = 2 + tie_word_embeddings: bool = True __all__ = ["MPNetConfig"] diff --git a/src/transformers/models/mpnet/modeling_mpnet.py b/src/transformers/models/mpnet/modeling_mpnet.py index a0b871192945..25dc58bb99fa 100644 --- a/src/transformers/models/mpnet/modeling_mpnet.py +++ b/src/transformers/models/mpnet/modeling_mpnet.py @@ -415,7 +415,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -496,7 +496,7 @@ def forward( config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.mpnet( input_ids, @@ -587,7 +587,7 @@ def forward( `config.num_labels > 1` a classification 
loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.mpnet( input_ids, @@ -683,7 +683,7 @@ def forward( `input_ids` above) """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None @@ -758,7 +758,7 @@ def forward( Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.mpnet( input_ids, @@ -837,7 +837,7 @@ def forward( return_dict: bool | None = None, **kwargs, ) -> tuple[torch.Tensor] | QuestionAnsweringModelOutput: - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.mpnet( input_ids, diff --git a/src/transformers/models/mpt/configuration_mpt.py b/src/transformers/models/mpt/configuration_mpt.py index 1aff8a32b0be..ce75f0b77e20 100644 --- a/src/transformers/models/mpt/configuration_mpt.py +++ b/src/transformers/models/mpt/configuration_mpt.py @@ -13,14 +13,16 @@ # limitations under the License. """Mpt configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging +from typing import Literal +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="mosaicml/mpt-7b") +@strict(accept_kwargs=True) class MptAttentionConfig(PreTrainedConfig): """ attn_type (`str`, *optional*, defaults to `"multihead_attention"`): @@ -52,39 +54,20 @@ class MptAttentionConfig(PreTrainedConfig): base_config_key = "attn_config" - def __init__( - self, - attn_type="multihead_attention", - attn_pdrop=0, - attn_impl="torch", - clip_qkv=None, - softmax_scale=None, - prefix_lm=False, - qk_ln=False, - attn_uses_sequence_id=False, - alibi=True, - alibi_bias_max=8, - **kwargs, - ): - super().__init__() - self.attn_type = attn_type - self.attn_pdrop = attn_pdrop - self.attn_impl = attn_impl - self.clip_qkv = clip_qkv - self.softmax_scale = softmax_scale - self.prefix_lm = prefix_lm - self.attn_uses_sequence_id = attn_uses_sequence_id - self.alibi = alibi - self.qk_ln = qk_ln - self.alibi_bias_max = alibi_bias_max - - if attn_type not in ["multihead_attention", "multiquery_attention"]: - raise ValueError( - f"`attn_type` has to be either `multihead_attention` or `multiquery_attention`. 
Received: {attn_type}" - ) + attn_type: Literal["multihead_attention", "multiquery_attention"] = "multihead_attention" + attn_pdrop: int = 0 + attn_impl: str = "torch" + clip_qkv: float | None = None + softmax_scale: float | None = None + prefix_lm: bool = False + qk_ln: bool = False + attn_uses_sequence_id: bool = False + alibi: bool = True + alibi_bias_max: int = 8 @auto_docstring(checkpoint="mosaicml/mpt-7b") +@strict(accept_kwargs=True) class MptConfig(PreTrainedConfig): """ expansion_ratio (`int`, *optional*, defaults to 4): @@ -133,60 +116,35 @@ class MptConfig(PreTrainedConfig): "num_hidden_layers": "n_layers", } - def __init__( - self, - d_model: int = 2048, - n_heads: int = 16, - n_layers: int = 24, - expansion_ratio: int = 4, - max_seq_len: int = 2048, - vocab_size: int = 50368, - resid_pdrop: float = 0.0, - layer_norm_epsilon: float = 1e-5, - emb_pdrop: float = 0.0, - learned_pos_emb: bool = True, - attn_config: MptAttentionConfig = None, - init_device: str = "cpu", - logit_scale: float | str | None = None, - no_bias: bool = True, - embedding_fraction: float = 1.0, - norm_type: str = "low_precision_layernorm", - use_cache: bool = False, - initializer_range=0.02, - tie_word_embeddings=True, - pad_token_id=None, - bos_token_id=None, - eos_token_id=None, - **kwargs, - ): - if attn_config is None: + d_model: int = 2048 + n_heads: int = 16 + n_layers: int = 24 + expansion_ratio: int = 4 + max_seq_len: int = 2048 + vocab_size: int = 50368 + resid_pdrop: float = 0.0 + layer_norm_epsilon: float = 1e-5 + emb_pdrop: float = 0.0 + learned_pos_emb: bool = True + attn_config: dict | MptAttentionConfig | None = None + init_device: str = "cpu" + logit_scale: float | str | None = None + no_bias: bool = True + embedding_fraction: float = 1.0 + norm_type: str = "low_precision_layernorm" + use_cache: bool = False + initializer_range: float = 0.02 + tie_word_embeddings: bool = True + pad_token_id: int | None = None + bos_token_id: int | None = None + eos_token_id: int | list[int] | None = None + + def __post_init__(self, **kwargs): + if self.attn_config is None: self.attn_config = MptAttentionConfig() - elif isinstance(attn_config, dict): - self.attn_config = MptAttentionConfig(**attn_config) - else: - self.attn_config = attn_config - self.d_model = d_model - self.n_heads = n_heads - self.n_layers = n_layers - self.expansion_ratio = expansion_ratio - self.max_seq_len = max_seq_len - self.vocab_size = vocab_size - self.resid_pdrop = resid_pdrop - self.emb_pdrop = emb_pdrop - self.learned_pos_emb = learned_pos_emb - self.init_device = init_device - self.logit_scale = logit_scale - self.no_bias = no_bias - self.embedding_fraction = embedding_fraction - self.norm_type = norm_type - self.layer_norm_epsilon = layer_norm_epsilon - self.use_cache = use_cache - self.initializer_range = initializer_range - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - super().__init__(**kwargs) + elif isinstance(self.attn_config, dict): + self.attn_config = MptAttentionConfig(**self.attn_config) + super().__post_init__(**kwargs) __all__ = ["MptConfig"] diff --git a/src/transformers/models/mpt/modeling_mpt.py b/src/transformers/models/mpt/modeling_mpt.py index 7dfb1ee749ec..592b3e2947ea 100644 --- a/src/transformers/models/mpt/modeling_mpt.py +++ b/src/transformers/models/mpt/modeling_mpt.py @@ -287,7 +287,7 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) 
use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -423,7 +423,7 @@ def forward( `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict transformer_outputs = self.transformer( input_ids, @@ -517,7 +517,7 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict transformer_outputs = self.transformer( input_ids, @@ -641,7 +641,7 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict transformer_outputs = self.transformer( input_ids, @@ -716,7 +716,7 @@ def forward( [What are input IDs?](../glossary#input-ids) """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.transformer( input_ids, diff --git a/src/transformers/models/mra/configuration_mra.py b/src/transformers/models/mra/configuration_mra.py index c5356ab5a0d8..8b444dc9ab1f 100644 --- a/src/transformers/models/mra/configuration_mra.py +++ b/src/transformers/models/mra/configuration_mra.py @@ -13,14 +13,14 @@ # limitations under the License. 
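A minimal sketch of how the converted MPT configs above would be used, assuming the `@strict` dataclass conversion behaves as written in this diff; the argument values are illustrative only.

from transformers.models.mpt.configuration_mpt import MptAttentionConfig, MptConfig

# Field defaults replace the old __init__ signature, so construction is plain keyword arguments.
config = MptConfig(n_heads=8, attn_config={"attn_type": "multiquery_attention"})

# __post_init__ promotes a raw dict to the typed sub-config, and builds a default one when omitted.
assert isinstance(config.attn_config, MptAttentionConfig)
assert config.attn_config.attn_type == "multiquery_attention"
assert isinstance(MptConfig().attn_config, MptAttentionConfig)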
"""MRA model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="uw-madison/mra-base-512-4") +@strict(accept_kwargs=True) class MraConfig(PreTrainedConfig): r""" block_per_row (`int`, *optional*, defaults to 4): @@ -50,54 +50,27 @@ class MraConfig(PreTrainedConfig): model_type = "mra" - def __init__( - self, - vocab_size=50265, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=1, - initializer_range=0.02, - layer_norm_eps=1e-5, - block_per_row=4, - approx_mode="full", - initial_prior_first_n_blocks=0, - initial_prior_diagonal_n_blocks=0, - pad_token_id=1, - bos_token_id=0, - eos_token_id=2, - add_cross_attention=False, - tie_word_embeddings=True, - **kwargs, - ): - super().__init__(**kwargs) - - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - self.add_cross_attention = add_cross_attention - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.initializer_range = initializer_range - self.type_vocab_size = type_vocab_size - self.layer_norm_eps = layer_norm_eps - self.block_per_row = block_per_row - self.approx_mode = approx_mode - self.initial_prior_first_n_blocks = initial_prior_first_n_blocks - self.initial_prior_diagonal_n_blocks = initial_prior_diagonal_n_blocks + vocab_size: int = 50265 + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.1 + attention_probs_dropout_prob: float = 0.1 + max_position_embeddings: int = 512 + type_vocab_size: int = 1 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-5 + block_per_row: int = 4 + approx_mode: str = "full" + initial_prior_first_n_blocks: int = 0 + initial_prior_diagonal_n_blocks: int = 0 + pad_token_id: int | None = 1 + bos_token_id: int | None = 0 + eos_token_id: int | None = 2 + add_cross_attention: bool = False + tie_word_embeddings: bool = True __all__ = ["MraConfig"] diff --git a/src/transformers/models/mra/modeling_mra.py b/src/transformers/models/mra/modeling_mra.py index a8c00cf091b1..f36174d09792 100644 --- a/src/transformers/models/mra/modeling_mra.py +++ b/src/transformers/models/mra/modeling_mra.py @@ -832,7 +832,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -929,7 +929,7 @@ def forward( 
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.mra( input_ids, @@ -1018,7 +1018,7 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.mra( input_ids, @@ -1122,7 +1122,7 @@ def forward( num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above) """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None @@ -1200,7 +1200,7 @@ def forward( labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.mra( input_ids, @@ -1271,7 +1271,7 @@ def forward( return_dict: bool | None = None, **kwargs, ) -> tuple | QuestionAnsweringModelOutput: - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.mra( input_ids, diff --git a/src/transformers/models/mt5/configuration_mt5.py b/src/transformers/models/mt5/configuration_mt5.py index db087a45ef11..971402b2e742 100644 --- a/src/transformers/models/mt5/configuration_mt5.py +++ b/src/transformers/models/mt5/configuration_mt5.py @@ -13,14 +13,14 @@ # limitations under the License. 
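As with the other conversions in this diff, MraConfig now exposes its hyperparameters as typed class-level fields; a short sketch under that assumption (values illustrative):

from transformers.models.mra.configuration_mra import MraConfig

# Unspecified fields keep their class-level defaults; specified ones are plain keyword arguments.
config = MraConfig(num_hidden_layers=6, block_per_row=2)
assert config.hidden_size == 768
assert config.block_per_row == 2

# Serialization still goes through the usual config dict.
assert config.to_dict()["block_per_row"] == 2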
"""mT5 model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="google/mt5-small") +@strict(accept_kwargs=True) class MT5Config(PreTrainedConfig): r""" relative_attention_num_buckets (`int`, *optional*, defaults to 32): @@ -42,77 +42,57 @@ class MT5Config(PreTrainedConfig): "head_dim": "d_kv", } - def __init__( - self, - vocab_size=250112, - d_model=512, - d_kv=64, - d_ff=1024, - num_layers=8, - num_decoder_layers=None, - num_heads=6, - relative_attention_num_buckets=32, - relative_attention_max_distance=128, - dropout_rate=0.1, - layer_norm_epsilon=1e-6, - initializer_factor=1.0, - feed_forward_proj="gated-gelu", - is_encoder_decoder=True, - use_cache=True, - tokenizer_class="T5Tokenizer", - bos_token_id=None, - pad_token_id=0, - eos_token_id=1, - decoder_start_token_id=0, - classifier_dropout=0.0, - is_decoder=False, - **kwargs, - ): - self.is_decoder = is_decoder - self.vocab_size = vocab_size - self.d_model = d_model - self.d_kv = d_kv - self.d_ff = d_ff - self.num_layers = num_layers + vocab_size: int = 250112 + d_model: int = 512 + d_kv: int = 64 + d_ff: int = 1024 + num_layers: int = 8 + num_decoder_layers: int | None = None + num_heads: int = 6 + relative_attention_num_buckets: int = 32 + relative_attention_max_distance: int = 128 + dropout_rate: float = 0.1 + layer_norm_epsilon: float = 1e-6 + initializer_factor: float = 1.0 + feed_forward_proj: str = "gated-gelu" + is_encoder_decoder: bool = True + use_cache: bool = True + tokenizer_class: str = "T5Tokenizer" + tie_word_embeddings: bool = True + bos_token_id: int | None = None + pad_token_id: int | None = 0 + eos_token_id: int | None = 1 + decoder_start_token_id: int | None = 0 + classifier_dropout: float | int = 0.0 + is_decoder: bool = False + + def __post_init__(self, **kwargs): self.num_decoder_layers = ( - num_decoder_layers if num_decoder_layers is not None else self.num_layers + self.num_decoder_layers if self.num_decoder_layers is not None else self.num_layers ) # default = symmetry - self.num_heads = num_heads - self.relative_attention_num_buckets = relative_attention_num_buckets - self.relative_attention_max_distance = relative_attention_max_distance - self.dropout_rate = dropout_rate - self.classifier_dropout = classifier_dropout - self.layer_norm_epsilon = layer_norm_epsilon - self.initializer_factor = initializer_factor - self.feed_forward_proj = feed_forward_proj - self.use_cache = use_cache + act_info = self.feed_forward_proj.split("-") self.dense_act_fn = act_info[-1] self.is_gated_act = act_info[0] == "gated" - if len(act_info) > 1 and act_info[0] != "gated" or len(act_info) > 2: - raise ValueError( - f"`feed_forward_proj`: {feed_forward_proj} is not a valid activation function of the dense layer. " - "Please make sure `feed_forward_proj` is of the format `gated-{ACT_FN}` or `{ACT_FN}`, e.g. 
" - "'gated-gelu' or 'relu'" - ) - - # for backwards compatibility - if feed_forward_proj == "gated-gelu": + if self.feed_forward_proj == "gated-gelu": self.dense_act_fn = "gelu_new" # Force because official weights have False serialized, but we have to tie always kwargs.pop("tie_word_embeddings", None) self.tie_word_embeddings = True - self.tokenizer_class = tokenizer_class - self.bos_token_id = bos_token_id - self.pad_token_id = pad_token_id - self.eos_token_id = eos_token_id - self.decoder_start_token_id = decoder_start_token_id - super().__init__( - is_encoder_decoder=is_encoder_decoder, - **kwargs, - ) + super().__post_init__(**kwargs) + + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + act_info = self.feed_forward_proj.split("-") + + if len(act_info) > 1 and act_info[0] != "gated" or len(act_info) > 2: + raise ValueError( + f"`feed_forward_proj`: {self.feed_forward_proj} is not a valid activation function of the dense layer. " + "Please make sure `feed_forward_proj` is of the format `gated-{ACT_FN}` or `{ACT_FN}`, e.g. " + "'gated-gelu' or 'relu'" + ) __all__ = ["MT5Config"] diff --git a/src/transformers/models/mt5/modeling_mt5.py b/src/transformers/models/mt5/modeling_mt5.py index 6510c1becfe0..58ee53fa1039 100644 --- a/src/transformers/models/mt5/modeling_mt5.py +++ b/src/transformers/models/mt5/modeling_mt5.py @@ -652,7 +652,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if input_ids is not None and inputs_embeds is not None: err_msg_prefix = "decoder_" if self.is_decoder else "" @@ -914,7 +914,7 @@ def forward( >>> last_hidden_states = outputs.last_hidden_state ```""" use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict # Encode if needed (training, first prediction pass) if encoder_outputs is None: @@ -1099,7 +1099,7 @@ def forward( >>> # studies have shown that owning a dog is good for you. ```""" use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict # Encode if needed (training, first prediction pass) if encoder_outputs is None: @@ -1250,7 +1250,7 @@ def forward( >>> outputs = model(input_ids=input_ids) >>> last_hidden_states = outputs.last_hidden_state ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict encoder_outputs = self.encoder( input_ids=input_ids, @@ -1331,7 +1331,7 @@ def forward( Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if labels is not None: use_cache = False @@ -1457,7 +1457,7 @@ def forward( labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.transformer( input_ids, @@ -1577,7 +1577,7 @@ def forward( Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also be used by default. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict use_cache = use_cache if use_cache is not None else self.config.use_cache if start_positions is not None and end_positions is not None: use_cache = False @@ -1595,7 +1595,7 @@ def forward( decoder_input_ids = self._shift_right(input_ids) use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict # Encode if needed (training, first prediction pass) if encoder_outputs is None: diff --git a/src/transformers/models/musicgen/configuration_musicgen.py b/src/transformers/models/musicgen/configuration_musicgen.py index bdf8cfd64241..5563c2728437 100644 --- a/src/transformers/models/musicgen/configuration_musicgen.py +++ b/src/transformers/models/musicgen/configuration_musicgen.py @@ -13,15 +13,17 @@ # limitations under the License. 
"""MusicGen model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging -from ..auto.configuration_auto import AutoConfig +from typing import ClassVar +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring +from ..auto.configuration_auto import AutoConfig @auto_docstring(checkpoint="facebook/musicgen-small") +@strict(accept_kwargs=True) class MusicgenDecoderConfig(PreTrainedConfig): r""" audio_channels (`int`, *optional*, defaults to 1 @@ -33,64 +35,38 @@ class MusicgenDecoderConfig(PreTrainedConfig): base_config_key = "decoder_config" keys_to_ignore_at_inference = ["past_key_values"] - def __init__( - self, - vocab_size=2048, - max_position_embeddings=2048, - num_hidden_layers=24, - ffn_dim=4096, - num_attention_heads=16, - layerdrop=0.0, - use_cache=True, - activation_function="gelu", - hidden_size=1024, - dropout=0.1, - attention_dropout=0.0, - activation_dropout=0.0, - initializer_factor=0.02, - scale_embedding=False, - num_codebooks=4, - audio_channels=1, - pad_token_id=2048, - bos_token_id=2048, - eos_token_id=None, - tie_word_embeddings=False, - is_decoder=False, - add_cross_attention=False, - cross_attention_hidden_size=None, - **kwargs, - ): - self.is_decoder = is_decoder - self.add_cross_attention = add_cross_attention - self.cross_attention_hidden_size = cross_attention_hidden_size - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.ffn_dim = ffn_dim - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.dropout = dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.activation_function = activation_function - self.initializer_factor = initializer_factor - self.layerdrop = layerdrop - self.use_cache = use_cache - self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True - self.num_codebooks = num_codebooks - - if audio_channels not in [1, 2]: - raise ValueError(f"Expected 1 (mono) or 2 (stereo) audio channels, got {audio_channels} channels.") - self.audio_channels = audio_channels - - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - super().__init__(**kwargs) + vocab_size: int = 2048 + max_position_embeddings: int = 2048 + num_hidden_layers: int = 24 + ffn_dim: int = 4096 + num_attention_heads: int = 16 + layerdrop: float | int = 0.0 + use_cache: bool = True + activation_function: str = "gelu" + hidden_size: int = 1024 + dropout: float | int = 0.1 + attention_dropout: float | int = 0.0 + activation_dropout: float | int = 0.0 + initializer_factor: float = 0.02 + scale_embedding: bool = False + num_codebooks: int = 4 + audio_channels: int = 1 + pad_token_id: int | None = 2048 + bos_token_id: int | None = 2048 + eos_token_id: int | list[int] | None = None + tie_word_embeddings: bool = False + is_decoder: bool = False + add_cross_attention: bool = False + cross_attention_hidden_size: int | None = None + + def validate_architecture(self): + """Part of `@strict`-powered validation. 
Validates the architecture of the config.""" + if self.audio_channels not in [1, 2]: + raise ValueError(f"Expected 1 (mono) or 2 (stereo) audio channels, got {self.audio_channels} channels.") @auto_docstring(checkpoint="facebook/musicgen-small") +@strict(accept_kwargs=True) class MusicgenConfig(PreTrainedConfig): r""" text_encoder (`Union[dict, `PretrainedConfig`]`): @@ -139,34 +115,43 @@ class MusicgenConfig(PreTrainedConfig): >>> model = MusicgenForConditionalGeneration.from_pretrained("musicgen-model", config=musicgen_config) ```""" - model_type = "musicgen" - sub_configs = { + model_type: ClassVar[str] = "musicgen" + sub_configs: ClassVar[dict[str, type[PreTrainedConfig]]] = { "text_encoder": AutoConfig, "audio_encoder": AutoConfig, "decoder": MusicgenDecoderConfig, } - has_no_defaults_at_init = True - - def __init__(self, text_encoder, audio_encoder, decoder, **kwargs): - if isinstance(text_encoder, dict): - text_encoder_model_type = text_encoder.pop("model_type") - text_encoder = AutoConfig.for_model(text_encoder_model_type, **text_encoder) - - if isinstance(audio_encoder, dict): - audio_encoder_model_type = audio_encoder.pop("model_type") - audio_encoder = AutoConfig.for_model(audio_encoder_model_type, **audio_encoder) - - if isinstance(decoder, dict): - decoder = MusicgenDecoderConfig(**decoder) - - self.text_encoder = text_encoder - self.audio_encoder = audio_encoder - self.decoder = decoder - self.initializer_factor = self.decoder.initializer_factor - self.tie_encoder_decoder = kwargs.get("tie_encoder_decoder", False) - - kwargs["is_encoder_decoder"] = True - super().__init__(**kwargs) + has_no_defaults_at_init: ClassVar[bool] = True + + text_encoder: dict | PreTrainedConfig = None + audio_encoder: dict | PreTrainedConfig = None + decoder: dict | PreTrainedConfig = None + initializer_factor: float = 0.02 + + def __post_init__(self, **kwargs): + if isinstance(self.text_encoder, dict): + text_encoder_model_type = self.text_encoder.pop("model_type") + self.text_encoder = AutoConfig.for_model(text_encoder_model_type, **self.text_encoder) + elif self.text_encoder is None: + raise ValueError( + f"A configuration of type {self.model_type} cannot be instantiated because text_encoder is not passed" + ) + + if isinstance(self.audio_encoder, dict): + audio_encoder_model_type = self.audio_encoder.pop("model_type") + self.audio_encoder = AutoConfig.for_model(audio_encoder_model_type, **self.audio_encoder) + elif self.audio_encoder is None: + raise ValueError( + f"A configuration of type {self.model_type} cannot be instantiated because audio_encoder is not passed" + ) + + if isinstance(self.decoder, dict): + self.decoder = MusicgenDecoderConfig(**self.decoder) + elif self.decoder is None: + self.decoder = MusicgenDecoderConfig() + + self.is_encoder_decoder = True + super().__post_init__(**kwargs) @property # This is a property because you might want to change the codec model on the fly diff --git a/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py b/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py index ca9bc53d5721..6fa512e66d0a 100644 --- a/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py +++ b/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py @@ -13,15 +13,15 @@ # limitations under the License. 
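A sketch of constructing the converted MusicgenConfig above; the nested-dict promotion and the error path follow the `__post_init__` added in this diff, and assume nothing in the `@strict` machinery intercepts the `None` defaults first (values illustrative):

from transformers.models.musicgen.configuration_musicgen import MusicgenConfig, MusicgenDecoderConfig

# Nested dicts are promoted in __post_init__: encoders via AutoConfig.for_model, decoder via MusicgenDecoderConfig.
config = MusicgenConfig(
    text_encoder={"model_type": "t5"},
    audio_encoder={"model_type": "encodec"},
    decoder={"num_codebooks": 8},
)
assert isinstance(config.decoder, MusicgenDecoderConfig)
assert config.decoder.num_codebooks == 8
assert config.is_encoder_decoder

# Omitting a required encoder raises the explicit ValueError added in __post_init__.
try:
    MusicgenConfig(decoder={})
except ValueError as err:
    print(err)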
"""Musicgen Melody model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging +from ...utils import auto_docstring from ..auto.configuration_auto import AutoConfig -logger = logging.get_logger(__name__) - - @auto_docstring(checkpoint="facebook/musicgen-melody") +@strict(accept_kwargs=True) class MusicgenMelodyDecoderConfig(PreTrainedConfig): r""" audio_channels (`int`, *optional*, defaults to 1): @@ -33,62 +33,37 @@ class MusicgenMelodyDecoderConfig(PreTrainedConfig): base_config_key = "decoder_config" keys_to_ignore_at_inference = ["past_key_values"] - def __init__( - self, - vocab_size=2048, - max_position_embeddings=2048, - num_hidden_layers=24, - ffn_dim=4096, - num_attention_heads=16, - layerdrop=0.0, - use_cache=True, - activation_function="gelu", - hidden_size=1024, - dropout=0.1, - attention_dropout=0.0, - activation_dropout=0.0, - initializer_factor=0.02, - scale_embedding=False, - num_codebooks=4, - audio_channels=1, - pad_token_id=2048, - bos_token_id=2048, - eos_token_id=None, - tie_word_embeddings=False, - is_decoder=False, - add_cross_attention=False, - **kwargs, - ): - self.is_decoder = is_decoder - self.add_cross_attention = add_cross_attention - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.ffn_dim = ffn_dim - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.dropout = dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.activation_function = activation_function - self.initializer_factor = initializer_factor - self.layerdrop = layerdrop - self.use_cache = use_cache - self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True - self.num_codebooks = num_codebooks - - if audio_channels not in [1, 2]: - raise ValueError(f"Expected 1 (mono) or 2 (stereo) audio channels, got {audio_channels} channels.") - self.audio_channels = audio_channels - - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - super().__init__(**kwargs) + vocab_size: int = 2048 + max_position_embeddings: int = 2048 + num_hidden_layers: int = 24 + ffn_dim: int = 4096 + num_attention_heads: int = 16 + layerdrop: float | int = 0.0 + use_cache: bool = True + activation_function: str = "gelu" + hidden_size: int = 1024 + dropout: float | int = 0.1 + attention_dropout: float | int = 0.0 + activation_dropout: float | int = 0.0 + initializer_factor: float = 0.02 + scale_embedding: bool = False + num_codebooks: int = 4 + audio_channels: int = 1 + pad_token_id: int | None = 2048 + bos_token_id: int | None = 2048 + eos_token_id: int | list[int] | None = None + tie_word_embeddings: bool = False + is_decoder: bool = False + add_cross_attention: bool = False + + def validate_architecture(self): + """Part of `@strict`-powered validation. 
Validates the architecture of the config.""" + if self.audio_channels not in [1, 2]: + raise ValueError(f"Expected 1 (mono) or 2 (stereo) audio channels, got {self.audio_channels} channels.") @auto_docstring(checkpoint="facebook/musicgen-melody") +@strict(accept_kwargs=True) class MusicgenMelodyConfig(PreTrainedConfig): r""" text_encoder (`Union[dict, `PretrainedConfig`]`): @@ -147,34 +122,37 @@ class MusicgenMelodyConfig(PreTrainedConfig): } has_no_defaults_at_init = True - def __init__( - self, - text_encoder, - audio_encoder, - decoder, - num_chroma=12, - chroma_length=235, - **kwargs, - ): - if isinstance(text_encoder, dict): - text_encoder_model_type = text_encoder.pop("model_type") - text_encoder = AutoConfig.for_model(text_encoder_model_type, **text_encoder) - - if isinstance(audio_encoder, dict): - audio_encoder_model_type = audio_encoder.pop("model_type") - audio_encoder = AutoConfig.for_model(audio_encoder_model_type, **audio_encoder) - - if isinstance(decoder, dict): - decoder = MusicgenMelodyDecoderConfig(**decoder) - - self.text_encoder = text_encoder - self.audio_encoder = audio_encoder - self.decoder = decoder - self.num_chroma = num_chroma - self.chroma_length = chroma_length - self.tie_encoder_decoder = kwargs.get("tie_encoder_decoder", False) - kwargs["is_encoder_decoder"] = False - super().__init__(**kwargs) + text_encoder: dict | PreTrainedConfig = None + audio_encoder: dict | PreTrainedConfig = None + decoder: dict | PreTrainedConfig = None + num_chroma: int = 12 + chroma_length: int = 235 + initializer_factor: float = 0.02 + + def __post_init__(self, **kwargs): + if isinstance(self.text_encoder, dict): + text_encoder_model_type = self.text_encoder.pop("model_type") + self.text_encoder = AutoConfig.for_model(text_encoder_model_type, **self.text_encoder) + elif self.text_encoder is None: + raise ValueError( + f"A configuration of type {self.model_type} cannot be instantiated because text_encoder is not passed" + ) + + if isinstance(self.audio_encoder, dict): + audio_encoder_model_type = self.audio_encoder.pop("model_type") + self.audio_encoder = AutoConfig.for_model(audio_encoder_model_type, **self.audio_encoder) + elif self.audio_encoder is None: + raise ValueError( + f"A configuration of type {self.model_type} cannot be instantiated because audio_encoder is not passed" + ) + + if isinstance(self.decoder, dict): + self.decoder = MusicgenMelodyDecoderConfig(**self.decoder) + elif self.decoder is None: + self.decoder = MusicgenMelodyDecoderConfig() + + self.is_encoder_decoder = True + super().__post_init__(**kwargs) @property # This is a property because you might want to change the codec model on the fly diff --git a/src/transformers/models/mvp/configuration_mvp.py b/src/transformers/models/mvp/configuration_mvp.py index 03925963d83b..d0ae7390e664 100644 --- a/src/transformers/models/mvp/configuration_mvp.py +++ b/src/transformers/models/mvp/configuration_mvp.py @@ -13,14 +13,14 @@ # limitations under the License. 
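The melody decoder config keeps the same mono/stereo constraint as before, now expressed as a `validate_architecture` hook; a sketch assuming the hook runs at construction time (values illustrative):

from transformers.models.musicgen_melody.configuration_musicgen_melody import MusicgenMelodyDecoderConfig

mono = MusicgenMelodyDecoderConfig(audio_channels=1)
stereo = MusicgenMelodyDecoderConfig(audio_channels=2)
assert (mono.audio_channels, stereo.audio_channels) == (1, 2)

# Anything other than 1 or 2 channels is rejected.
try:
    MusicgenMelodyDecoderConfig(audio_channels=4)
except ValueError as err:
    print(err)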
"""MVP model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="RUCAIBox/mvp") +@strict(accept_kwargs=True) class MvpConfig(PreTrainedConfig): r""" use_prompt (`bool`, *optional*, defaults to `False`): @@ -47,75 +47,41 @@ class MvpConfig(PreTrainedConfig): model_type = "mvp" keys_to_ignore_at_inference = ["past_key_values"] - attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"} - - def __init__( - self, - vocab_size=50267, - max_position_embeddings=1024, - encoder_layers=12, - encoder_ffn_dim=4096, - encoder_attention_heads=16, - decoder_layers=12, - decoder_ffn_dim=4096, - decoder_attention_heads=16, - encoder_layerdrop=0.0, - decoder_layerdrop=0.0, - activation_function="gelu", - d_model=1024, - dropout=0.1, - attention_dropout=0.0, - activation_dropout=0.0, - init_std=0.02, - classifier_dropout=0.0, - scale_embedding=False, - use_cache=True, - pad_token_id=1, - bos_token_id=0, - eos_token_id=2, - is_encoder_decoder=True, - decoder_start_token_id=2, - use_prompt=False, - prompt_length=100, - prompt_mid_dim=800, - is_decoder=False, - tie_word_embeddings=True, - **kwargs, - ): - self.is_decoder = is_decoder - self.tie_word_embeddings = tie_word_embeddings - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.d_model = d_model - self.encoder_ffn_dim = encoder_ffn_dim - self.encoder_layers = encoder_layers - self.encoder_attention_heads = encoder_attention_heads - self.decoder_ffn_dim = decoder_ffn_dim - self.decoder_layers = decoder_layers - self.decoder_attention_heads = decoder_attention_heads - self.dropout = dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.activation_function = activation_function - self.init_std = init_std - self.encoder_layerdrop = encoder_layerdrop - self.decoder_layerdrop = decoder_layerdrop - self.classifier_dropout = classifier_dropout - self.use_cache = use_cache - self.num_hidden_layers = encoder_layers - self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True - self.use_prompt = use_prompt - self.prompt_length = prompt_length - self.prompt_mid_dim = prompt_mid_dim + attribute_map = { + "num_attention_heads": "encoder_attention_heads", + "hidden_size": "d_model", + "num_hidden_layers": "encoder_layers", + } - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.decoder_start_token_id = decoder_start_token_id - super().__init__( - is_encoder_decoder=is_encoder_decoder, - **kwargs, - ) + vocab_size: int = 50267 + max_position_embeddings: int = 1024 + encoder_layers: int = 12 + encoder_ffn_dim: int = 4096 + encoder_attention_heads: int = 16 + decoder_layers: int = 12 + decoder_ffn_dim: int = 4096 + decoder_attention_heads: int = 16 + encoder_layerdrop: float | int = 0.0 + decoder_layerdrop: float | int = 0.0 + activation_function: str = "gelu" + d_model: int = 1024 + dropout: float | int = 0.1 + attention_dropout: float | int = 0.0 + activation_dropout: float | int = 0.0 + init_std: float = 0.02 + classifier_dropout: float | int = 0.0 + scale_embedding: bool = False + use_cache: bool = True + pad_token_id: int | None = 1 + bos_token_id: int | None = 0 + eos_token_id: int | None 
= 2 + is_encoder_decoder: int = True + decoder_start_token_id: int | None = 2 + use_prompt: bool = False + prompt_length: int = 100 + prompt_mid_dim: int = 800 + is_decoder: bool = False + tie_word_embeddings: bool = True __all__ = ["MvpConfig"] diff --git a/src/transformers/models/mvp/modeling_mvp.py b/src/transformers/models/mvp/modeling_mvp.py index fb4ad396c989..1e581b9ea8ec 100644 --- a/src/transformers/models/mvp/modeling_mvp.py +++ b/src/transformers/models/mvp/modeling_mvp.py @@ -567,7 +567,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict # retrieve input_ids and inputs_embeds if input_ids is not None and inputs_embeds is not None: @@ -756,7 +756,7 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict # retrieve input_ids and inputs_embeds if input_ids is not None and inputs_embeds is not None: @@ -974,7 +974,7 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if encoder_outputs is None: encoder_outputs = self.encoder( @@ -1136,7 +1136,7 @@ def forward( >>> generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) ``` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if labels is not None: if use_cache: @@ -1282,7 +1282,7 @@ def forward( >>> predicted_class_id = logits.argmax() ``` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if labels is not None: use_cache = False @@ -1449,7 +1449,7 @@ def forward( >>> predict_answer = tokenizer.decode(predict_answer_tokens) ``` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if start_positions is not None and end_positions is not None: use_cache = False @@ -1596,7 +1596,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) outputs = self.model.decoder( diff --git a/src/transformers/models/nanochat/configuration_nanochat.py b/src/transformers/models/nanochat/configuration_nanochat.py index df00a75fce45..9f305fd5f485 100644 --- a/src/transformers/models/nanochat/configuration_nanochat.py +++ b/src/transformers/models/nanochat/configuration_nanochat.py @@ 
-12,12 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. + +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PretrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring @auto_docstring(checkpoint="karpathy/nanochat-d32") +@strict(accept_kwargs=True) class NanoChatConfig(PretrainedConfig): r""" Example: @@ -47,55 +51,31 @@ class NanoChatConfig(PretrainedConfig): "layers.*.mlp.fc2": "rowwise", } - def __init__( - self, - vocab_size: int = 50304, - hidden_size: int = 768, - intermediate_size: int | None = 8192, - num_hidden_layers: int = 12, - num_attention_heads: int = 6, - num_key_value_heads: int | None = None, - max_position_embeddings: int = 2048, - hidden_act: str = "relu2", - attention_dropout: float = 0.0, - rms_norm_eps: float = 1e-6, - initializer_range: float = 0.02, - rope_parameters: RopeParameters | dict | None = None, - use_cache: bool = True, - final_logit_softcapping: float | None = 15.0, - attention_bias: bool = False, - bos_token_id: int = 0, - eos_token_id: int = 1, - pad_token_id: int = 1, - tie_word_embeddings: bool = False, - **kwargs, - ): - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.max_position_embeddings = max_position_embeddings - self.hidden_act = hidden_act - self.attention_dropout = attention_dropout - self.rms_norm_eps = rms_norm_eps - self.initializer_range = initializer_range - self.use_cache = use_cache - self.final_logit_softcapping = final_logit_softcapping - self.attention_bias = attention_bias - self.rope_parameters = rope_parameters - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - - super().__init__(**kwargs) + vocab_size: int = 50304 + hidden_size: int = 768 + intermediate_size: int = 8192 + num_hidden_layers: int = 12 + num_attention_heads: int = 6 + num_key_value_heads: int | None = None + max_position_embeddings: int = 2048 + hidden_act: str = "relu2" + attention_dropout: float | int = 0.0 + rms_norm_eps: float = 1e-6 + initializer_range: float = 0.02 + rope_parameters: RopeParameters | dict | None = None + use_cache: bool = True + final_logit_softcapping: float | None = 15.0 + attention_bias: bool = False + bos_token_id: int | None = 0 + eos_token_id: int | list[int] | None = 1 + pad_token_id: int | None = 1 + tie_word_embeddings: bool = False + + def __post_init__(self, **kwargs): + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads + + super().__post_init__(**kwargs) __all__ = ["NanoChatConfig"] diff --git a/src/transformers/models/nemotron/configuration_nemotron.py b/src/transformers/models/nemotron/configuration_nemotron.py index bb0570446c87..8bdfc8759c5e 100644 --- a/src/transformers/models/nemotron/configuration_nemotron.py +++ b/src/transformers/models/nemotron/configuration_nemotron.py @@ -14,15 +14,15 @@ # limitations under the License. 
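A sketch of the NanoChatConfig backward-compatibility default shown above (values illustrative):

from transformers.models.nanochat.configuration_nanochat import NanoChatConfig

# When num_key_value_heads is left as None, __post_init__ falls back to num_attention_heads (plain MHA).
config = NanoChatConfig(num_attention_heads=8)
assert config.num_key_value_heads == 8

# Grouped-query attention is still opted into explicitly.
gqa = NanoChatConfig(num_attention_heads=8, num_key_value_heads=2)
assert gqa.num_key_value_heads == 2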
"""Nemotron model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters -from ...utils import auto_docstring, logging - - -logger = logging.get_logger(__name__) +from ...utils import auto_docstring @auto_docstring(checkpoint="thhaus/nemotron3-8b") +@strict(accept_kwargs=True) class NemotronConfig(PreTrainedConfig): r""" Example: @@ -43,53 +43,31 @@ class NemotronConfig(PreTrainedConfig): model_type = "nemotron" keys_to_ignore_at_inference = ["past_key_values"] - def __init__( - self, - vocab_size: int | None = 256000, - hidden_size: int | None = 6144, - intermediate_size: int | None = 24576, - num_hidden_layers: int | None = 32, - num_attention_heads: int | None = 48, - head_dim: int | None = None, - num_key_value_heads: int | None = None, - hidden_act: str | None = "relu2", - max_position_embeddings: int | None = 4096, - initializer_range: float | None = 0.0134, - norm_eps: int | None = 1e-5, - use_cache: bool | None = True, - pad_token_id: int | None = None, - bos_token_id: int | None = 2, - eos_token_id: int | None = 3, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - mlp_bias: bool | None = False, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.head_dim = head_dim if head_dim is not None else hidden_size // num_attention_heads - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.norm_eps = norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.mlp_bias = mlp_bias - self.rope_parameters = rope_parameters - kwargs.setdefault("partial_rotary_factor", 0.5) # assign default for BC + vocab_size: int = 256000 + hidden_size: int = 6144 + intermediate_size: int = 24576 + num_hidden_layers: int = 32 + num_attention_heads: int = 48 + head_dim: int | None = None + num_key_value_heads: int | None = None + hidden_act: str = "relu2" + max_position_embeddings: int = 4096 + initializer_range: float = 0.0134 + norm_eps: float = 1e-5 + use_cache: bool = True + pad_token_id: int | None = None + bos_token_id: int | None = 2 + eos_token_id: int | list[int] | None = 3 + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + attention_dropout: float | int = 0.0 + mlp_bias: bool = False - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - super().__init__(**kwargs) + def __post_init__(self, **kwargs): + self.head_dim = self.head_dim if self.head_dim is not None else self.hidden_size // self.num_attention_heads + kwargs.setdefault("partial_rotary_factor", 0.5) # assign default for BC + super().__post_init__(**kwargs) __all__ = ["NemotronConfig"] diff --git a/src/transformers/models/nllb_moe/configuration_nllb_moe.py b/src/transformers/models/nllb_moe/configuration_nllb_moe.py index 24b76fa84617..9128e185d36e 100644 --- a/src/transformers/models/nllb_moe/configuration_nllb_moe.py +++ 
b/src/transformers/models/nllb_moe/configuration_nllb_moe.py @@ -13,14 +13,16 @@ # limitations under the License. """NLLB-MoE model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging +from typing import Literal +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="facebook/nllb-moe-54b") +@strict(accept_kwargs=True) class NllbMoeConfig(PreTrainedConfig): r""" second_expert_policy ( `str`, *optional*, default to `"all"`): @@ -70,94 +72,51 @@ class NllbMoeConfig(PreTrainedConfig): model_type = "nllb-moe" keys_to_ignore_at_inference = ["past_key_values"] - attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"} - - def __init__( - self, - vocab_size=128112, - max_position_embeddings=1024, - encoder_layers=12, - encoder_ffn_dim=4096, - encoder_attention_heads=16, - decoder_layers=12, - decoder_ffn_dim=4096, - decoder_attention_heads=16, - encoder_layerdrop=0.05, - decoder_layerdrop=0.05, - use_cache=True, - is_encoder_decoder=True, - activation_function="relu", - d_model=1024, - dropout=0.1, - attention_dropout=0.1, - activation_dropout=0.0, - init_std=0.02, - decoder_start_token_id=2, - scale_embedding=True, - router_bias=False, - router_dtype="float32", - router_ignore_padding_tokens=False, - num_experts=128, - expert_capacity=64, - encoder_sparse_step=4, - decoder_sparse_step=4, - router_z_loss_coef=0.001, - router_aux_loss_coef=0.001, - second_expert_policy="all", - normalize_router_prob_before_dropping=False, - batch_prioritized_routing=False, - moe_eval_capacity_token_fraction=1.0, - moe_token_dropout=0.2, - pad_token_id=1, - bos_token_id=0, - eos_token_id=2, - tie_word_embeddings=True, - output_router_logits=False, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.d_model = d_model - self.encoder_ffn_dim = encoder_ffn_dim - self.encoder_layers = encoder_layers - self.encoder_attention_heads = encoder_attention_heads - self.decoder_ffn_dim = decoder_ffn_dim - self.decoder_layers = decoder_layers - self.decoder_attention_heads = decoder_attention_heads - self.dropout = dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.activation_function = activation_function - self.init_std = init_std - self.encoder_layerdrop = encoder_layerdrop - self.decoder_layerdrop = decoder_layerdrop - self.use_cache = use_cache - self.num_hidden_layers = encoder_layers - self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True - self.router_z_loss_coef = router_z_loss_coef - self.router_aux_loss_coef = router_aux_loss_coef - self.decoder_sparse_step = decoder_sparse_step - self.encoder_sparse_step = encoder_sparse_step - self.num_experts = num_experts - self.expert_capacity = expert_capacity - self.router_bias = router_bias - if router_dtype not in ["float32", "float16", "bfloat16"]: - raise ValueError(f"`router_dtype` must be one of 'float32', 'float16' or 'bfloat16', got {router_dtype}") - self.router_dtype = router_dtype + attribute_map = { + "num_attention_heads": "encoder_attention_heads", + "hidden_size": "d_model", + "num_hidden_layers": "encoder_layers", + } - self.router_ignore_padding_tokens = router_ignore_padding_tokens - self.batch_prioritized_routing = batch_prioritized_routing - self.second_expert_policy = 
second_expert_policy - self.normalize_router_prob_before_dropping = normalize_router_prob_before_dropping - self.moe_eval_capacity_token_fraction = moe_eval_capacity_token_fraction - self.moe_token_dropout = moe_token_dropout - self.output_router_logits = output_router_logits - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.decoder_start_token_id = decoder_start_token_id - self.tie_word_embeddings = tie_word_embeddings - super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + vocab_size: int = 128112 + max_position_embeddings: int = 1024 + encoder_layers: int = 12 + encoder_ffn_dim: int = 4096 + encoder_attention_heads: int = 16 + decoder_layers: int = 12 + decoder_ffn_dim: int = 4096 + decoder_attention_heads: int = 16 + encoder_layerdrop: float | int = 0.05 + decoder_layerdrop: float | int = 0.05 + use_cache: bool = True + is_encoder_decoder: bool = True + activation_function: str = "relu" + d_model: int = 1024 + dropout: float | int = 0.1 + attention_dropout: float | int = 0.1 + activation_dropout: float | int = 0.0 + init_std: float = 0.02 + decoder_start_token_id: int | None = 2 + scale_embedding: bool = True + router_bias: bool = False + router_dtype: Literal["float32", "float16", "bfloat16"] = "float32" + router_ignore_padding_tokens: bool = False + num_experts: int = 128 + expert_capacity: int = 64 + encoder_sparse_step: int = 4 + decoder_sparse_step: int = 4 + router_z_loss_coef: float = 0.001 + router_aux_loss_coef: float = 0.001 + second_expert_policy: str = "all" + normalize_router_prob_before_dropping: bool = False + batch_prioritized_routing: bool = False + moe_eval_capacity_token_fraction: float = 1.0 + moe_token_dropout: float | int = 0.2 + pad_token_id: int | None = 1 + bos_token_id: int | None = 0 + eos_token_id: int | None = 2 + tie_word_embeddings: bool = True + output_router_logits: bool = False __all__ = ["NllbMoeConfig"] diff --git a/src/transformers/models/nystromformer/configuration_nystromformer.py b/src/transformers/models/nystromformer/configuration_nystromformer.py index 8cdd1feee735..e935c7f253c9 100644 --- a/src/transformers/models/nystromformer/configuration_nystromformer.py +++ b/src/transformers/models/nystromformer/configuration_nystromformer.py @@ -13,14 +13,14 @@ # limitations under the License. 
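A sketch covering the two configs converted just above: Nemotron's derived head_dim and NLLB-MoE's Literal-typed router dtype (values illustrative; the Literal constraint is assumed to be enforced by `@strict`):

from transformers.models.nemotron.configuration_nemotron import NemotronConfig
from transformers.models.nllb_moe.configuration_nllb_moe import NllbMoeConfig

# head_dim is derived in __post_init__ when not passed: hidden_size // num_attention_heads.
nemotron = NemotronConfig(hidden_size=1024, num_attention_heads=16)
assert nemotron.head_dim == 64

# router_dtype is now constrained by its Literal annotation instead of a hand-written ValueError.
nllb = NllbMoeConfig(router_dtype="bfloat16")
assert nllb.router_dtype == "bfloat16"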
"""Nystromformer model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="uw-madison/nystromformer-512") +@strict(accept_kwargs=True) class NystromformerConfig(PreTrainedConfig): r""" segment_means_seq_len (`int`, *optional*, defaults to 64): @@ -51,53 +51,27 @@ class NystromformerConfig(PreTrainedConfig): model_type = "nystromformer" - def __init__( - self, - vocab_size=30000, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu_new", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=510, - type_vocab_size=2, - segment_means_seq_len=64, - num_landmarks=64, - conv_kernel_size=65, - inv_coeff_init_option=False, - initializer_range=0.02, - layer_norm_eps=1e-5, - pad_token_id=1, - bos_token_id=0, - eos_token_id=2, - add_cross_attention=False, - tie_word_embeddings=True, - **kwargs, - ): - self.add_cross_attention = add_cross_attention - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.initializer_range = initializer_range - self.type_vocab_size = type_vocab_size - self.segment_means_seq_len = segment_means_seq_len - self.num_landmarks = num_landmarks - self.conv_kernel_size = conv_kernel_size - self.inv_coeff_init_option = inv_coeff_init_option - self.layer_norm_eps = layer_norm_eps - super().__init__(**kwargs) + vocab_size: int = 30000 + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu_new" + hidden_dropout_prob: float = 0.1 + attention_probs_dropout_prob: float = 0.1 + max_position_embeddings: int = 510 + type_vocab_size: int = 2 + segment_means_seq_len: int = 64 + num_landmarks: int = 64 + conv_kernel_size: int = 65 + inv_coeff_init_option: bool = False + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-5 + pad_token_id: int | None = 1 + bos_token_id: int | None = 0 + eos_token_id: int | None = 2 + add_cross_attention: bool = False + tie_word_embeddings: bool = True __all__ = ["NystromformerConfig"] diff --git a/src/transformers/models/nystromformer/modeling_nystromformer.py b/src/transformers/models/nystromformer/modeling_nystromformer.py index 3420457b4148..33623c990fcc 100755 --- a/src/transformers/models/nystromformer/modeling_nystromformer.py +++ b/src/transformers/models/nystromformer/modeling_nystromformer.py @@ -454,7 +454,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You 
cannot specify both input_ids and inputs_embeds at the same time") @@ -553,7 +553,7 @@ def forward( config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.nystromformer( input_ids, @@ -643,7 +643,7 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.nystromformer( input_ids, @@ -749,7 +749,7 @@ def forward( num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above) """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None @@ -829,7 +829,7 @@ def forward( labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.nystromformer( input_ids, @@ -893,7 +893,7 @@ def forward( return_dict: bool | None = None, **kwargs, ) -> tuple[torch.Tensor] | QuestionAnsweringModelOutput: - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.nystromformer( input_ids, diff --git a/src/transformers/models/olmo/configuration_olmo.py b/src/transformers/models/olmo/configuration_olmo.py index 5d7377e05a5e..254c1f8a9199 100644 --- a/src/transformers/models/olmo/configuration_olmo.py +++ b/src/transformers/models/olmo/configuration_olmo.py @@ -18,15 +18,15 @@ # limitations under the License. 
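Alongside the config conversions, the modeling files in this diff consistently read `config.return_dict` instead of the old `use_return_dict` property; a small sketch of that defaulting pattern with the converted NystromformerConfig, assuming `return_dict` defaults to True as in the base config:

from transformers.models.nystromformer.configuration_nystromformer import NystromformerConfig

config = NystromformerConfig()

# The defaulting idiom used throughout the forward() methods above.
return_dict = None  # what a caller might (not) pass
return_dict = return_dict if return_dict is not None else config.return_dict
assert return_dict is True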
"""OLMo model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters -from ...utils import auto_docstring, logging - - -logger = logging.get_logger(__name__) +from ...utils import auto_docstring @auto_docstring(checkpoint="allenai/OLMo-7B-hf") +@strict(accept_kwargs=True) class OlmoConfig(PreTrainedConfig): r""" clip_qkv (`float`, *optional*): @@ -63,53 +63,29 @@ class OlmoConfig(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size: int | None = 50304, - hidden_size: int | None = 4096, - intermediate_size: int | None = 11008, - num_hidden_layers: int | None = 32, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = None, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 2048, - initializer_range: float | None = 0.02, - use_cache: bool | None = True, - pad_token_id: int | None = 1, - bos_token_id: int | None = None, - eos_token_id: int | None = 50279, - tie_word_embeddings: int | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - clip_qkv: bool | None = None, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.clip_qkv = clip_qkv - self.rope_parameters = rope_parameters - - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - super().__init__(**kwargs) + vocab_size: int = 50304 + hidden_size: int = 4096 + intermediate_size: int = 11008 + num_hidden_layers: int = 32 + num_attention_heads: int = 32 + num_key_value_heads: int | None = None + hidden_act: str = "silu" + max_position_embeddings: int = 2048 + initializer_range: float = 0.02 + use_cache: bool = True + pad_token_id: int | None = 1 + bos_token_id: int | None = None + eos_token_id: int | list[int] | None = 50279 + tie_word_embeddings: int = False + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + attention_dropout: float | int = 0.0 + clip_qkv: float | None = None + + def __post_init__(self, **kwargs): + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads + super().__post_init__(**kwargs) __all__ = ["OlmoConfig"] diff --git a/src/transformers/models/olmo2/configuration_olmo2.py b/src/transformers/models/olmo2/configuration_olmo2.py index 0dbfa1e5f833..b11a59a6b1cd 100644 --- a/src/transformers/models/olmo2/configuration_olmo2.py +++ b/src/transformers/models/olmo2/configuration_olmo2.py @@ -23,12 +23,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring @auto_docstring(checkpoint="allenai/Olmo2-7B-1124-hf") +@strict(accept_kwargs=True) class Olmo2Config(PreTrainedConfig): r""" Example: @@ -64,54 +67,30 @@ class Olmo2Config(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size: int | None = 50304, - hidden_size: int | None = 4096, - intermediate_size: int | None = 11008, - num_hidden_layers: int | None = 32, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = None, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 2048, - initializer_range: float | None = 0.02, - use_cache: bool | None = True, - pad_token_id: int | None = 1, - bos_token_id: int | None = None, - eos_token_id: int | None = 50279, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - rms_norm_eps: int | None = 1e-5, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.rope_parameters = rope_parameters + vocab_size: int = 50304 + hidden_size: int = 4096 + intermediate_size: int = 11008 + num_hidden_layers: int = 32 + num_attention_heads: int = 32 + num_key_value_heads: int | None = None + hidden_act: str = "silu" + max_position_embeddings: int = 2048 + initializer_range: float = 0.02 + use_cache: bool = True + pad_token_id: int | None = 1 + bos_token_id: int | None = None + eos_token_id: int | list[int] | None = 50279 + tie_word_embeddings: int = False + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + attention_dropout: float | int = 0.0 - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - super().__init__(**kwargs) + rms_norm_eps: float = 1e-5 - self.rms_norm_eps = rms_norm_eps + def __post_init__(self, **kwargs): + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads + super().__post_init__(**kwargs) __all__ = ["Olmo2Config"] diff --git a/src/transformers/models/olmo2/modular_olmo2.py b/src/transformers/models/olmo2/modular_olmo2.py index 814d6a1bdec2..c44449921f34 100644 --- a/src/transformers/models/olmo2/modular_olmo2.py +++ b/src/transformers/models/olmo2/modular_olmo2.py @@ -21,11 +21,11 @@ import torch import torch.nn as nn +from huggingface_hub.dataclasses import strict from transformers.utils.generic import TransformersKwargs from ...cache_utils import Cache -from ...modeling_rope_utils import RopeParameters from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...utils import auto_docstring, logging @@ -45,6 +45,7 @@ 
@auto_docstring(checkpoint="allenai/Olmo2-7B-1124-hf") +@strict(accept_kwargs=True) class Olmo2Config(OlmoConfig): r""" Example: @@ -79,51 +80,8 @@ class Olmo2Config(OlmoConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size: int | None = 50304, - hidden_size: int | None = 4096, - intermediate_size: int | None = 11008, - num_hidden_layers: int | None = 32, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = None, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 2048, - initializer_range: float | None = 0.02, - use_cache: bool | None = True, - pad_token_id: int | None = 1, - bos_token_id: int | None = None, - eos_token_id: int | None = 50279, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - rms_norm_eps: int | None = 1e-5, - **kwargs, - ): - super().__init__( - vocab_size=vocab_size, - hidden_size=hidden_size, - intermediate_size=intermediate_size, - num_hidden_layers=num_hidden_layers, - num_attention_heads=num_attention_heads, - num_key_value_heads=num_key_value_heads, - hidden_act=hidden_act, - max_position_embeddings=max_position_embeddings, - initializer_range=initializer_range, - use_cache=use_cache, - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - rope_parameters=rope_parameters, - attention_bias=attention_bias, - attention_dropout=attention_dropout, - **kwargs, - ) - - self.rms_norm_eps = rms_norm_eps - del self.clip_qkv + rms_norm_eps: float = 1e-5 + clip_qkv = AttributeError() # OLMo2 RMS norm is identical to Llama RMS norm except: diff --git a/src/transformers/models/olmo3/configuration_olmo3.py b/src/transformers/models/olmo3/configuration_olmo3.py index 5fcfd16959bb..e986bf7b26a7 100644 --- a/src/transformers/models/olmo3/configuration_olmo3.py +++ b/src/transformers/models/olmo3/configuration_olmo3.py @@ -18,12 +18,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from ...configuration_utils import PreTrainedConfig, layer_type_validation
+from huggingface_hub.dataclasses import strict
+
+from ...configuration_utils import PreTrainedConfig
 from ...modeling_rope_utils import RopeParameters
 from ...utils import auto_docstring


 @auto_docstring(checkpoint="allenai/Olmo-3-7B-Instruct")
+@strict(accept_kwargs=True)
 class Olmo3Config(PreTrainedConfig):
     r"""
     Example:
@@ -59,64 +62,38 @@ class Olmo3Config(PreTrainedConfig):
         "norm": (["hidden_states"], ["hidden_states"]),
     }

-    def __init__(
-        self,
-        vocab_size: int | None = 50304,
-        hidden_size: int | None = 4096,
-        intermediate_size: int | None = 11008,
-        num_hidden_layers: int | None = 32,
-        num_attention_heads: int | None = 32,
-        num_key_value_heads: int | None = None,
-        hidden_act: str | None = "silu",
-        max_position_embeddings: int | None = 2048,
-        initializer_range: float | None = 0.02,
-        use_cache: bool | None = True,
-        pad_token_id: int | None = 1,
-        bos_token_id: int | None = None,
-        eos_token_id: int | None = 50279,
-        tie_word_embeddings: bool | None = False,
-        rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None,
-        attention_bias: bool | None = False,
-        attention_dropout: float | None = 0.0,
-        rms_norm_eps: float | None = 1e-5,
-        sliding_window: int | None = 4096,
-        layer_types: list[str] | None = None,
-        **kwargs,
-    ):
-        self.vocab_size = vocab_size
-        self.max_position_embeddings = max_position_embeddings
-        self.hidden_size = hidden_size
-        self.intermediate_size = intermediate_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-
-        # for backward compatibility
-        if num_key_value_heads is None:
-            num_key_value_heads = num_attention_heads
-
-        self.num_key_value_heads = num_key_value_heads
-        self.hidden_act = hidden_act
-        self.initializer_range = initializer_range
-        self.use_cache = use_cache
-        self.attention_bias = attention_bias
-        self.attention_dropout = attention_dropout
-        self.tie_word_embeddings = tie_word_embeddings
-        self.pad_token_id = pad_token_id
-        self.bos_token_id = bos_token_id
-        self.eos_token_id = eos_token_id
-
-        self.rms_norm_eps = rms_norm_eps
-        self.sliding_window = sliding_window
-        self.layer_types = layer_types
+    vocab_size: int = 50304
+    hidden_size: int = 4096
+    intermediate_size: int = 11008
+    num_hidden_layers: int = 32
+    num_attention_heads: int = 32
+    num_key_value_heads: int | None = None
+    hidden_act: str = "silu"
+    max_position_embeddings: int = 2048
+    initializer_range: float = 0.02
+    use_cache: bool = True
+    pad_token_id: int | None = 1
+    bos_token_id: int | None = None
+    eos_token_id: int | list[int] | None = 50279
+    tie_word_embeddings: int = False
+    rope_parameters: RopeParameters | dict | None = None
+    attention_bias: bool = False
+    attention_dropout: float | int = 0.0
+
+    rms_norm_eps: float = 1e-5
+
+    sliding_window: int | None = 4096
+    layer_types: list[str] | None = None
+
+    def __post_init__(self, **kwargs):
+        if self.num_key_value_heads is None:
+            self.num_key_value_heads = self.num_attention_heads
+
         if self.layer_types is None:
             self.layer_types = [
                 "sliding_attention" if (i + 1) % 4 != 0 else "full_attention"
                 for i in range(self.num_hidden_layers)
             ]
-        layer_type_validation(self.layer_types, self.num_hidden_layers)
-
-        self.rope_parameters = rope_parameters
-
-        super().__init__(**kwargs)
+        super().__post_init__(**kwargs)


 __all__ = ["Olmo3Config"]
diff --git
a/src/transformers/models/olmo3/modular_olmo3.py b/src/transformers/models/olmo3/modular_olmo3.py index 4f1dd96b4d28..4be52a8cab52 100644 --- a/src/transformers/models/olmo3/modular_olmo3.py +++ b/src/transformers/models/olmo3/modular_olmo3.py @@ -16,17 +16,17 @@ import torch import torch.nn as nn +from huggingface_hub.dataclasses import strict from ...cache_utils import Cache, DynamicCache -from ...configuration_utils import PreTrainedConfig, layer_type_validation from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask from ...modeling_outputs import BaseModelOutputWithPast -from ...modeling_rope_utils import RopeParameters from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...utils import auto_docstring from ...utils.generic import TransformersKwargs from ..gemma2.modeling_gemma2 import Gemma2RotaryEmbedding +from ..olmo2.configuration_olmo2 import Olmo2Config from ..olmo2.modeling_olmo2 import ( Olmo2Attention, Olmo2DecoderLayer, @@ -40,7 +40,8 @@ @auto_docstring(checkpoint="allenai/Olmo-3-7B-Instruct") -class Olmo3Config(PreTrainedConfig): +@strict(accept_kwargs=True) +class Olmo3Config(Olmo2Config): r""" Example: @@ -75,64 +76,19 @@ class Olmo3Config(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size: int | None = 50304, - hidden_size: int | None = 4096, - intermediate_size: int | None = 11008, - num_hidden_layers: int | None = 32, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = None, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 2048, - initializer_range: float | None = 0.02, - use_cache: bool | None = True, - pad_token_id: int | None = 1, - bos_token_id: int | None = None, - eos_token_id: int | None = 50279, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - rms_norm_eps: float | None = 1e-5, - sliding_window: int | None = 4096, - layer_types: list[str] | None = None, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - - self.rms_norm_eps = rms_norm_eps - self.sliding_window = sliding_window - self.layer_types = layer_types + sliding_window: int | None = 4096 + layer_types: list[str] | None = None + + def __post_init__(self, **kwargs): + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads + if self.layer_types is None: self.layer_types = [ "sliding_attention" if (i + 1) % 4 != 0 else "full_attention" for i in range(self.num_hidden_layers) ] - layer_type_validation(self.layer_types, self.num_hidden_layers) - - self.rope_parameters = rope_parameters - super().__init__(**kwargs) + 
super().__post_init__(**kwargs) class Olmo3RMSNorm(Olmo2RMSNorm): diff --git a/src/transformers/models/olmo_hybrid/configuration_olmo_hybrid.py b/src/transformers/models/olmo_hybrid/configuration_olmo_hybrid.py index 1733f1e80efc..29a6f3a503fa 100644 --- a/src/transformers/models/olmo_hybrid/configuration_olmo_hybrid.py +++ b/src/transformers/models/olmo_hybrid/configuration_olmo_hybrid.py @@ -18,11 +18,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -from ...configuration_utils import PreTrainedConfig, layer_type_validation +from huggingface_hub.dataclasses import strict + +from ...configuration_utils import PreTrainedConfig +from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring +from ...utils.type_validators import interval @auto_docstring(checkpoint="allenai/Olmo-Hybrid-7B") +@strict(accept_kwargs=True) class OlmoHybridConfig(PreTrainedConfig): r""" linear_num_key_heads (`int`, *optional*): @@ -82,103 +87,67 @@ class OlmoHybridConfig(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size: int | None = 100352, - hidden_size: int | None = 3840, - intermediate_size: int | None = 11008, - num_hidden_layers: int | None = 32, - num_attention_heads: int | None = 30, - num_key_value_heads: int | None = None, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 65536, - initializer_range: float | None = 0.02, - use_cache: bool | None = True, - pad_token_id: int | None = 100277, - bos_token_id: int | None = None, - eos_token_id: int | None = 100257, - tie_word_embeddings: bool | None = False, - rope_parameters=None, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - rms_norm_eps: float | None = 1e-06, - layer_types: list[str] | None = None, - linear_num_key_heads: int | None = None, - linear_num_value_heads: int | None = None, - linear_key_head_dim: int | None = None, - linear_value_head_dim: int | None = None, - linear_a_log_min: float = 0.0, - linear_a_log_max: float = 16.0, - linear_dt_min: float = 0.001, - linear_dt_max: float = 0.1, - linear_dt_init_floor: float = 1e-4, - linear_conv_kernel_dim: int = 4, - linear_allow_neg_eigval: bool = True, - **kwargs, - ): - if layer_types is None: + vocab_size: int = 100352 + hidden_size: int = 3840 + intermediate_size: int = 11008 + num_hidden_layers: int = 32 + num_attention_heads: int = 30 + num_key_value_heads: int | None = None + hidden_act: str = "silu" + max_position_embeddings: int = 65536 + initializer_range: float = interval(min=0.0, max=1.0)(default=0.02) + rms_norm_eps: float = 1e-06 + use_cache: bool = True + pad_token_id: int | None = 100277 + bos_token_id: int | None = None + eos_token_id: int | None = 100257 + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + attention_dropout: int | float | None = 0.0 + layer_types: list[str] | None = None + linear_num_key_heads: int | None = None + linear_num_value_heads: int | None = None + linear_key_head_dim: int | None = None + linear_value_head_dim: int | None = None + linear_a_log_min: float = 0.0 + linear_a_log_max: float = 16.0 + linear_dt_min: float = 0.001 + linear_dt_max: float = 0.1 + linear_dt_init_floor: float = 1e-4 + linear_conv_kernel_dim: int = 4 + linear_allow_neg_eigval: bool = True + + def __post_init__(self, **kwargs): + if self.layer_types is None: # Default: linear attention for most layers, full attention every 
4th layer - layer_types = ["linear_attention"] * int(num_hidden_layers) - for i in range(int(num_hidden_layers)): + self.layer_types = ["linear_attention"] * int(self.num_hidden_layers) + for i in range(int(self.num_hidden_layers)): if i % 4 == 3: - layer_types[i] = "full_attention" + self.layer_types[i] = "full_attention" # Ensure at least one full attention layer for small num_hidden_layers - if "full_attention" not in layer_types: - layer_types[-1] = "full_attention" - - layer_type_validation(layer_types, num_hidden_layers) - if "linear_attention" not in layer_types: + if "full_attention" not in self.layer_types: + self.layer_types[-1] = "full_attention" + + if self.linear_num_key_heads is None: + self.linear_num_key_heads = self.num_attention_heads + if self.linear_num_value_heads is None: + self.linear_num_value_heads = self.num_attention_heads + if self.linear_key_head_dim is None: + self.linear_key_head_dim = int(0.75 * self.hidden_size / self.linear_num_key_heads) + if self.linear_value_head_dim is None: + self.linear_value_head_dim = 2 * self.linear_key_head_dim + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads + + super().__post_init__(**kwargs) + + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + if "linear_attention" not in self.layer_types: raise ValueError("OLMoHybrid expects at least one 'linear_attention' layer.") - if all(t == "linear_attention" for t in layer_types): + if all(t == "linear_attention" for t in self.layer_types): raise ValueError("OLMoHybrid expects at least one attention layer.") - self.layer_types = layer_types - - if linear_num_key_heads is None: - linear_num_key_heads = num_attention_heads - if linear_num_value_heads is None: - linear_num_value_heads = num_attention_heads - if linear_key_head_dim is None: - linear_key_head_dim = int(0.75 * hidden_size / linear_num_key_heads) - if linear_value_head_dim is None: - linear_value_head_dim = 2 * linear_key_head_dim - - self.linear_num_key_heads = linear_num_key_heads - self.linear_num_value_heads = linear_num_value_heads - self.linear_key_head_dim = linear_key_head_dim - self.linear_value_head_dim = linear_value_head_dim - self.linear_a_log_min = linear_a_log_min - self.linear_a_log_max = linear_a_log_max - self.linear_dt_min = linear_dt_min - self.linear_dt_max = linear_dt_max - self.linear_dt_init_floor = linear_dt_init_floor - self.linear_conv_kernel_dim = linear_conv_kernel_dim - self.linear_allow_neg_eigval = linear_allow_neg_eigval - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.rope_parameters = rope_parameters - - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - super().__init__(**kwargs) - __all__ = ["OlmoHybridConfig"] diff --git 
a/src/transformers/models/olmo_hybrid/modeling_olmo_hybrid.py b/src/transformers/models/olmo_hybrid/modeling_olmo_hybrid.py index f23bb8b42245..09fd0312b02c 100644 --- a/src/transformers/models/olmo_hybrid/modeling_olmo_hybrid.py +++ b/src/transformers/models/olmo_hybrid/modeling_olmo_hybrid.py @@ -53,6 +53,7 @@ FusedRMSNormGated = None ShortConvolution = None + logger = logging.get_logger(__name__) diff --git a/src/transformers/models/olmo_hybrid/modular_olmo_hybrid.py b/src/transformers/models/olmo_hybrid/modular_olmo_hybrid.py index 9e3dc808a57d..f9c9fc9dd1f3 100644 --- a/src/transformers/models/olmo_hybrid/modular_olmo_hybrid.py +++ b/src/transformers/models/olmo_hybrid/modular_olmo_hybrid.py @@ -20,11 +20,12 @@ import torch import torch.nn as nn import torch.nn.functional as F +from huggingface_hub.dataclasses import strict from ... import initialization as init from ...activations import ACT2FN from ...cache_utils import Cache -from ...configuration_utils import layer_type_validation +from ...configuration_utils import PreTrainedConfig from ...masking_utils import create_causal_mask from ...modeling_outputs import BaseModelOutputWithPast from ...modeling_rope_utils import dynamic_rope_update @@ -69,10 +70,12 @@ (ShortConvolution, chunk_gated_delta_rule, fused_recurrent_gated_delta_rule, FusedRMSNormGated) ) + logger = logging.get_logger(__name__) @auto_docstring(checkpoint="allenai/Olmo-Hybrid-7B") +@strict(accept_kwargs=True) class OlmoHybridConfig(LlamaConfig): r""" linear_num_key_heads (`int`, *optional*): @@ -126,104 +129,65 @@ class OlmoHybridConfig(LlamaConfig): "layers.*.mlp.down_proj": "rowwise", } - def __init__( - self, - vocab_size: int | None = 100352, - hidden_size: int | None = 3840, - intermediate_size: int | None = 11008, - num_hidden_layers: int | None = 32, - num_attention_heads: int | None = 30, - num_key_value_heads: int | None = None, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 65536, - initializer_range: float | None = 0.02, - use_cache: bool | None = True, - pad_token_id: int | None = 100277, - bos_token_id: int | None = None, - eos_token_id: int | None = 100257, - tie_word_embeddings: bool | None = False, - rope_parameters=None, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - rms_norm_eps: float | None = 1e-06, - layer_types: list[str] | None = None, - linear_num_key_heads: int | None = None, - linear_num_value_heads: int | None = None, - linear_key_head_dim: int | None = None, - linear_value_head_dim: int | None = None, - linear_a_log_min: float = 0.0, - linear_a_log_max: float = 16.0, - linear_dt_min: float = 0.001, - linear_dt_max: float = 0.1, - linear_dt_init_floor: float = 1e-4, - linear_conv_kernel_dim: int = 4, - linear_allow_neg_eigval: bool = True, - **kwargs, - ): - if layer_types is None: + vocab_size: int = 100352 + hidden_size: int = 3840 + intermediate_size: int = 11008 + num_hidden_layers: int = 32 + num_attention_heads: int = 30 + num_key_value_heads: int | None = None + max_position_embeddings: int = 65536 + pad_token_id: int | None = 100277 + bos_token_id: int | None = None + eos_token_id: int | None = 100257 + rms_norm_eps: float = 1e-06 + layer_types: list[str] | None = None + linear_num_key_heads: int | None = None + linear_num_value_heads: int | None = None + linear_key_head_dim: int | None = None + linear_value_head_dim: int | None = None + linear_a_log_min: float = 0.0 + linear_a_log_max: float = 16.0 + linear_dt_min: float = 0.001 + linear_dt_max: float = 0.1 + 
linear_dt_init_floor: float = 1e-4 + linear_conv_kernel_dim: int = 4 + linear_allow_neg_eigval: bool = True + + pretraining_tp = AttributeError() + mlp_bias = AttributeError() + head_dim = AttributeError() + + def __post_init__(self, **kwargs): + if self.layer_types is None: # Default: linear attention for most layers, full attention every 4th layer - layer_types = ["linear_attention"] * int(num_hidden_layers) - for i in range(int(num_hidden_layers)): + self.layer_types = ["linear_attention"] * int(self.num_hidden_layers) + for i in range(int(self.num_hidden_layers)): if i % 4 == 3: - layer_types[i] = "full_attention" + self.layer_types[i] = "full_attention" # Ensure at least one full attention layer for small num_hidden_layers - if "full_attention" not in layer_types: - layer_types[-1] = "full_attention" - - layer_type_validation(layer_types, num_hidden_layers) - if "linear_attention" not in layer_types: + if "full_attention" not in self.layer_types: + self.layer_types[-1] = "full_attention" + + if self.linear_num_key_heads is None: + self.linear_num_key_heads = self.num_attention_heads + if self.linear_num_value_heads is None: + self.linear_num_value_heads = self.num_attention_heads + if self.linear_key_head_dim is None: + self.linear_key_head_dim = int(0.75 * self.hidden_size / self.linear_num_key_heads) + if self.linear_value_head_dim is None: + self.linear_value_head_dim = 2 * self.linear_key_head_dim + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads + + PreTrainedConfig.__post_init__(**kwargs) + + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + if "linear_attention" not in self.layer_types: raise ValueError("OLMoHybrid expects at least one 'linear_attention' layer.") - if all(t == "linear_attention" for t in layer_types): + if all(t == "linear_attention" for t in self.layer_types): raise ValueError("OLMoHybrid expects at least one attention layer.") - self.layer_types = layer_types - - if linear_num_key_heads is None: - linear_num_key_heads = num_attention_heads - if linear_num_value_heads is None: - linear_num_value_heads = num_attention_heads - if linear_key_head_dim is None: - linear_key_head_dim = int(0.75 * hidden_size / linear_num_key_heads) - if linear_value_head_dim is None: - linear_value_head_dim = 2 * linear_key_head_dim - - self.linear_num_key_heads = linear_num_key_heads - self.linear_num_value_heads = linear_num_value_heads - self.linear_key_head_dim = linear_key_head_dim - self.linear_value_head_dim = linear_value_head_dim - self.linear_a_log_min = linear_a_log_min - self.linear_a_log_max = linear_a_log_max - self.linear_dt_min = linear_dt_min - self.linear_dt_max = linear_dt_max - self.linear_dt_init_floor = linear_dt_init_floor - self.linear_conv_kernel_dim = linear_conv_kernel_dim - self.linear_allow_neg_eigval = linear_allow_neg_eigval - - super().__init__( - vocab_size=vocab_size, - hidden_size=hidden_size, - intermediate_size=intermediate_size, - num_hidden_layers=num_hidden_layers, - num_attention_heads=num_attention_heads, - num_key_value_heads=num_key_value_heads, - hidden_act=hidden_act, - max_position_embeddings=max_position_embeddings, - initializer_range=initializer_range, - use_cache=use_cache, - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - attention_bias=attention_bias, - attention_dropout=attention_dropout, - rms_norm_eps=rms_norm_eps, - 
rope_parameters=rope_parameters, - **kwargs, - ) - del self.pretraining_tp - del self.mlp_bias - del self.head_dim - class OlmoHybridDynamicCache(Qwen3NextDynamicCache): """ diff --git a/src/transformers/models/olmoe/configuration_olmoe.py b/src/transformers/models/olmoe/configuration_olmoe.py index dbec7da51ec2..d9a837d9957b 100644 --- a/src/transformers/models/olmoe/configuration_olmoe.py +++ b/src/transformers/models/olmoe/configuration_olmoe.py @@ -11,12 +11,15 @@ # limitations under the License. """OLMoE model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring @auto_docstring(checkpoint="allenai/OLMoE-1B-7B-0924") +@strict(accept_kwargs=True) class OlmoeConfig(PreTrainedConfig): r""" clip_qkv (`float`, *optional*): @@ -40,65 +43,35 @@ class OlmoeConfig(PreTrainedConfig): keys_to_ignore_at_inference = ["past_key_values"] attribute_map = {"num_local_experts": "num_experts"} - def __init__( - self, - vocab_size: int | None = 50304, - hidden_size: int | None = 2048, - intermediate_size: int | None = 2048, - num_hidden_layers: int | None = 16, - num_attention_heads: int | None = 16, - num_key_value_heads: int | None = None, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 4096, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-05, - use_cache: bool | None = True, - pad_token_id: int | None = 1, - bos_token_id: int | None = None, - eos_token_id: int | None = 50279, - tie_word_embeddings: int | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - clip_qkv: bool | None = None, - num_experts_per_tok: int | None = 8, - num_experts: int | None = 64, - output_router_logits: bool | None = False, - router_aux_loss_coef: float | None = 0.01, - norm_topk_prob: bool | None = False, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.clip_qkv = clip_qkv - self.num_experts_per_tok = num_experts_per_tok - self.num_experts = num_experts - self.output_router_logits = output_router_logits - self.router_aux_loss_coef = router_aux_loss_coef - self.norm_topk_prob = norm_topk_prob - self.rope_parameters = rope_parameters + vocab_size: int = 50304 + hidden_size: int = 2048 + intermediate_size: int = 2048 + num_hidden_layers: int = 16 + num_attention_heads: int = 16 + num_key_value_heads: int | None = None + hidden_act: str = "silu" + max_position_embeddings: int = 4096 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-05 + use_cache: bool = True + pad_token_id: int | None = 1 + bos_token_id: int | None = None + eos_token_id: int | list[int] | None = 50279 + tie_word_embeddings: int = False + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + 
attention_dropout: float | int = 0.0 + clip_qkv: float | None = None + num_experts_per_tok: int = 8 + num_experts: int = 64 + output_router_logits: bool = False + router_aux_loss_coef: float = 0.01 + norm_topk_prob: bool = False - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - super().__init__(**kwargs) + def __post_init__(self, **kwargs): + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads + super().__post_init__(**kwargs) __all__ = ["OlmoeConfig"] diff --git a/src/transformers/models/omdet_turbo/configuration_omdet_turbo.py b/src/transformers/models/omdet_turbo/configuration_omdet_turbo.py index 794174931202..145a5071320e 100644 --- a/src/transformers/models/omdet_turbo/configuration_omdet_turbo.py +++ b/src/transformers/models/omdet_turbo/configuration_omdet_turbo.py @@ -13,6 +13,10 @@ # limitations under the License. """OmDet-Turbo model configuration""" +from typing import Literal + +from huggingface_hub.dataclasses import strict + from ...backbone_utils import consolidate_backbone_kwargs_to_config from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring, logging @@ -23,6 +27,7 @@ @auto_docstring(checkpoint="omlab/omdet-turbo-swin-tiny-hf") +@strict(accept_kwargs=True) class OmDetTurboConfig(PreTrainedConfig): r""" apply_layernorm_after_vision_backbone (`bool`, *optional*, defaults to `True`): @@ -102,126 +107,78 @@ class OmDetTurboConfig(PreTrainedConfig): "num_attention_heads": "encoder_attention_heads", } - def __init__( - self, - text_config=None, - backbone_config=None, - apply_layernorm_after_vision_backbone=True, - image_size=640, - disable_custom_kernels=False, - layer_norm_eps=1e-5, - batch_norm_eps=1e-5, - init_std=0.02, - text_projection_in_dim=512, - text_projection_out_dim=512, - task_encoder_hidden_dim=1024, - class_embed_dim=512, - class_distance_type="cosine", - num_queries=900, - csp_activation="silu", - conv_norm_activation="gelu", - encoder_feedforward_activation="relu", - encoder_feedforward_dropout=0.0, - encoder_dropout=0.0, - hidden_expansion=1, - vision_features_channels=[256, 256, 256], - encoder_hidden_dim=256, - encoder_in_channels=[192, 384, 768], - encoder_projection_indices=[2], - encoder_attention_heads=8, - encoder_dim_feedforward=2048, - encoder_layers=1, - positional_encoding_temperature=10000, - num_feature_levels=3, - decoder_hidden_dim=256, - decoder_num_heads=8, - decoder_num_layers=6, - decoder_activation="relu", - decoder_dim_feedforward=2048, - decoder_num_points=4, - decoder_dropout=0.0, - eval_size=None, - learn_initial_query=False, - cache_size=100, - is_encoder_decoder=True, - **kwargs, - ): + text_config: dict | PreTrainedConfig | None = None + backbone_config: dict | PreTrainedConfig | None = None + apply_layernorm_after_vision_backbone: bool = True + image_size: int | list[int] | tuple[int, int] = 640 + disable_custom_kernels: bool = False + layer_norm_eps: float = 1e-5 + batch_norm_eps: float = 1e-5 + init_std: float = 0.02 + text_projection_in_dim: int = 512 + text_projection_out_dim: int = 512 + task_encoder_hidden_dim: int = 1024 + class_embed_dim: int = 512 + class_distance_type: Literal["cosine", "dot"] = "cosine" + num_queries: int = 900 + csp_activation: str = "silu" + conv_norm_activation: str = "gelu" + encoder_feedforward_activation: str = "relu" + encoder_feedforward_dropout: float | int = 0.0 + encoder_dropout: float | int = 0.0 + hidden_expansion: 
int = 1 + encoder_hidden_dim: int = 256 + vision_features_channels: list[int] | tuple[int, ...] = (256, 256, 256) + encoder_in_channels: list[int] | tuple[int, ...] = (192, 384, 768) + encoder_projection_indices: list[int] | tuple[int, ...] = (2,) + encoder_attention_heads: int = 8 + encoder_dim_feedforward: int = 2048 + encoder_layers: int = 1 + positional_encoding_temperature: int = 10000 + num_feature_levels: int = 3 + decoder_hidden_dim: int = 256 + decoder_num_heads: int = 8 + decoder_num_layers: int = 6 + decoder_activation: str = "relu" + decoder_dim_feedforward: int = 2048 + decoder_num_points: int = 4 + decoder_dropout: float | int = 0.0 + eval_size: int | None = None + learn_initial_query: int = False + cache_size: int = 100 + is_encoder_decoder: bool = True + + def __post_init__(self, **kwargs): # Init timm backbone with hardcoded values for BC timm_default_kwargs = { "out_indices": [1, 2, 3], - "img_size": image_size, + "img_size": self.image_size, "always_partition": True, } - backbone_config, kwargs = consolidate_backbone_kwargs_to_config( - backbone_config=backbone_config, + self.backbone_config, kwargs = consolidate_backbone_kwargs_to_config( + backbone_config=self.backbone_config, default_backbone="swin_tiny_patch4_window7_224", default_config_type="swin", - default_config_kwargs={"image_size": image_size, "out_indices": [2, 3, 4]}, + default_config_kwargs={"image_size": self.image_size, "out_indices": [2, 3, 4]}, timm_default_kwargs=timm_default_kwargs, **kwargs, ) # Extract timm.create_model kwargs; TimmBackbone doesn't forward arbitrary config attrs to timm - timm_kwargs = {} - if getattr(backbone_config, "model_type", None) == "timm_backbone": + self.timm_kwargs = {} + if getattr(self.backbone_config, "model_type", None) == "timm_backbone": for attr in ("img_size", "always_partition"): - if hasattr(backbone_config, attr): - timm_kwargs[attr] = getattr(backbone_config, attr) + if hasattr(self.backbone_config, attr): + self.timm_kwargs[attr] = getattr(self.backbone_config, attr) - if text_config is None: + if self.text_config is None: logger.info("`text_config` is `None`. Initializing the config with the default `clip_text_model`") - text_config = CONFIG_MAPPING["clip_text_model"]() - elif isinstance(text_config, dict): - text_model_type = text_config.get("model_type") - text_config = CONFIG_MAPPING[text_model_type](**text_config) - - if class_distance_type not in ["cosine", "dot"]: - raise ValueError( - f"Invalid `class_distance_type`. It should be either `cosine` or `dot`, but got {class_distance_type}." 
- ) - - self.text_config = text_config - self.backbone_config = backbone_config - self.apply_layernorm_after_vision_backbone = apply_layernorm_after_vision_backbone - self.image_size = image_size - self.disable_custom_kernels = disable_custom_kernels - self.layer_norm_eps = layer_norm_eps - self.batch_norm_eps = batch_norm_eps - self.init_std = init_std - self.text_projection_in_dim = text_projection_in_dim - self.text_projection_out_dim = text_projection_out_dim - self.task_encoder_hidden_dim = task_encoder_hidden_dim - self.class_embed_dim = class_embed_dim - self.class_distance_type = class_distance_type - self.num_queries = num_queries - self.csp_activation = csp_activation - self.conv_norm_activation = conv_norm_activation - self.encoder_feedforward_activation = encoder_feedforward_activation - self.encoder_feedforward_dropout = encoder_feedforward_dropout - self.encoder_dropout = encoder_dropout - self.hidden_expansion = hidden_expansion - self.vision_features_channels = vision_features_channels - self.encoder_hidden_dim = encoder_hidden_dim - self.encoder_in_channels = encoder_in_channels - self.encoder_projection_indices = encoder_projection_indices - self.encoder_attention_heads = encoder_attention_heads - self.encoder_dim_feedforward = encoder_dim_feedforward - self.encoder_layers = encoder_layers - self.positional_encoding_temperature = positional_encoding_temperature - self.num_feature_levels = num_feature_levels - self.decoder_hidden_dim = decoder_hidden_dim - self.decoder_num_heads = decoder_num_heads - self.decoder_num_layers = decoder_num_layers - self.decoder_activation = decoder_activation - self.decoder_dim_feedforward = decoder_dim_feedforward - self.decoder_num_points = decoder_num_points - self.decoder_dropout = decoder_dropout - self.eval_size = eval_size - self.learn_initial_query = learn_initial_query - self.cache_size = cache_size - self.timm_kwargs = timm_kwargs - - super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + self.text_config = CONFIG_MAPPING["clip_text_model"]() + elif isinstance(self.text_config, dict): + text_model_type = self.text_config.get("model_type") + self.text_config = CONFIG_MAPPING[text_model_type](**self.text_config) + + super().__post_init__(**kwargs) def to_dict(self): output = super().to_dict() diff --git a/src/transformers/models/omdet_turbo/modeling_omdet_turbo.py b/src/transformers/models/omdet_turbo/modeling_omdet_turbo.py index 1819c7aa873d..d06b94edcc9f 100644 --- a/src/transformers/models/omdet_turbo/modeling_omdet_turbo.py +++ b/src/transformers/models/omdet_turbo/modeling_omdet_turbo.py @@ -743,7 +743,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict hidden_states = inputs_embeddings @@ -1341,7 +1341,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict vision_features, vision_shapes, vision_shapes_list, level_start_index = self._get_encoder_input( vision_features @@ -1588,7 +1588,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else 
self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict loss = None image_features = self.vision_backbone(pixel_values) diff --git a/src/transformers/models/oneformer/configuration_oneformer.py b/src/transformers/models/oneformer/configuration_oneformer.py index ed979236000a..9ddb5504e97b 100644 --- a/src/transformers/models/oneformer/configuration_oneformer.py +++ b/src/transformers/models/oneformer/configuration_oneformer.py @@ -13,16 +13,16 @@ # limitations under the License. """OneFormer model configuration""" +from huggingface_hub.dataclasses import strict + from ...backbone_utils import consolidate_backbone_kwargs_to_config from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging +from ...utils import auto_docstring from ..auto import AutoConfig -logger = logging.get_logger(__name__) - - @auto_docstring(checkpoint="shi-labs/oneformer_ade20k_swin_tiny") +@strict(accept_kwargs=True) class OneFormerConfig(PreTrainedConfig): r""" ignore_value (`int`, *optional*, defaults to 255): @@ -97,55 +97,53 @@ class OneFormerConfig(PreTrainedConfig): model_type = "oneformer" sub_configs = {"backbone_config": AutoConfig} - attribute_map = {"hidden_size": "hidden_dim"} + attribute_map = {"hidden_size": "hidden_dim", "num_hidden_layers": "decoder_layers"} - def __init__( - self, - backbone_config: dict | PreTrainedConfig | None = None, - ignore_value: int = 255, - num_queries: int = 150, - no_object_weight: int = 0.1, - class_weight: float = 2.0, - mask_weight: float = 5.0, - dice_weight: float = 5.0, - contrastive_weight: float = 0.5, - contrastive_temperature: float = 0.07, - train_num_points: int = 12544, - oversample_ratio: float = 3.0, - importance_sample_ratio: float = 0.75, - init_std: float = 0.02, - init_xavier_std: float = 1.0, - layer_norm_eps: float = 1e-05, - is_training: bool = False, - use_auxiliary_loss: bool = True, - output_auxiliary_logits: bool = True, - strides: list | None = [4, 8, 16, 32], - task_seq_len: int = 77, - text_encoder_width: int = 256, - text_encoder_context_length: int = 77, - text_encoder_num_layers: int = 6, - text_encoder_vocab_size: int = 49408, - text_encoder_proj_layers: int = 2, - text_encoder_n_ctx: int = 16, - conv_dim: int = 256, - mask_dim: int = 256, - hidden_dim: int = 256, - encoder_feedforward_dim: int = 1024, - norm: str = "GN", - encoder_layers: int = 6, - decoder_layers: int = 10, - use_task_norm: bool = True, - num_attention_heads: int = 8, - dropout: float = 0.1, - dim_feedforward: int = 2048, - pre_norm: bool = False, - enforce_input_proj: bool = False, - query_dec_layers: int = 2, - common_stride: int = 4, - **kwargs, - ): - backbone_config, kwargs = consolidate_backbone_kwargs_to_config( - backbone_config=backbone_config, + backbone_config: dict | PreTrainedConfig | None = None + ignore_value: int = 255 + num_queries: int = 150 + no_object_weight: float = 0.1 + class_weight: float = 2.0 + mask_weight: float = 5.0 + dice_weight: float = 5.0 + contrastive_weight: float = 0.5 + contrastive_temperature: float = 0.07 + train_num_points: int = 12544 + oversample_ratio: float = 3.0 + importance_sample_ratio: float = 0.75 + init_std: float = 0.02 + init_xavier_std: float = 1.0 + layer_norm_eps: float = 1e-05 + is_training: bool = False + use_auxiliary_loss: bool = True + output_auxiliary_logits: bool = True + strides: list[int] | tuple[int, ...] 
= (4, 8, 16, 32) + task_seq_len: int = 77 + text_encoder_width: int = 256 + text_encoder_context_length: int = 77 + text_encoder_num_layers: int = 6 + text_encoder_vocab_size: int = 49408 + text_encoder_proj_layers: int = 2 + text_encoder_n_ctx: int = 16 + conv_dim: int = 256 + mask_dim: int = 256 + hidden_dim: int = 256 + encoder_feedforward_dim: int = 1024 + norm: str = "GN" + encoder_layers: int = 6 + decoder_layers: int = 10 + use_task_norm: bool = True + num_attention_heads: int = 8 + dropout: float | int = 0.1 + dim_feedforward: int = 2048 + pre_norm: bool = False + enforce_input_proj: bool = False + query_dec_layers: int = 2 + common_stride: int = 4 + + def __post_init__(self, **kwargs): + self.backbone_config, kwargs = consolidate_backbone_kwargs_to_config( + backbone_config=self.backbone_config, default_config_type="swin", default_config_kwargs={ "drop_path_rate": 0.3, @@ -154,50 +152,7 @@ def __init__( **kwargs, ) - self.backbone_config = backbone_config - self.ignore_value = ignore_value - self.num_queries = num_queries - self.no_object_weight = no_object_weight - self.class_weight = class_weight - self.mask_weight = mask_weight - self.dice_weight = dice_weight - self.contrastive_weight = contrastive_weight - self.contrastive_temperature = contrastive_temperature - self.train_num_points = train_num_points - self.oversample_ratio = oversample_ratio - self.importance_sample_ratio = importance_sample_ratio - self.init_std = init_std - self.init_xavier_std = init_xavier_std - self.layer_norm_eps = layer_norm_eps - self.is_training = is_training - self.use_auxiliary_loss = use_auxiliary_loss - self.output_auxiliary_logits = output_auxiliary_logits - self.strides = strides - self.task_seq_len = task_seq_len - self.text_encoder_width = text_encoder_width - self.text_encoder_context_length = text_encoder_context_length - self.text_encoder_num_layers = text_encoder_num_layers - self.text_encoder_vocab_size = text_encoder_vocab_size - self.text_encoder_proj_layers = text_encoder_proj_layers - self.text_encoder_n_ctx = text_encoder_n_ctx - self.conv_dim = conv_dim - self.mask_dim = mask_dim - self.hidden_dim = hidden_dim - self.encoder_feedforward_dim = encoder_feedforward_dim - self.norm = norm - self.encoder_layers = encoder_layers - self.decoder_layers = decoder_layers - self.use_task_norm = use_task_norm - self.num_attention_heads = num_attention_heads - self.dropout = dropout - self.dim_feedforward = dim_feedforward - self.pre_norm = pre_norm - self.enforce_input_proj = enforce_input_proj - self.query_dec_layers = query_dec_layers - self.common_stride = common_stride - self.num_hidden_layers = decoder_layers - - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["OneFormerConfig"] diff --git a/src/transformers/models/oneformer/modeling_oneformer.py b/src/transformers/models/oneformer/modeling_oneformer.py index 2b4433e52fa9..57214df16d82 100644 --- a/src/transformers/models/oneformer/modeling_oneformer.py +++ b/src/transformers/models/oneformer/modeling_oneformer.py @@ -1210,7 +1210,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict hidden_states = inputs_embeds reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=inputs_embeds.device) @@ -2887,7 +2887,7 @@ def forward( 
output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict batch_size, _, height, width = pixel_values.shape @@ -3125,7 +3125,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.model( pixel_values=pixel_values, diff --git a/src/transformers/models/openai/configuration_openai.py b/src/transformers/models/openai/configuration_openai.py index dfb684e01a62..f2ed88b66a0c 100644 --- a/src/transformers/models/openai/configuration_openai.py +++ b/src/transformers/models/openai/configuration_openai.py @@ -14,14 +14,14 @@ # limitations under the License. """OpenAI GPT configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="openai-community/openai-gpt") +@strict(accept_kwargs=True) class OpenAIGPTConfig(PreTrainedConfig): """ afn (`str` or `Callable`, *optional*, defaults to `"gelu"`): @@ -55,7 +55,6 @@ class OpenAIGPTConfig(PreTrainedConfig): [`OpenAIGPTDoubleHeadsModel`]. The dropout ratio to be used after the projection and activation. - Examples: ```python @@ -79,51 +78,26 @@ class OpenAIGPTConfig(PreTrainedConfig): "num_hidden_layers": "n_layer", } - def __init__( - self, - vocab_size=40478, - n_positions=512, - n_embd=768, - n_layer=12, - n_head=12, - afn="gelu", - resid_pdrop=0.1, - embd_pdrop=0.1, - attn_pdrop=0.1, - layer_norm_epsilon=1e-5, - initializer_range=0.02, - summary_type="cls_index", - summary_use_proj=True, - summary_activation=None, - summary_proj_to_labels=True, - summary_first_dropout=0.1, - pad_token_id=None, - bos_token_id=None, - eos_token_id=None, - tie_word_embeddings=True, - **kwargs, - ): - self.vocab_size = vocab_size - self.n_positions = n_positions - self.n_embd = n_embd - self.n_layer = n_layer - self.n_head = n_head - self.afn = afn - self.resid_pdrop = resid_pdrop - self.embd_pdrop = embd_pdrop - self.attn_pdrop = attn_pdrop - self.layer_norm_epsilon = layer_norm_epsilon - self.initializer_range = initializer_range - self.summary_type = summary_type - self.summary_use_proj = summary_use_proj - self.summary_activation = summary_activation - self.summary_first_dropout = summary_first_dropout - self.summary_proj_to_labels = summary_proj_to_labels - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - super().__init__(**kwargs) + vocab_size: int = 40478 + n_positions: int = 512 + n_embd: int = 768 + n_layer: int = 12 + n_head: int = 12 + afn: str = "gelu" + resid_pdrop: float = 0.1 + embd_pdrop: float = 0.1 + attn_pdrop: float = 0.1 + layer_norm_epsilon: float = 1e-5 + initializer_range: float = 0.02 + summary_type: str = "cls_index" + summary_use_proj: bool = True + summary_activation: str | None = None + summary_proj_to_labels: bool = True + summary_first_dropout: float | int = 0.1 + pad_token_id: int | 
None = None + bos_token_id: int | None = None + eos_token_id: int | list[int] | None = None + tie_word_embeddings: bool = True __all__ = ["OpenAIGPTConfig"] diff --git a/src/transformers/models/openai/modeling_openai.py b/src/transformers/models/openai/modeling_openai.py index 51a6c3ff0e01..0fc89733b282 100644 --- a/src/transformers/models/openai/modeling_openai.py +++ b/src/transformers/models/openai/modeling_openai.py @@ -334,7 +334,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -445,7 +445,7 @@ def forward( `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict transformer_outputs = self.transformer( input_ids, @@ -561,7 +561,7 @@ def forward( >>> lm_logits = outputs.logits >>> mc_logits = outputs.mc_logits ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict transformer_outputs = self.transformer( input_ids, @@ -645,7 +645,7 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict transformer_outputs = self.transformer( input_ids, diff --git a/src/transformers/models/opt/configuration_opt.py b/src/transformers/models/opt/configuration_opt.py index 426179e496b7..f377b9c84134 100644 --- a/src/transformers/models/opt/configuration_opt.py +++ b/src/transformers/models/opt/configuration_opt.py @@ -13,14 +13,14 @@ # limitations under the License. 
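Alongside the config rewrites, the modeling files in this patch all swap `self.config.use_return_dict` for `self.config.return_dict`; the per-call fallback idiom itself is unchanged. A minimal sketch of that idiom with a hypothetical helper and config stub (not transformers API):

```python
# An explicit return_dict argument wins; otherwise fall back to the value
# stored on the config (now read as `config.return_dict`).
def resolve_return_dict(return_dict, config):
    return return_dict if return_dict is not None else config.return_dict


class _ConfigStub:
    return_dict = True


assert resolve_return_dict(None, _ConfigStub()) is True
assert resolve_return_dict(False, _ConfigStub()) is False
```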
"""OPT model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="facebook/opt-350m") +@strict(accept_kwargs=True) class OPTConfig(PreTrainedConfig): r""" do_layer_norm_before (`bool`, *optional*, defaults to `True`): @@ -53,58 +53,33 @@ class OPTConfig(PreTrainedConfig): model_type = "opt" keys_to_ignore_at_inference = ["past_key_values"] - def __init__( - self, - vocab_size=50272, - hidden_size=768, - num_hidden_layers=12, - ffn_dim=3072, - max_position_embeddings=2048, - do_layer_norm_before=True, - _remove_final_layer_norm=False, - word_embed_proj_dim=None, - dropout=0.1, - attention_dropout=0.0, - num_attention_heads=12, - activation_function="relu", - layerdrop=0.0, - init_std=0.02, - use_cache=True, - pad_token_id=1, - bos_token_id=2, - eos_token_id=2, - enable_bias=True, - layer_norm_elementwise_affine=True, - tie_word_embeddings=True, - **kwargs, - ): - super().__init__(**kwargs) - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.num_attention_heads = num_attention_heads - self.word_embed_proj_dim = word_embed_proj_dim if word_embed_proj_dim is not None else hidden_size - self.ffn_dim = ffn_dim - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.dropout = dropout - self.attention_dropout = attention_dropout - self.activation_function = activation_function - self.init_std = init_std - self.layerdrop = layerdrop - self.use_cache = use_cache - self.do_layer_norm_before = do_layer_norm_before - # We keep these variables at `True` for backward compatibility. 
- self.enable_bias = enable_bias - self.layer_norm_elementwise_affine = layer_norm_elementwise_affine + vocab_size: int = 50272 + hidden_size: int = 768 + num_hidden_layers: int = 12 + ffn_dim: int = 3072 + max_position_embeddings: int = 2048 + do_layer_norm_before: bool = True + _remove_final_layer_norm: bool = False + word_embed_proj_dim: int | None = None + dropout: float | int = 0.1 + attention_dropout: float | int = 0.0 + num_attention_heads: int = 12 + activation_function: str = "relu" + layerdrop: float | int = 0.0 + init_std: float = 0.02 + use_cache: bool = True + pad_token_id: int | None = 1 + bos_token_id: int | None = 2 + eos_token_id: int | None = 2 + enable_bias: bool = True + layer_norm_elementwise_affine: bool = True + tie_word_embeddings: bool = True - # Note that the only purpose of `_remove_final_layer_norm` is to keep backward compatibility - # with checkpoints that have been fine-tuned before transformers v4.20.1 - # see https://github.com/facebookresearch/metaseq/pull/164 - self._remove_final_layer_norm = _remove_final_layer_norm + def __post_init__(self, **kwargs): + self.word_embed_proj_dim = ( + self.word_embed_proj_dim if self.word_embed_proj_dim is not None else self.hidden_size + ) + super().__post_init__(**kwargs) __all__ = ["OPTConfig"] diff --git a/src/transformers/models/ovis2/configuration_ovis2.py b/src/transformers/models/ovis2/configuration_ovis2.py index 921bba84f39d..63793428f397 100644 --- a/src/transformers/models/ovis2/configuration_ovis2.py +++ b/src/transformers/models/ovis2/configuration_ovis2.py @@ -12,12 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. + +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring from ..qwen2.configuration_qwen2 import Qwen2Config @auto_docstring(checkpoint="thisisiron/Ovis2-1B-hf") +@strict(accept_kwargs=True) class Ovis2VisionConfig(PreTrainedConfig): r""" hidden_stride (`int`, *optional*, defaults to 1): @@ -30,50 +34,27 @@ class Ovis2VisionConfig(PreTrainedConfig): base_config_key = "vision_config" - def __init__( - self, - hidden_size: int = 1024, - intermediate_size: int = 2816, - num_hidden_layers: int = 24, - num_attention_heads: int = 8, - num_channels: int = 3, - image_size: int = 224, - patch_size: int = 14, - rms_norm_eps: float = 1e-5, - attention_dropout: float = 0.0, - qkv_bias: bool = False, - mlp_bias: bool = False, - hidden_act="silu", - vocab_size=16384, - hidden_stride=1, - num_visual_indicator_tokens=5, - initializer_range=0.02, - tokenize_function="softmax", - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_channels = num_channels - self.patch_size = patch_size - self.image_size = image_size - - self.attention_dropout = attention_dropout - self.hidden_act = hidden_act - self.qkv_bias = qkv_bias - self.mlp_bias = mlp_bias - self.rms_norm_eps = rms_norm_eps - self.vocab_size = vocab_size - self.hidden_stride = hidden_stride - self.num_visual_indicator_tokens = num_visual_indicator_tokens - self.tokenize_function = tokenize_function - self.initializer_range = initializer_range + hidden_size: int = 1024 + intermediate_size: int = 2816 + num_hidden_layers: int = 24 + num_attention_heads: int = 8 + num_channels: int = 3 + image_size: int | list[int] | tuple[int, int] = 224 + 
patch_size: int | list[int] | tuple[int, int] = 14 + rms_norm_eps: float = 1e-5 + attention_dropout: float | int = 0.0 + qkv_bias: bool = False + mlp_bias: bool = False + hidden_act: str = "silu" + vocab_size: int = 16384 + hidden_stride: int = 1 + num_visual_indicator_tokens: int = 5 + initializer_range: float = 0.02 + tokenize_function: str = "softmax" @auto_docstring(checkpoint="thisisiron/Ovis2-1B-hf") +@strict(accept_kwargs=True) class Ovis2Config(PreTrainedConfig): r""" visual_indicator_token_ids (`List[int]`, *optional*, defaults to `[151666, 151667, 151668, 151669, 151670]`): @@ -96,37 +77,26 @@ class Ovis2Config(PreTrainedConfig): model_type = "ovis2" sub_configs = {"text_config": Qwen2Config, "vision_config": Ovis2VisionConfig} - def __init__( - self, - vision_config=None, - text_config=None, - image_token_id=151665, - visual_indicator_token_ids=[151666, 151667, 151668, 151669, 151670], - vocab_size=151643, - hidden_size=1536, - tie_word_embeddings=True, - **kwargs, - ): - if isinstance(vision_config, dict): - self.vision_config = Ovis2VisionConfig(**vision_config) - elif isinstance(vision_config, Ovis2VisionConfig): - self.vision_config = vision_config - if vision_config is None: - self.vision_config = Ovis2VisionConfig(num_visual_indicator_tokens=len(visual_indicator_token_ids)) - - if isinstance(text_config, dict): - self.text_config = Qwen2Config(**text_config) - elif isinstance(text_config, Qwen2Config): - self.text_config = text_config - elif text_config is None: + vision_config: dict | PreTrainedConfig | None = None + text_config: dict | PreTrainedConfig | None = None + image_token_id: int = 151665 + visual_indicator_token_ids: list[int] | tuple[int, ...] = (151666, 151667, 151668, 151669, 151670) + vocab_size: int = 151643 + hidden_size: int = 1536 + tie_word_embeddings: bool = True + + def __post_init__(self, **kwargs): + if isinstance(self.vision_config, dict): + self.vision_config = Ovis2VisionConfig(**self.vision_config) + if self.vision_config is None: + self.vision_config = Ovis2VisionConfig(num_visual_indicator_tokens=len(self.visual_indicator_token_ids)) + + if isinstance(self.text_config, dict): + self.text_config = Qwen2Config(**self.text_config) + elif self.text_config is None: self.text_config = Qwen2Config() - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.image_token_id = image_token_id - self.visual_indicator_token_ids = visual_indicator_token_ids - self.tie_word_embeddings = tie_word_embeddings - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["Ovis2VisionConfig", "Ovis2Config"] diff --git a/src/transformers/models/owlv2/configuration_owlv2.py b/src/transformers/models/owlv2/configuration_owlv2.py index 19374821dcce..439e3f2a8848 100644 --- a/src/transformers/models/owlv2/configuration_owlv2.py +++ b/src/transformers/models/owlv2/configuration_owlv2.py @@ -13,6 +13,8 @@ # limitations under the License. 
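Editorial note: the `Ovis2Config.__post_init__` above normalizes sub-configs that may arrive as a dict, an instance, or `None`. A self-contained sketch of that pattern with plain-dataclass stand-ins (`VisionCfg`/`CompositeCfg` are illustrative names, not the real Ovis2 classes):

```python
from dataclasses import dataclass


@dataclass
class VisionCfg:
    # illustrative stand-in, not the real Ovis2VisionConfig
    hidden_size: int = 1024
    num_visual_indicator_tokens: int = 5


@dataclass
class CompositeCfg:
    # accepts a dict, an instance, or None, like the composite configs above
    vision_config: dict | VisionCfg | None = None
    visual_indicator_token_ids: tuple[int, ...] = (1, 2, 3, 4, 5)

    def __post_init__(self):
        if isinstance(self.vision_config, dict):
            self.vision_config = VisionCfg(**self.vision_config)
        if self.vision_config is None:
            self.vision_config = VisionCfg(
                num_visual_indicator_tokens=len(self.visual_indicator_token_ids)
            )


assert CompositeCfg(vision_config={"hidden_size": 512}).vision_config.hidden_size == 512
assert CompositeCfg().vision_config.num_visual_indicator_tokens == 5
```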
"""OWLv2 model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring, logging @@ -20,6 +22,7 @@ logger = logging.get_logger(__name__) +@strict(accept_kwargs=True) @auto_docstring(checkpoint="google/owlv2-base-patch16") # Copied from transformers.models.owlvit.configuration_owlvit.OwlViTTextConfig with OwlViT->Owlv2, owlvit-base-patch32->owlv2-base-patch16, owlvit->owlv2, OWL-ViT->OWLv2 class Owlv2TextConfig(PreTrainedConfig): @@ -42,42 +45,23 @@ class Owlv2TextConfig(PreTrainedConfig): model_type = "owlv2_text_model" base_config_key = "text_config" - def __init__( - self, - vocab_size=49408, - hidden_size=512, - intermediate_size=2048, - num_hidden_layers=12, - num_attention_heads=8, - max_position_embeddings=16, - hidden_act="quick_gelu", - layer_norm_eps=1e-5, - attention_dropout=0.0, - initializer_range=0.02, - initializer_factor=1.0, - pad_token_id=0, - bos_token_id=49406, - eos_token_id=49407, - **kwargs, - ): - super().__init__(**kwargs) - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.max_position_embeddings = max_position_embeddings - self.hidden_act = hidden_act - self.layer_norm_eps = layer_norm_eps - self.attention_dropout = attention_dropout - self.initializer_range = initializer_range - self.initializer_factor = initializer_factor - - + vocab_size: int = 49408 + hidden_size: int = 512 + intermediate_size: int = 2048 + num_hidden_layers: int = 12 + num_attention_heads: int = 8 + max_position_embeddings: int = 16 + hidden_act: str = "quick_gelu" + layer_norm_eps: float = 1e-5 + attention_dropout: float | int = 0.0 + initializer_range: float = 0.02 + initializer_factor: float = 1.0 + pad_token_id: int | None = 0 + bos_token_id: int | None = 49406 + eos_token_id: int | None = 49407 + + +@strict(accept_kwargs=True) @auto_docstring(checkpoint="google/owlv2-base-patch16") # Copied from transformers.models.owlvit.configuration_owlvit.OwlViTVisionConfig with OwlViT->Owlv2, owlvit-base-patch32->owlv2-base-patch16, owlvit->owlv2, OWL-ViT->OWLv2, 32->16 class Owlv2VisionConfig(PreTrainedConfig): @@ -100,71 +84,48 @@ class Owlv2VisionConfig(PreTrainedConfig): model_type = "owlv2_vision_model" base_config_key = "vision_config" - def __init__( - self, - hidden_size=768, - intermediate_size=3072, - num_hidden_layers=12, - num_attention_heads=12, - num_channels=3, - image_size=768, - patch_size=16, - hidden_act="quick_gelu", - layer_norm_eps=1e-5, - attention_dropout=0.0, - initializer_range=0.02, - initializer_factor=1.0, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_channels = num_channels - self.image_size = image_size - self.patch_size = patch_size - self.hidden_act = hidden_act - self.layer_norm_eps = layer_norm_eps - self.attention_dropout = attention_dropout - self.initializer_range = initializer_range - self.initializer_factor = initializer_factor - - + hidden_size: int = 768 + intermediate_size: int = 3072 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + num_channels: int = 3 + image_size: int | list[int] | tuple[int, int] = 
768 + patch_size: int | list[int] | tuple[int, int] = 16 + hidden_act: str = "quick_gelu" + layer_norm_eps: float = 1e-5 + attention_dropout: float | int = 0.0 + initializer_range: float = 0.02 + initializer_factor: float = 1.0 + + +@strict(accept_kwargs=True) @auto_docstring(checkpoint="google/owlv2-base-patch16") # Copied from transformers.models.owlvit.configuration_owlvit.OwlViTConfig with OwlViT->Owlv2, owlvit-base-patch32->owlv2-base-patch16, owlvit->owlv2, OWL-ViT->OWLv2 class Owlv2Config(PreTrainedConfig): model_type = "owlv2" sub_configs = {"text_config": Owlv2TextConfig, "vision_config": Owlv2VisionConfig} - def __init__( - self, - text_config=None, - vision_config=None, - projection_dim=512, - logit_scale_init_value=2.6592, - **kwargs, - ): - if text_config is None: - text_config = Owlv2TextConfig() + text_config: dict | PreTrainedConfig | None = None + vision_config: dict | PreTrainedConfig | None = None + projection_dim: int = 512 + logit_scale_init_value: float = 2.6592 + return_dict: bool = True + initializer_factor: float = 1.0 + + def __post_init__(self, **kwargs): + if self.text_config is None: + self.text_config = Owlv2TextConfig() logger.info("`text_config` is `None`. initializing the `Owlv2TextConfig` with default values.") - elif isinstance(text_config, dict): - text_config = Owlv2TextConfig(**text_config) + elif isinstance(self.text_config, dict): + self.text_config = Owlv2TextConfig(**self.text_config) - if vision_config is None: - vision_config = Owlv2VisionConfig() + if self.vision_config is None: + self.vision_config = Owlv2VisionConfig() logger.info("`vision_config` is `None`. initializing the `Owlv2VisionConfig` with default values.") - elif isinstance(vision_config, dict): - vision_config = Owlv2VisionConfig(**vision_config) - - self.text_config = text_config - self.vision_config = vision_config + elif isinstance(self.vision_config, dict): + self.vision_config = Owlv2VisionConfig(**self.vision_config) - self.projection_dim = projection_dim - self.logit_scale_init_value = logit_scale_init_value - self.initializer_factor = 1.0 - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["Owlv2Config", "Owlv2TextConfig", "Owlv2VisionConfig"] diff --git a/src/transformers/models/owlvit/configuration_owlvit.py b/src/transformers/models/owlvit/configuration_owlvit.py index 745c0fe7404b..43dfcdc77b8c 100644 --- a/src/transformers/models/owlvit/configuration_owlvit.py +++ b/src/transformers/models/owlvit/configuration_owlvit.py @@ -13,6 +13,8 @@ # limitations under the License. 
"""OWL-ViT model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring, logging @@ -21,6 +23,7 @@ @auto_docstring(checkpoint="google/owlvit-base-patch16") +@strict(accept_kwargs=True) class OwlViTTextConfig(PreTrainedConfig): r""" Example: @@ -41,43 +44,24 @@ class OwlViTTextConfig(PreTrainedConfig): model_type = "owlvit_text_model" base_config_key = "text_config" - def __init__( - self, - vocab_size=49408, - hidden_size=512, - intermediate_size=2048, - num_hidden_layers=12, - num_attention_heads=8, - max_position_embeddings=16, - hidden_act="quick_gelu", - layer_norm_eps=1e-5, - attention_dropout=0.0, - initializer_range=0.02, - initializer_factor=1.0, - pad_token_id=0, - bos_token_id=49406, - eos_token_id=49407, - **kwargs, - ): - super().__init__(**kwargs) - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.max_position_embeddings = max_position_embeddings - self.hidden_act = hidden_act - self.layer_norm_eps = layer_norm_eps - self.attention_dropout = attention_dropout - self.initializer_range = initializer_range - self.initializer_factor = initializer_factor + vocab_size: int = 49408 + hidden_size: int = 512 + intermediate_size: int = 2048 + num_hidden_layers: int = 12 + num_attention_heads: int = 8 + max_position_embeddings: int = 16 + hidden_act: str = "quick_gelu" + layer_norm_eps: float = 1e-5 + attention_dropout: float | int = 0.0 + initializer_range: float = 0.02 + initializer_factor: float = 1.0 + pad_token_id: int | None = 0 + bos_token_id: int | None = 49406 + eos_token_id: int | None = 49407 @auto_docstring(checkpoint="google/owlvit-base-patch16") +@strict(accept_kwargs=True) class OwlViTVisionConfig(PreTrainedConfig): r""" Example: @@ -98,70 +82,47 @@ class OwlViTVisionConfig(PreTrainedConfig): model_type = "owlvit_vision_model" base_config_key = "vision_config" - def __init__( - self, - hidden_size=768, - intermediate_size=3072, - num_hidden_layers=12, - num_attention_heads=12, - num_channels=3, - image_size=768, - patch_size=32, - hidden_act="quick_gelu", - layer_norm_eps=1e-5, - attention_dropout=0.0, - initializer_range=0.02, - initializer_factor=1.0, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_channels = num_channels - self.image_size = image_size - self.patch_size = patch_size - self.hidden_act = hidden_act - self.layer_norm_eps = layer_norm_eps - self.attention_dropout = attention_dropout - self.initializer_range = initializer_range - self.initializer_factor = initializer_factor + hidden_size: int = 768 + intermediate_size: int = 3072 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + num_channels: int = 3 + image_size: int | list[int] | tuple[int, int] = 768 + patch_size: int | list[int] | tuple[int, int] = 32 + hidden_act: str = "quick_gelu" + layer_norm_eps: float = 1e-5 + attention_dropout: float | int = 0.0 + initializer_range: float = 0.02 + initializer_factor: float = 1.0 @auto_docstring(checkpoint="google/owlvit-base-patch16") +@strict(accept_kwargs=True) class OwlViTConfig(PreTrainedConfig): model_type = 
"owlvit" sub_configs = {"text_config": OwlViTTextConfig, "vision_config": OwlViTVisionConfig} - def __init__( - self, - text_config=None, - vision_config=None, - projection_dim=512, - logit_scale_init_value=2.6592, - **kwargs, - ): - if text_config is None: - text_config = OwlViTTextConfig() + text_config: dict | PreTrainedConfig | None = None + vision_config: dict | PreTrainedConfig | None = None + projection_dim: int = 512 + logit_scale_init_value: float = 2.6592 + return_dict: bool = True + initializer_factor: float = 1.0 + + def __post_init__(self, **kwargs): + if self.text_config is None: + self.text_config = OwlViTTextConfig() logger.info("`text_config` is `None`. initializing the `OwlViTTextConfig` with default values.") - elif isinstance(text_config, dict): - text_config = OwlViTTextConfig(**text_config) + elif isinstance(self.text_config, dict): + self.text_config = OwlViTTextConfig(**self.text_config) - if vision_config is None: - vision_config = OwlViTVisionConfig() + if self.vision_config is None: + self.vision_config = OwlViTVisionConfig() logger.info("`vision_config` is `None`. initializing the `OwlViTVisionConfig` with default values.") - elif isinstance(vision_config, dict): - vision_config = OwlViTVisionConfig(**vision_config) - - self.text_config = text_config - self.vision_config = vision_config + elif isinstance(self.vision_config, dict): + self.vision_config = OwlViTVisionConfig(**self.vision_config) - self.projection_dim = projection_dim - self.logit_scale_init_value = logit_scale_init_value - self.initializer_factor = 1.0 - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["OwlViTConfig", "OwlViTTextConfig", "OwlViTVisionConfig"] diff --git a/src/transformers/models/paddleocr_vl/configuration_paddleocr_vl.py b/src/transformers/models/paddleocr_vl/configuration_paddleocr_vl.py index ac5d1f333570..ef27eae49716 100644 --- a/src/transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +++ b/src/transformers/models/paddleocr_vl/configuration_paddleocr_vl.py @@ -25,12 +25,15 @@ import inspect +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring @auto_docstring(checkpoint="PaddlePaddle/PaddleOCR-VL") +@strict(accept_kwargs=True) class PaddleOCRVisionConfig(PreTrainedConfig): r""" Example: @@ -52,37 +55,21 @@ class PaddleOCRVisionConfig(PreTrainedConfig): model_type = "paddleocr_vl_vision" base_config_key = "vision_config" - def __init__( - self, - hidden_size=1152, - intermediate_size=4304, - num_hidden_layers=27, - num_attention_heads=16, - num_channels=3, - image_size=384, - patch_size=14, - hidden_act="gelu_pytorch_tanh", - layer_norm_eps=1e-6, - attention_dropout=0.0, - spatial_merge_size=2, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_channels = num_channels - self.patch_size = patch_size - self.image_size = image_size - self.attention_dropout = attention_dropout - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act - self.spatial_merge_size = spatial_merge_size + hidden_size: int = 1152 + intermediate_size: int = 4304 + num_hidden_layers: int = 27 + num_attention_heads: int = 16 + num_channels: int = 3 + image_size: int = 384 + patch_size: int = 14 + hidden_act: str = "gelu_pytorch_tanh" + layer_norm_eps: 
float = 1e-6 + attention_dropout: float | int = 0.0 + spatial_merge_size: int = 2 @auto_docstring(checkpoint="PaddlePaddle/PaddleOCR-VL") +@strict(accept_kwargs=True) class PaddleOCRTextConfig(PreTrainedConfig): r""" use_bias (`bool`, *optional*, defaults to `False`): @@ -122,56 +109,35 @@ class PaddleOCRTextConfig(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size: int | None = 103424, - hidden_size: int | None = 1024, - intermediate_size: int | None = 3072, - num_hidden_layers: int | None = 18, - num_attention_heads: int | None = 16, - num_key_value_heads: int | None = 2, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 131072, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-05, - use_cache: int | None = True, - pad_token_id: int | None = 0, - bos_token_id: int | None = 1, - eos_token_id: int | None = 2, - tie_word_embeddings: bool | None = True, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - use_bias: bool | None = False, - head_dim: int | None = 128, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.use_bias = use_bias - self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads - self.rope_parameters = rope_parameters - - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - super().__init__(**kwargs) + vocab_size: int = 103424 + hidden_size: int = 1024 + intermediate_size: int = 3072 + num_hidden_layers: int = 18 + num_attention_heads: int = 16 + num_key_value_heads: int | None = 2 + hidden_act: str = "silu" + max_position_embeddings: int = 131072 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-05 + use_cache: int | None = True + pad_token_id: int | None = 0 + bos_token_id: int | None = 1 + eos_token_id: int | list[int] | None = 2 + tie_word_embeddings: bool = True + rope_parameters: RopeParameters | dict | None = None + use_bias: bool | None = False + head_dim: int | None = 128 + + def __post_init__(self, **kwargs): + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads + + self.head_dim = self.head_dim if self.head_dim is not None else self.hidden_size // self.num_attention_heads + super().__post_init__(**kwargs) @auto_docstring(checkpoint="PaddlePaddle/PaddleOCR-VL") +@strict(accept_kwargs=True) class PaddleOCRVLConfig(PreTrainedConfig): r""" Example: @@ -194,38 +160,34 @@ class PaddleOCRVLConfig(PreTrainedConfig): sub_configs = {"vision_config": PaddleOCRVisionConfig, "text_config": PaddleOCRTextConfig} keys_to_ignore_at_inference = ["past_key_values"] - def __init__( - self, - text_config=None, - vision_config=None, - image_token_id=100295, - video_token_id=100296, - vision_start_token_id=101305, - vision_end_token_id=101306, - tie_word_embeddings=True, - **kwargs, - ): - if isinstance(vision_config, dict): - self.vision_config = 
self.sub_configs["vision_config"](**vision_config) - elif vision_config is None: + text_config: dict | PreTrainedConfig | None = None + vision_config: dict | PreTrainedConfig | None = None + + image_token_id: int = 100295 + video_token_id: int = 100296 + vision_start_token_id: int = 101305 + vision_end_token_id: int = 101306 + tie_word_embeddings: int = True + + def __post_init__(self, **kwargs): + if isinstance(self.vision_config, dict): + self.vision_config = self.sub_configs["vision_config"](**self.vision_config) + elif self.vision_config is None: self.vision_config = self.sub_configs["vision_config"]() - if isinstance(text_config, dict): - self.text_config = self.sub_configs["text_config"](**text_config) - elif text_config is None: + # Hub configs are saved as flat dicts so we pop some of kwargs to init `TextConfig` + text_params = inspect.signature(self.sub_configs["text_config"].__init__).parameters.keys() + text_params = list(text_params) + ["rope_parameters", "rope_scaling", "rope_theta"] + text_kwargs = {key: kwargs.pop(key) for key in text_params if key in kwargs} + + if isinstance(self.text_config, dict): + self.text_config = self.sub_configs["text_config"](**self.text_config) + elif self.text_config is None: # Hub configs are saved as flat dicts so we pop some of kwargs to init `TextConfig` - text_params = inspect.signature(self.sub_configs["text_config"].__init__).parameters.keys() - text_params = list(text_params) + ["rope_scaling", "rope_theta"] - text_config = {key: kwargs.pop(key) for key in text_params if key in kwargs} - text_config["dtype"] = kwargs.get("torch_dtype", kwargs.get("dtype")) # don't pop the dtype - self.text_config = self.sub_configs["text_config"](**text_config) - - self.image_token_id = image_token_id - self.video_token_id = video_token_id - self.vision_start_token_id = vision_start_token_id - self.vision_end_token_id = vision_end_token_id - self.tie_word_embeddings = tie_word_embeddings - super().__init__(**kwargs) + text_kwargs["dtype"] = kwargs.get("torch_dtype", kwargs.get("dtype")) # don't pop the dtype + self.text_config = self.sub_configs["text_config"](**text_kwargs) + + super().__post_init__(**kwargs) __all__ = ["PaddleOCRVLConfig", "PaddleOCRVisionConfig", "PaddleOCRTextConfig"] diff --git a/src/transformers/models/paddleocr_vl/modular_paddleocr_vl.py b/src/transformers/models/paddleocr_vl/modular_paddleocr_vl.py index 125419b4b7e5..0e9334bb5b29 100644 --- a/src/transformers/models/paddleocr_vl/modular_paddleocr_vl.py +++ b/src/transformers/models/paddleocr_vl/modular_paddleocr_vl.py @@ -23,6 +23,7 @@ import numpy as np import torch import torch.nn.functional as F +from huggingface_hub.dataclasses import strict from torch import nn from ... 
import initialization as init @@ -579,6 +580,7 @@ def __call__( @auto_docstring(checkpoint="PaddlePaddle/PaddleOCR-VL") +@strict(accept_kwargs=True) class PaddleOCRVisionConfig(SiglipVisionConfig): r""" Example: @@ -600,31 +602,23 @@ class PaddleOCRVisionConfig(SiglipVisionConfig): model_type = "paddleocr_vl_vision" base_config_key = "vision_config" - def __init__( - self, - hidden_size=1152, - intermediate_size=4304, - num_hidden_layers=27, - num_attention_heads=16, - num_channels=3, - image_size=384, - patch_size=14, - hidden_act="gelu_pytorch_tanh", - layer_norm_eps=1e-6, - attention_dropout=0.0, - spatial_merge_size=2, - **kwargs, - ): - super().__init__() - self.spatial_merge_size = spatial_merge_size + hidden_size: int = 1152 + intermediate_size: int = 4304 + num_hidden_layers: int = 27 + num_attention_heads: int = 16 + image_size: int = 384 + patch_size: int = 14 + spatial_merge_size: int = 2 @auto_docstring(checkpoint="PaddlePaddle/PaddleOCR-VL") +@strict(accept_kwargs=True) class PaddleOCRTextConfig(Ernie4_5Config): model_type = "paddleocr_vl_text" @auto_docstring(checkpoint="PaddlePaddle/PaddleOCR-VL") +@strict(accept_kwargs=True) class PaddleOCRVLConfig(Qwen2VLConfig): r""" Example: @@ -644,18 +638,11 @@ class PaddleOCRVLConfig(Qwen2VLConfig): sub_configs = {"vision_config": PaddleOCRVisionConfig, "text_config": PaddleOCRTextConfig} - def __init__( - self, - text_config=None, - vision_config=None, - image_token_id=100295, - video_token_id=100296, - vision_start_token_id=101305, - vision_end_token_id=101306, - tie_word_embeddings=True, - **kwargs, - ): - super().__init__() + image_token_id: int = 100295 + video_token_id: int = 100296 + vision_start_token_id: int = 101305 + vision_end_token_id: int = 101306 + tie_word_embeddings: int = True class PaddleOCRProjector(nn.Module): diff --git a/src/transformers/models/paligemma/configuration_paligemma.py b/src/transformers/models/paligemma/configuration_paligemma.py index e2035a2f08e4..8d9d789a799f 100644 --- a/src/transformers/models/paligemma/configuration_paligemma.py +++ b/src/transformers/models/paligemma/configuration_paligemma.py @@ -12,15 +12,15 @@ # limitations under the License. 
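Editorial note: the PaddleOCR-VL `__post_init__` further up uses signature inspection to pull text-model keys out of a flat kwargs dict before building the text sub-config. A small sketch of that routing step, under the assumption that only the key-splitting matters here (`TextCfg` and `pop_text_kwargs` are illustrative names, not the real classes):

```python
import inspect
from dataclasses import dataclass


@dataclass
class TextCfg:
    # illustrative stand-in, not the real PaddleOCRTextConfig
    hidden_size: int = 1024
    vocab_size: int = 103424


def pop_text_kwargs(kwargs: dict) -> dict:
    # pop the keys that belong to the text sub-config out of a flat kwargs dict,
    # mirroring the signature-inspection step in the __post_init__ above
    text_params = list(inspect.signature(TextCfg.__init__).parameters)
    return {key: kwargs.pop(key) for key in text_params if key in kwargs}


flat = {"vocab_size": 50000, "image_token_id": 100295}
text_kwargs = pop_text_kwargs(flat)
assert text_kwargs == {"vocab_size": 50000}
assert flat == {"image_token_id": 100295}  # unrelated keys stay behind
```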
"""PaliGemmamodel configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging +from ...utils import auto_docstring from ..auto import CONFIG_MAPPING, AutoConfig -logger = logging.get_logger(__name__) - - @auto_docstring(checkpoint="google/paligemma-3b-pt-224") +@strict(accept_kwargs=True) class PaliGemmaConfig(PreTrainedConfig): r""" Example: @@ -51,28 +51,19 @@ class PaliGemmaConfig(PreTrainedConfig): sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} keys_to_ignore_at_inference = ["past_key_values"] - def __init__( - self, - vision_config=None, - text_config=None, - image_token_index=256000, - vocab_size=257152, - projection_dim=2048, - hidden_size=2048, - tie_word_embeddings: bool | None = True, - **kwargs, - ): - self.image_token_index = image_token_index - self.projection_dim = projection_dim - self.hidden_size = hidden_size - self.vision_config = vision_config - self.tie_word_embeddings = tie_word_embeddings - self.is_encoder_decoder = False + vision_config: dict | PreTrainedConfig | None = None + text_config: dict | PreTrainedConfig | None = None + image_token_index: int = 256000 + vocab_size: int = 257152 + projection_dim: int = 2048 + hidden_size: int = 2048 + tie_word_embeddings: bool = True + def __post_init__(self, **kwargs): if isinstance(self.vision_config, dict): - vision_config["model_type"] = vision_config.get("model_type", "siglip_vision_model") - self.vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) - elif vision_config is None: + self.vision_config["model_type"] = self.vision_config.get("model_type", "siglip_vision_model") + self.vision_config = CONFIG_MAPPING[self.vision_config["model_type"]](**self.vision_config) + elif self.vision_config is None: self.vision_config = CONFIG_MAPPING["siglip_vision_model"]( intermediate_size=4096, hidden_size=1152, @@ -84,11 +75,10 @@ def __init__( vision_use_head=False, ) - self.text_config = text_config if isinstance(self.text_config, dict): - text_config["model_type"] = text_config.get("model_type", "gemma") - self.text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) - elif text_config is None: + self.text_config["model_type"] = self.text_config.get("model_type", "gemma") + self.text_config = CONFIG_MAPPING[self.text_config["model_type"]](**self.text_config) + elif self.text_config is None: self.text_config = CONFIG_MAPPING["gemma"]( hidden_size=2048, num_hidden_layers=18, @@ -96,7 +86,7 @@ def __init__( num_attention_heads=8, num_key_value_heads=1, is_encoder_decoder=False, - vocab_size=vocab_size, + vocab_size=self.vocab_size, ) # BC: `use_bidirectional_attention` was originally unset in PaliGemma1 (backbone = Gemma1) AND PaliGemma2 @@ -105,8 +95,8 @@ def __init__( self.text_config.use_bidirectional_attention = True self.text_config.num_image_tokens = (self.vision_config.image_size // self.vision_config.patch_size) ** 2 - self.vision_config.projection_dim = projection_dim - super().__init__(**kwargs) + self.vision_config.projection_dim = self.projection_dim + super().__post_init__(**kwargs) __all__ = ["PaliGemmaConfig"] diff --git a/src/transformers/models/parakeet/configuration_parakeet.py b/src/transformers/models/parakeet/configuration_parakeet.py index 0c181c98ac30..d2e95aeb9baf 100644 --- a/src/transformers/models/parakeet/configuration_parakeet.py +++ b/src/transformers/models/parakeet/configuration_parakeet.py @@ -13,14 +13,14 @@ # limitations under the 
License. """Parakeet model configuration.""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="nvidia/parakeet-ctc-1.1b") +@strict(accept_kwargs=True) class ParakeetEncoderConfig(PreTrainedConfig): r""" convolution_bias (`bool`, *optional*, defaults to `True`): @@ -63,63 +63,35 @@ class ParakeetEncoderConfig(PreTrainedConfig): model_type = "parakeet_encoder" keys_to_ignore_at_inference = ["past_key_values"] - def __init__( - self, - hidden_size=1024, - num_hidden_layers=24, - num_attention_heads=8, - intermediate_size=4096, - hidden_act="silu", - attention_bias=True, - convolution_bias=True, - conv_kernel_size=9, - subsampling_factor=8, - subsampling_conv_channels=256, - num_mel_bins=80, - subsampling_conv_kernel_size=3, - subsampling_conv_stride=2, - dropout=0.1, - dropout_positions=0.0, - layerdrop=0.1, - activation_dropout=0.1, - attention_dropout=0.1, - max_position_embeddings=5000, - scale_input=True, - initializer_range=0.02, - **kwargs, - ): - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_key_value_heads = num_attention_heads # LlamaAttention compatibility - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.attention_bias = attention_bias - self.convolution_bias = convolution_bias - - self.conv_kernel_size = conv_kernel_size - self.subsampling_conv_kernel_size = subsampling_conv_kernel_size - self.subsampling_conv_stride = subsampling_conv_stride - - self.subsampling_factor = subsampling_factor - self.subsampling_conv_channels = subsampling_conv_channels - self.num_mel_bins = num_mel_bins - - self.dropout = dropout - self.dropout_positions = dropout_positions - self.layerdrop = layerdrop - self.activation_dropout = activation_dropout - self.attention_dropout = attention_dropout - self.max_position_embeddings = max_position_embeddings - self.scale_input = scale_input - self.initializer_range = initializer_range - - super().__init__( - **kwargs, - ) + hidden_size: int = 1024 + num_hidden_layers: int = 24 + num_attention_heads: int = 8 + intermediate_size: int = 4096 + hidden_act: str = "silu" + attention_bias: bool = True + convolution_bias: bool = True + conv_kernel_size: int = 9 + subsampling_factor: int = 8 + subsampling_conv_channels: int = 256 + num_mel_bins: int = 80 + subsampling_conv_kernel_size: int = 3 + subsampling_conv_stride: int = 2 + dropout: float | int = 0.1 + dropout_positions: float = 0.0 + layerdrop: float | int = 0.1 + activation_dropout: float | int = 0.1 + attention_dropout: float | int = 0.1 + max_position_embeddings: int = 5000 + scale_input: bool = True + initializer_range: float = 0.02 + + def __post_init__(self, **kwargs): + self.num_key_value_heads = self.num_attention_heads + super().__post_init__(**kwargs) @auto_docstring(checkpoint="nvidia/parakeet-ctc-1.1b") +@strict(accept_kwargs=True) class ParakeetCTCConfig(PreTrainedConfig): r""" ctc_loss_reduction (`str`, *optional*, defaults to `"mean"`): @@ -148,40 +120,19 @@ class ParakeetCTCConfig(PreTrainedConfig): model_type = "parakeet_ctc" sub_configs = {"encoder_config": ParakeetEncoderConfig} - def __init__( - self, - vocab_size=1025, - ctc_loss_reduction="mean", - ctc_zero_infinity=True, - encoder_config: dict | ParakeetEncoderConfig 
= None, - pad_token_id=1024, - **kwargs, - ): - self.vocab_size = vocab_size - self.ctc_loss_reduction = ctc_loss_reduction - self.ctc_zero_infinity = ctc_zero_infinity - - if isinstance(encoder_config, dict): - self.encoder_config = ParakeetEncoderConfig(**encoder_config) - elif encoder_config is None: - self.encoder_config = ParakeetEncoderConfig() + vocab_size: int = 1025 + ctc_loss_reduction: str = "mean" + ctc_zero_infinity: bool = True + encoder_config: dict | PreTrainedConfig | None = None + pad_token_id: int | None = 1024 - self.encoder_config = self.encoder_config + def __post_init__(self, **kwargs): + if isinstance(self.encoder_config, dict): + self.encoder_config = ParakeetEncoderConfig(**self.encoder_config) + elif self.encoder_config is None: + self.encoder_config = ParakeetEncoderConfig() self.initializer_range = self.encoder_config.initializer_range - self.pad_token_id = pad_token_id - - super().__init__(**kwargs) - - @classmethod - def from_encoder_config(cls, encoder_config: ParakeetEncoderConfig, **kwargs): - r""" - Instantiate a [`ParakeetCTCConfig`] (or a derived class) from parakeet encoder model configuration. - - Returns: - [`ParakeetCTCConfig`]: An instance of a configuration object - """ - - return cls(encoder_config=encoder_config.to_dict(), **kwargs) + super().__post_init__(**kwargs) __all__ = ["ParakeetCTCConfig", "ParakeetEncoderConfig"] diff --git a/src/transformers/models/patchtsmixer/configuration_patchtsmixer.py b/src/transformers/models/patchtsmixer/configuration_patchtsmixer.py index 1a80deea609e..65e8aef5e269 100644 --- a/src/transformers/models/patchtsmixer/configuration_patchtsmixer.py +++ b/src/transformers/models/patchtsmixer/configuration_patchtsmixer.py @@ -13,14 +13,14 @@ # limitations under the License. """PatchTSMixer model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="ibm/patchtsmixer-etth1-pretrain") +@strict(accept_kwargs=True) class PatchTSMixerConfig(PreTrainedConfig): r""" context_length (`int`, *optional*, defaults to 32): @@ -122,87 +122,45 @@ class PatchTSMixerConfig(PreTrainedConfig): "num_hidden_layers": "num_layers", } - def __init__( - self, - # Time series specific configuration - context_length: int = 32, - patch_length: int = 8, - num_input_channels: int = 1, - patch_stride: int = 8, - num_parallel_samples: int = 100, - # General model configuration - d_model: int = 8, - expansion_factor: int = 2, - num_layers: int = 3, - dropout: float = 0.2, - mode: str = "common_channel", - gated_attn: bool = True, - norm_mlp: str = "LayerNorm", - self_attn: bool = False, - self_attn_heads: int = 1, - use_positional_encoding: bool = False, - positional_encoding_type: str = "sincos", - scaling: str | bool | None = "std", - loss: str = "mse", - init_std: float = 0.02, - norm_eps: float = 1e-5, - # Pretrain model configuration - mask_type: str = "random", - random_mask_ratio: float = 0.5, - num_forecast_mask_patches: list[int] | int | None = [2], - mask_value: int = 0, - masked_loss: bool = True, - channel_consistent_masking: bool = True, - unmasked_channel_indices: list[int] | None = None, - # General head configuration - head_dropout: float = 0.2, - distribution_output: str = "student_t", - # Prediction head configuration - prediction_length: int = 16, - 
prediction_channel_indices: list | None = None, - # Classification/Regression configuration - num_targets: int = 3, - output_range: list | None = None, - head_aggregation: str = "max_pool", - **kwargs, - ): - self.num_input_channels = num_input_channels - self.context_length = context_length - self.patch_length = patch_length - self.patch_stride = patch_stride - self.d_model = d_model - self.expansion_factor = expansion_factor - self.num_layers = num_layers - self.dropout = dropout - self.mode = mode - self.gated_attn = gated_attn - self.norm_mlp = norm_mlp - self.scaling = scaling - self.head_dropout = head_dropout - self.num_patches = (max(context_length, patch_length) - patch_length) // patch_stride + 1 - self.mask_type = mask_type - self.random_mask_ratio = random_mask_ratio - self.num_forecast_mask_patches = num_forecast_mask_patches - self.mask_value = mask_value - self.channel_consistent_masking = channel_consistent_masking - self.masked_loss = masked_loss + context_length: int = 32 + patch_length: int = 8 + num_input_channels: int = 1 + patch_stride: int = 8 + num_parallel_samples: int = 100 + d_model: int = 8 + expansion_factor: int = 2 + num_layers: int = 3 + dropout: float | int = 0.2 + mode: str = "common_channel" + gated_attn: bool = True + norm_mlp: str = "LayerNorm" + self_attn: bool = False + self_attn_heads: int = 1 + use_positional_encoding: bool = False + positional_encoding_type: str = "sincos" + scaling: str | bool | None = "std" + loss: str = "mse" + init_std: float = 0.02 + norm_eps: float = 1e-5 + mask_type: str = "random" + random_mask_ratio: float = 0.5 + num_forecast_mask_patches: list[int] | tuple[int, ...] | int | None = (2,) + mask_value: int = 0 + masked_loss: bool = True + channel_consistent_masking: bool = True + unmasked_channel_indices: list[int] | None = None + head_dropout: float | int = 0.2 + distribution_output: str = "student_t" + prediction_length: int = 16 + prediction_channel_indices: list | None = None + num_targets: int = 3 + output_range: list | None = None + head_aggregation: str | None = "max_pool" + + def __post_init__(self, **kwargs): + self.num_patches = (max(self.context_length, self.patch_length) - self.patch_length) // self.patch_stride + 1 self.patch_last = True - self.use_positional_encoding = use_positional_encoding - self.positional_encoding_type = positional_encoding_type - self.prediction_length = prediction_length - self.prediction_channel_indices = prediction_channel_indices - self.num_targets = num_targets - self.output_range = output_range - self.head_aggregation = head_aggregation - self.self_attn = self_attn - self.self_attn_heads = self_attn_heads - self.init_std = init_std - self.distribution_output = distribution_output - self.loss = loss - self.num_parallel_samples = num_parallel_samples - self.unmasked_channel_indices = unmasked_channel_indices - self.norm_eps = norm_eps - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["PatchTSMixerConfig"] diff --git a/src/transformers/models/patchtsmixer/modeling_patchtsmixer.py b/src/transformers/models/patchtsmixer/modeling_patchtsmixer.py index 4ceaaa1f6647..fd98d11907b2 100644 --- a/src/transformers/models/patchtsmixer/modeling_patchtsmixer.py +++ b/src/transformers/models/patchtsmixer/modeling_patchtsmixer.py @@ -1123,7 +1123,7 @@ class PatchTSMixerEncoder(PatchTSMixerPreTrainedModel): def __init__(self, config: PatchTSMixerConfig): super().__init__(config) - self.use_return_dict = config.use_return_dict + self.return_dict = config.return_dict 
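Editorial note: `PatchTSMixerConfig.__post_init__` above derives `num_patches` from the context and patch settings. A worked sketch of that arithmetic, with a plain-dataclass stand-in (`PatchCfg` is illustrative, not the real config):

```python
from dataclasses import dataclass, field


@dataclass
class PatchCfg:
    # illustrative stand-in for the time-series fields used in the derivation above
    context_length: int = 32
    patch_length: int = 8
    patch_stride: int = 8
    num_patches: int = field(init=False, default=0)

    def __post_init__(self):
        # same formula as PatchTSMixerConfig.__post_init__ above
        self.num_patches = (
            max(self.context_length, self.patch_length) - self.patch_length
        ) // self.patch_stride + 1


assert PatchCfg().num_patches == 4  # (32 - 8) // 8 + 1
assert PatchCfg(context_length=64, patch_stride=4).num_patches == 15  # (64 - 8) // 4 + 1
```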
self.patcher = nn.Linear(config.patch_length, config.d_model) if config.use_positional_encoding: @@ -1157,7 +1157,7 @@ def forward( `torch.FloatTensor` of shape `(batch_size, n_vars, num_patches, d_model)` """ - return_dict = return_dict if return_dict is not None else self.use_return_dict + return_dict = return_dict if return_dict is not None else self.return_dict # flatten [bs x num_patch x d_model]. common_channel/mix_channel: [bs x n_vars x num_patch x d_model] patches = self.patcher(past_values) @@ -1225,7 +1225,7 @@ def __init__(self, config: PatchTSMixerConfig, mask_input: bool = False): """ super().__init__(config) - self.use_return_dict = config.use_return_dict + self.return_dict = config.return_dict self.encoder = PatchTSMixerEncoder(config) self.patching = PatchTSMixerPatchify(config) @@ -1267,7 +1267,7 @@ def forward( - 1 for values that are **observed**, - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). """ - return_dict = return_dict if return_dict is not None else self.use_return_dict + return_dict = return_dict if return_dict is not None else self.return_dict mask = None if observed_mask is None: @@ -1349,7 +1349,7 @@ def __init__(self, config: PatchTSMixerConfig): self.model = PatchTSMixerModel(config, mask_input=True) self.head = PatchTSMixerPretrainHead(config=config) self.masked_loss = config.masked_loss - self.use_return_dict = config.use_return_dict + self.return_dict = config.return_dict # Initialize weights and apply final processing self.post_init() @@ -1380,7 +1380,7 @@ def forward( return_loss (`bool`, *optional*): Whether to return the loss in the `forward` call. """ - return_dict = return_dict if return_dict is not None else self.use_return_dict + return_dict = return_dict if return_dict is not None else self.return_dict if self.masked_loss is True: loss = torch.nn.MSELoss(reduction="none") @@ -1537,7 +1537,7 @@ class PatchTSMixerForPrediction(PatchTSMixerPreTrainedModel): def __init__(self, config: PatchTSMixerConfig): super().__init__(config) self.loss = config.loss - self.use_return_dict = config.use_return_dict + self.return_dict = config.return_dict self.prediction_channel_indices = config.prediction_channel_indices self.num_parallel_samples = config.num_parallel_samples @@ -1609,7 +1609,7 @@ def forward( else: raise ValueError("Invalid loss function: Allowed values: mse and nll") - return_dict = return_dict if return_dict is not None else self.use_return_dict + return_dict = return_dict if return_dict is not None else self.return_dict # past_values: tensor [batch_size x context_length x num_input_channels] model_output = self.model( @@ -1780,7 +1780,7 @@ def __init__(self, config: PatchTSMixerConfig): self.head = PatchTSMixerLinearHead( config=config, ) - self.use_return_dict = config.use_return_dict + self.return_dict = config.return_dict if config.scaling in ["std", "mean", True]: self.inject_scale = InjectScalerStatistics4D(d_model=config.d_model, num_patches=config.num_patches) else: @@ -1828,7 +1828,7 @@ def forward( loss = torch.nn.CrossEntropyLoss() - return_dict = return_dict if return_dict is not None else self.use_return_dict + return_dict = return_dict if return_dict is not None else self.return_dict model_output = self.model( past_values, @@ -1949,7 +1949,7 @@ def __init__(self, config: PatchTSMixerConfig): self.loss = config.loss self.distribution_output = config.distribution_output - self.use_return_dict = config.use_return_dict + self.return_dict = config.return_dict self.num_parallel_samples = 
config.num_parallel_samples if config.loss == "mse": @@ -2022,7 +2022,7 @@ def forward( else: raise ValueError("Invalid loss function: Allowed values: mse and nll") - return_dict = return_dict if return_dict is not None else self.use_return_dict + return_dict = return_dict if return_dict is not None else self.return_dict model_output = self.model( past_values, output_hidden_states=output_hidden_states, diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index a07da696e739..9f8740ef5cb0 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -13,14 +13,14 @@ # limitations under the License. """PatchTST model configuration""" -from transformers.configuration_utils import PreTrainedConfig -from transformers.utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from transformers.configuration_utils import PreTrainedConfig +from transformers.utils import auto_docstring @auto_docstring(checkpoint="ibm-granite/granite-timeseries-patchtst") +@strict(accept_kwargs=True) class PatchTSTConfig(PreTrainedConfig): r""" context_length (`int`, *optional*, defaults to 32): @@ -105,7 +105,6 @@ class PatchTSTConfig(PreTrainedConfig): num_parallel_samples (`int`, *optional*, defaults to 100): The number of samples is generated in parallel for probabilistic prediction. - ```python >>> from transformers import PatchTSTConfig, PatchTSTModel @@ -126,112 +125,45 @@ class PatchTSTConfig(PreTrainedConfig): "num_hidden_layers": "num_hidden_layers", } - def __init__( - self, - # time series specific configuration - num_input_channels: int = 1, - context_length: int = 32, - distribution_output: str = "student_t", - loss: str = "mse", - # PatchTST arguments - patch_length: int = 1, - patch_stride: int = 1, - # Transformer architecture configuration - num_hidden_layers: int = 3, - d_model: int = 128, - num_attention_heads: int = 4, - share_embedding: bool = True, - channel_attention: bool = False, - ffn_dim: int = 512, - norm_type: str = "batchnorm", - norm_eps: float = 1e-05, - attention_dropout: float = 0.0, - positional_dropout: float = 0.0, - path_dropout: float = 0.0, - ff_dropout: float = 0.0, - bias: bool = True, - activation_function: str = "gelu", - pre_norm: bool = True, - positional_encoding_type: str = "sincos", - use_cls_token: bool = False, - init_std: float = 0.02, - share_projection: bool = True, - scaling: str | bool | None = "std", - # mask pretraining - do_mask_input: bool | None = None, - mask_type: str = "random", - random_mask_ratio: float = 0.5, - num_forecast_mask_patches: list[int] | int | None = [2], - channel_consistent_masking: bool | None = False, - unmasked_channel_indices: list[int] | None = None, - mask_value: int = 0, - # head - pooling_type: str = "mean", - head_dropout: float = 0.0, - prediction_length: int = 24, - num_targets: int = 1, - output_range: list | None = None, - # distribution head - num_parallel_samples: int = 100, - **kwargs, - ): - # time series specific configuration - self.context_length = context_length - self.num_input_channels = num_input_channels # n_vars - self.loss = loss - self.distribution_output = distribution_output - self.num_parallel_samples = num_parallel_samples - - # Transformer architecture configuration - self.d_model = d_model - self.num_attention_heads = num_attention_heads - self.ffn_dim = ffn_dim - 
self.num_hidden_layers = num_hidden_layers - self.attention_dropout = attention_dropout - self.share_embedding = share_embedding - self.channel_attention = channel_attention - self.norm_type = norm_type - self.norm_eps = norm_eps - self.positional_dropout = positional_dropout - self.path_dropout = path_dropout - self.ff_dropout = ff_dropout - self.bias = bias - self.activation_function = activation_function - self.pre_norm = pre_norm - self.positional_encoding_type = positional_encoding_type - self.use_cls_token = use_cls_token - self.init_std = init_std - self.scaling = scaling - - # PatchTST parameters - self.patch_length = patch_length - self.patch_stride = patch_stride - - # Mask pretraining - self.do_mask_input = do_mask_input - self.mask_type = mask_type - self.random_mask_ratio = random_mask_ratio # for random masking - self.num_forecast_mask_patches = num_forecast_mask_patches # for forecast masking - self.channel_consistent_masking = channel_consistent_masking - self.unmasked_channel_indices = unmasked_channel_indices - self.mask_value = mask_value - - # general head params - self.pooling_type = pooling_type - self.head_dropout = head_dropout - - # For prediction head - self.share_projection = share_projection - self.prediction_length = prediction_length - - # For prediction and regression head - self.num_parallel_samples = num_parallel_samples - - # Regression - self.num_targets = num_targets - self.output_range = output_range - - super().__init__(**kwargs) + num_input_channels: int = 1 + context_length: int = 32 + distribution_output: str = "student_t" + loss: str | None = "mse" + patch_length: int = 1 + patch_stride: int = 1 + num_hidden_layers: int = 3 + d_model: int = 128 + num_attention_heads: int = 4 + share_embedding: bool = True + channel_attention: bool = False + ffn_dim: int = 512 + norm_type: str = "batchnorm" + norm_eps: float = 1e-05 + attention_dropout: float | int = 0.0 + positional_dropout: float | int = 0.0 + path_dropout: float | int = 0.0 + ff_dropout: float | int = 0.0 + bias: bool = True + activation_function: str = "gelu" + pre_norm: bool = True + positional_encoding_type: str = "sincos" + use_cls_token: bool = False + init_std: float = 0.02 + share_projection: bool = True + scaling: str | bool | None = "std" + do_mask_input: bool | None = None + mask_type: str = "random" + random_mask_ratio: float = 0.5 + num_forecast_mask_patches: list[int] | tuple[int, ...] 
| int | None = (2,) + channel_consistent_masking: bool | None = False + unmasked_channel_indices: list[int] | None = None + mask_value: int = 0 + pooling_type: str | None = "mean" + head_dropout: float | int = 0.0 + prediction_length: int = 24 + num_targets: int = 1 + output_range: list | None = None + num_parallel_samples: int = 100 __all__ = ["PatchTSTConfig"] diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 973074c83633..c669c89a4814 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1150,7 +1150,7 @@ def forward( >>> last_hidden_state = outputs.last_hidden_state ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -1303,7 +1303,7 @@ def forward( >>> loss.backward() ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict # past_values: [bs x num_channels x num_patches x d_model] or # [bs x num_channels x (num_patches+1) x d_model] if use cls_token @@ -1436,7 +1436,7 @@ def forward( >>> labels = outputs.prediction_logits ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict model_output = self.model( past_values=past_values, @@ -1662,7 +1662,7 @@ def forward( >>> prediction_outputs = outputs.prediction_outputs ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict # get model output model_output = self.model( @@ -1884,7 +1884,7 @@ def forward( >>> regression_outputs = outputs.regression_outputs ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict model_output = self.model( past_values=past_values, diff --git a/src/transformers/models/pe_audio/configuration_pe_audio.py b/src/transformers/models/pe_audio/configuration_pe_audio.py index bea137b7564f..58d70638e2b7 100644 --- a/src/transformers/models/pe_audio/configuration_pe_audio.py +++ b/src/transformers/models/pe_audio/configuration_pe_audio.py @@ -12,16 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+ +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig, PretrainedConfig from ...modeling_rope_utils import RopeParameters -from ...utils import auto_docstring, logging +from ...utils import auto_docstring from ..auto import CONFIG_MAPPING, AutoConfig -logger = logging.get_logger(__name__) - - @auto_docstring(checkpoint="facebook/pe-av-large") +@strict(accept_kwargs=True) class PeAudioEncoderConfig(PreTrainedConfig): r""" dac_config (`Union[PreTrainedConfig, dict]`, *optional*): @@ -50,56 +51,41 @@ class PeAudioEncoderConfig(PreTrainedConfig): "encoder_hidden_size": 64, "codebook_dim": 128, } + dac_config: dict | PreTrainedConfig | None = None + hidden_size: int = 1792 + intermediate_size: int = 4800 + num_hidden_layers: int = 6 + num_attention_heads: int = 14 + num_key_value_heads: int | None = None + head_dim: int = 128 + hidden_act: str = "silu" + max_position_embeddings: int = 10000 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-5 + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + attention_dropout: float | int = 0.0 + + def __post_init__(self, **kwargs): + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads + + if self.rope_parameters is None: + self.rope_parameters = {"rope_theta": 20000, "rope_type": "default"} + + if isinstance(self.dac_config, dict): + self.dac_config["model_type"] = self.dac_config.get("model_type", "dac") + self.dac_config = CONFIG_MAPPING[self.dac_config["model_type"]]( + **{**self._default_dac_config_kwargs, **self.dac_config} + ) + elif self.dac_config is None: + self.dac_config = CONFIG_MAPPING["dac"](**self._default_dac_config_kwargs) - def __init__( - self, - dac_config: dict | PreTrainedConfig | None = None, - hidden_size: int | None = 1792, - intermediate_size: int | None = 4800, - num_hidden_layers: int | None = 6, - num_attention_heads: int | None = 14, - num_key_value_heads: int | None = None, - head_dim: int | None = 128, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 10000, - initializer_range: float | None = 0.02, - rms_norm_eps: float | None = 1e-5, - rope_parameters: RopeParameters | dict | None = {"rope_theta": 20000}, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - **kwargs, - ): - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.head_dim = head_dim - self.hidden_act = hidden_act - self.max_position_embeddings = max_position_embeddings - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.rope_parameters = rope_parameters - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - - if isinstance(dac_config, dict): - dac_config["model_type"] = dac_config.get("model_type", "dac") - dac_config = CONFIG_MAPPING[dac_config["model_type"]](**{**self._default_dac_config_kwargs, **dac_config}) - elif dac_config is None: - dac_config = CONFIG_MAPPING["dac"](**self._default_dac_config_kwargs) - - self.dac_config = dac_config - - super().__init__(**kwargs) + super().__post_init__(**kwargs) @auto_docstring(checkpoint="facebook/pe-av-large") +@strict(accept_kwargs=True) class PeAudioConfig(PretrainedConfig): r""" 
Example: @@ -129,29 +115,23 @@ class PeAudioConfig(PretrainedConfig): "num_attention_heads": 16, } - def __init__( - self, - text_config=None, - audio_config=None, - **kwargs, - ): - if isinstance(text_config, dict): - text_config["model_type"] = text_config.get("model_type", "modernbert") - text_config = CONFIG_MAPPING[text_config["model_type"]]( - **{**self._default_text_config_kwargs, **text_config} - ) - elif text_config is None: - text_config = CONFIG_MAPPING["modernbert"](**self._default_text_config_kwargs) - - if isinstance(audio_config, dict): - audio_config = PeAudioEncoderConfig(**audio_config) - elif audio_config is None: - audio_config = PeAudioEncoderConfig() + text_config: dict | PreTrainedConfig | None = None + audio_config: dict | PreTrainedConfig | None = None - self.text_config = text_config - self.audio_config = audio_config - - super().__init__(**kwargs) + def __post_init__(self, **kwargs): + if isinstance(self.text_config, dict): + self.text_config["model_type"] = self.text_config.get("model_type", "modernbert") + self.text_config = CONFIG_MAPPING[self.text_config["model_type"]]( + **{**self._default_text_config_kwargs, **self.text_config} + ) + elif self.text_config is None: + self.text_config = CONFIG_MAPPING["modernbert"](**self._default_text_config_kwargs) + + if isinstance(self.audio_config, dict): + self.audio_config = PeAudioEncoderConfig(**self.audio_config) + elif self.audio_config is None: + self.audio_config = PeAudioEncoderConfig() + super().__post_init__(**kwargs) __all__ = ["PeAudioEncoderConfig", "PeAudioConfig"] diff --git a/src/transformers/models/pe_audio_video/configuration_pe_audio_video.py b/src/transformers/models/pe_audio_video/configuration_pe_audio_video.py index d8281a549429..adbb1d30b8d7 100644 --- a/src/transformers/models/pe_audio_video/configuration_pe_audio_video.py +++ b/src/transformers/models/pe_audio_video/configuration_pe_audio_video.py @@ -13,16 +13,16 @@ # limitations under the License. 
+from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig, PretrainedConfig from ...modeling_rope_utils import RopeParameters -from ...utils import auto_docstring, logging +from ...utils import auto_docstring from ..auto import CONFIG_MAPPING, AutoConfig -logger = logging.get_logger(__name__) - - @auto_docstring(checkpoint="facebook/pe-av-large") +@strict(accept_kwargs=True) class PeAudioVideoEncoderConfig(PreTrainedConfig): r""" video_config (`Union[PreTrainedConfig, dict]`, *optional*): @@ -46,63 +46,45 @@ class PeAudioVideoEncoderConfig(PreTrainedConfig): base_config_key = "audio_video_config" sub_configs = {"audio_config": AutoConfig, "video_config": AutoConfig} - def __init__( - self, - audio_config: dict | PreTrainedConfig | None = None, - video_config: dict | PreTrainedConfig | None = None, - hidden_size: int | None = 1792, - intermediate_size: int | None = 4800, - num_hidden_layers: int | None = 6, - num_attention_heads: int | None = 14, - num_key_value_heads: int | None = None, - head_dim: int | None = 128, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 10000, - initializer_range: float | None = 0.02, - rms_norm_eps: float | None = 1e-5, - rope_parameters: RopeParameters | dict | None = {"rope_theta": 20000}, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - **kwargs, - ): - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.head_dim = head_dim - self.hidden_act = hidden_act - self.max_position_embeddings = max_position_embeddings - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.rope_parameters = rope_parameters - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - - if isinstance(audio_config, dict): - audio_config["model_type"] = audio_config.get("model_type", "pe_audio_encoder") - audio_config = CONFIG_MAPPING[audio_config["model_type"]](**audio_config) - elif audio_config is None: - audio_config = CONFIG_MAPPING["pe_audio_encoder"]() - - if isinstance(video_config, dict): - video_config["model_type"] = video_config.get("model_type", "pe_video_encoder") - video_config = CONFIG_MAPPING[video_config["model_type"]](**video_config) - elif video_config is None: - video_config = CONFIG_MAPPING["pe_video_encoder"]() - - self.audio_config = audio_config - self.video_config = video_config - - super().__init__(**kwargs) + audio_config: dict | PreTrainedConfig | None = None + video_config: dict | PreTrainedConfig | None = None + hidden_size: int = 1792 + intermediate_size: int = 4800 + num_hidden_layers: int = 6 + num_attention_heads: int = 14 + num_key_value_heads: int | None = None + head_dim: int = 128 + hidden_act: str = "silu" + max_position_embeddings: int = 10000 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-5 + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + attention_dropout: float | int = 0.0 + + def __post_init__(self, **kwargs): + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads + + if isinstance(self.audio_config, dict): + self.audio_config["model_type"] = self.audio_config.get("model_type", "pe_audio_encoder") + 
self.audio_config = CONFIG_MAPPING[self.audio_config["model_type"]](**self.audio_config) + elif self.audio_config is None: + self.audio_config = CONFIG_MAPPING["pe_audio_encoder"]() + + if isinstance(self.video_config, dict): + self.video_config["model_type"] = self.video_config.get("model_type", "pe_video_encoder") + self.video_config = CONFIG_MAPPING[self.video_config["model_type"]](**self.video_config) + elif self.video_config is None: + self.video_config = CONFIG_MAPPING["pe_video_encoder"]() + + if self.rope_parameters is None: + self.rope_parameters = {"rope_theta": 20000} + super().__post_init__(**kwargs) @auto_docstring(checkpoint="facebook/pe-av-large") +@strict(accept_kwargs=True) class PeAudioVideoConfig(PretrainedConfig): r""" audio_video_config (`dict` or `PreTrainedConfig`, *optional*): @@ -132,29 +114,24 @@ class PeAudioVideoConfig(PretrainedConfig): "num_attention_heads": 16, } - def __init__( - self, - text_config=None, - audio_video_config=None, - **kwargs, - ): - if isinstance(text_config, dict): - text_config["model_type"] = text_config.get("model_type", "modernbert") - text_config = CONFIG_MAPPING[text_config["model_type"]]( - **{**self._default_text_config_kwargs, **text_config} - ) - elif text_config is None: - text_config = CONFIG_MAPPING["modernbert"](**self._default_text_config_kwargs) + text_config: dict | PreTrainedConfig | None = None + audio_video_config: dict | PreTrainedConfig | None = None - if isinstance(audio_video_config, dict): - audio_video_config = PeAudioVideoEncoderConfig(**audio_video_config) - elif audio_video_config is None: - audio_video_config = PeAudioVideoEncoderConfig() + def __post_init__(self, **kwargs): + if isinstance(self.text_config, dict): + self.text_config["model_type"] = self.text_config.get("model_type", "modernbert") + self.text_config = CONFIG_MAPPING[self.text_config["model_type"]]( + **{**self._default_text_config_kwargs, **self.text_config} + ) + elif self.text_config is None: + self.text_config = CONFIG_MAPPING["modernbert"](**self._default_text_config_kwargs) - self.text_config = text_config - self.audio_video_config = audio_video_config + if isinstance(self.audio_video_config, dict): + self.audio_video_config = PeAudioVideoEncoderConfig(**self.audio_video_config) + elif self.audio_video_config is None: + self.audio_video_config = PeAudioVideoEncoderConfig() - super().__init__(**kwargs) + super().__post_init__(**kwargs) @property def audio_config(self): diff --git a/src/transformers/models/pe_video/configuration_pe_video.py b/src/transformers/models/pe_video/configuration_pe_video.py index cc934cb06181..999f7c3fd35e 100644 --- a/src/transformers/models/pe_video/configuration_pe_video.py +++ b/src/transformers/models/pe_video/configuration_pe_video.py @@ -12,17 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+ +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig, PretrainedConfig from ...modeling_rope_utils import RopeParameters -from ...utils import auto_docstring, logging +from ...utils import auto_docstring from ..auto import CONFIG_MAPPING, AutoConfig from ..timm_wrapper import TimmWrapperConfig -logger = logging.get_logger(__name__) - - @auto_docstring(checkpoint="facebook/pe-av-large") +@strict(accept_kwargs=True) class PeVideoEncoderConfig(PreTrainedConfig): r""" Example: @@ -52,57 +53,41 @@ class PeVideoEncoderConfig(PreTrainedConfig): "initializer_range": 0.02, } - def __init__( - self, - vision_config: dict | PreTrainedConfig | None = None, - hidden_size: int | None = 1792, - intermediate_size: int | None = 4800, - num_hidden_layers: int | None = 6, - num_attention_heads: int | None = 14, - num_key_value_heads: int | None = None, - head_dim: int | None = 128, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 10000, - initializer_range: float | None = 0.02, - rms_norm_eps: float | None = 1e-5, - rope_parameters: RopeParameters | dict | None = {"rope_theta": 20000}, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - **kwargs, - ): - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.head_dim = head_dim - self.hidden_act = hidden_act - self.max_position_embeddings = max_position_embeddings - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.rope_parameters = rope_parameters - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - - if isinstance(vision_config, dict): - vision_config["model_type"] = vision_config.get("model_type", "timm_wrapper") - vision_config = CONFIG_MAPPING[vision_config["model_type"]].from_dict( - {**self._default_vision_config_kwargs, **vision_config} + vision_config: dict | PreTrainedConfig | None = None + hidden_size: int = 1792 + intermediate_size: int = 4800 + num_hidden_layers: int = 6 + num_attention_heads: int = 14 + num_key_value_heads: int | None = None + head_dim: int = 128 + hidden_act: str = "silu" + max_position_embeddings: int = 10000 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-5 + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + attention_dropout: float | int = 0.0 + + def __post_init__(self, **kwargs): + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads + + if self.rope_parameters is None: + self.rope_parameters = {"rope_theta": 20000} + + if isinstance(self.vision_config, dict): + self.vision_config["model_type"] = self.vision_config.get("model_type", "timm_wrapper") + self.vision_config = CONFIG_MAPPING[self.vision_config["model_type"]].from_dict( + {**self._default_vision_config_kwargs, **self.vision_config} ) - elif vision_config is None: - vision_config = CONFIG_MAPPING["timm_wrapper"].from_dict(self._default_vision_config_kwargs) + elif self.vision_config is None: + self.vision_config = CONFIG_MAPPING["timm_wrapper"].from_dict(self._default_vision_config_kwargs) - self.vision_config = vision_config - - super().__init__(**kwargs) + super().__post_init__(**kwargs) 
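The hunks above all follow the same migration: hand-written `__init__` bodies become annotated class-level fields, and cross-field defaults (e.g. `num_key_value_heads` falling back to `num_attention_heads`, `rope_parameters` defaulting to `{"rope_theta": 20000}`, dict sub-configs promoted to config objects) move into `__post_init__`. Below is a minimal, self-contained sketch of that shape using only the standard-library `dataclass`; the `Toy*` classes and their fields are hypothetical stand-ins, not the actual `@strict`-decorated configs in this patch:

```python
from dataclasses import dataclass


@dataclass
class ToySubConfig:
    codebook_dim: int = 128


@dataclass
class ToyEncoderConfig:
    # Defaults live on annotated class attributes instead of an explicit __init__.
    sub_config: dict | ToySubConfig | None = None
    hidden_size: int = 1792
    num_attention_heads: int = 14
    num_key_value_heads: int | None = None
    rope_parameters: dict | None = None

    def __post_init__(self):
        # Backward-compatible fallbacks, mirroring the __post_init__ bodies above.
        if self.num_key_value_heads is None:
            self.num_key_value_heads = self.num_attention_heads
        if self.rope_parameters is None:
            self.rope_parameters = {"rope_theta": 20000}
        # Nested configs passed as plain dicts are normalized into config objects.
        if isinstance(self.sub_config, dict):
            self.sub_config = ToySubConfig(**self.sub_config)
        elif self.sub_config is None:
            self.sub_config = ToySubConfig()


config = ToyEncoderConfig(num_attention_heads=16, sub_config={"codebook_dim": 64})
assert config.num_key_value_heads == 16
assert isinstance(config.sub_config, ToySubConfig)
```

Keeping the field list declarative while deferring derivation to `__post_init__` preserves the previous constructor behavior without re-listing every attribute assignment.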
@auto_docstring(checkpoint="facebook/pe-av-large") +@strict(accept_kwargs=True) class PeVideoConfig(PretrainedConfig): r""" video_config (`dict` or `PreTrainedConfig`, *optional*): @@ -133,29 +118,24 @@ class PeVideoConfig(PretrainedConfig): "num_attention_heads": 16, } - def __init__( - self, - text_config=None, - video_config=None, - **kwargs, - ): - if isinstance(text_config, dict): - text_config["model_type"] = text_config.get("model_type", "modernbert") - text_config = CONFIG_MAPPING[text_config["model_type"]]( - **{**self._default_text_config_kwargs, **text_config} - ) - elif text_config is None: - text_config = CONFIG_MAPPING["modernbert"](**self._default_text_config_kwargs) + text_config: dict | PreTrainedConfig | None = None + video_config: dict | PreTrainedConfig | None = None - if isinstance(video_config, dict): - video_config = PeVideoEncoderConfig(**video_config) - elif video_config is None: - video_config = PeVideoEncoderConfig() + def __post_init__(self, **kwargs): + if isinstance(self.text_config, dict): + self.text_config["model_type"] = self.text_config.get("model_type", "modernbert") + self.text_config = CONFIG_MAPPING[self.text_config["model_type"]]( + **{**self._default_text_config_kwargs, **self.text_config} + ) + elif self.text_config is None: + self.text_config = CONFIG_MAPPING["modernbert"](**self._default_text_config_kwargs) - self.text_config = text_config - self.video_config = video_config + if isinstance(self.video_config, dict): + self.video_config = PeVideoEncoderConfig(**self.video_config) + elif self.video_config is None: + self.video_config = PeVideoEncoderConfig() - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["PeVideoEncoderConfig", "PeVideoConfig"] diff --git a/src/transformers/models/pegasus/configuration_pegasus.py b/src/transformers/models/pegasus/configuration_pegasus.py index cac2361c6c6a..88c1529df6a0 100644 --- a/src/transformers/models/pegasus/configuration_pegasus.py +++ b/src/transformers/models/pegasus/configuration_pegasus.py @@ -13,14 +13,14 @@ # limitations under the License. 
"""PEGASUS model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="google/pegasus-large") +@strict(accept_kwargs=True) class PegasusConfig(PreTrainedConfig): r""" Example: @@ -40,66 +40,37 @@ class PegasusConfig(PreTrainedConfig): model_type = "pegasus" keys_to_ignore_at_inference = ["past_key_values"] - attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"} + attribute_map = { + "num_attention_heads": "encoder_attention_heads", + "hidden_size": "d_model", + "num_hidden_layers": "encoder_layers", + } - def __init__( - self, - vocab_size=50265, - max_position_embeddings=1024, - encoder_layers=12, - encoder_ffn_dim=4096, - encoder_attention_heads=16, - decoder_layers=12, - decoder_ffn_dim=4096, - decoder_attention_heads=16, - encoder_layerdrop=0.0, - decoder_layerdrop=0.0, - use_cache=True, - is_encoder_decoder=True, - activation_function="gelu", - d_model=1024, - dropout=0.1, - attention_dropout=0.0, - activation_dropout=0.0, - init_std=0.02, - decoder_start_token_id=0, - scale_embedding=False, - pad_token_id=0, - eos_token_id=1, - forced_eos_token_id=1, - is_decoder=False, - tie_word_embeddings=True, - **kwargs, - ): - self.is_decoder = is_decoder - self.tie_word_embeddings = tie_word_embeddings - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.d_model = d_model - self.encoder_ffn_dim = encoder_ffn_dim - self.encoder_layers = encoder_layers - self.encoder_attention_heads = encoder_attention_heads - self.decoder_ffn_dim = decoder_ffn_dim - self.decoder_layers = decoder_layers - self.decoder_attention_heads = decoder_attention_heads - self.dropout = dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.activation_function = activation_function - self.init_std = init_std - self.encoder_layerdrop = encoder_layerdrop - self.decoder_layerdrop = decoder_layerdrop - self.use_cache = use_cache - self.num_hidden_layers = encoder_layers - self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True - self.pad_token_id = pad_token_id - self.eos_token_id = eos_token_id - self.decoder_start_token_id = decoder_start_token_id - self.forced_eos_token_id = forced_eos_token_id - super().__init__( - is_encoder_decoder=is_encoder_decoder, - **kwargs, - ) + vocab_size: int = 50265 + max_position_embeddings: int = 1024 + encoder_layers: int = 12 + encoder_ffn_dim: int = 4096 + encoder_attention_heads: int = 16 + decoder_layers: int = 12 + decoder_ffn_dim: int = 4096 + decoder_attention_heads: int = 16 + encoder_layerdrop: float | int = 0.0 + decoder_layerdrop: float | int = 0.0 + use_cache: bool = True + is_encoder_decoder: bool = True + activation_function: str = "gelu" + d_model: int = 1024 + dropout: float | int = 0.1 + attention_dropout: float | int = 0.0 + activation_dropout: float | int = 0.0 + init_std: float = 0.02 + decoder_start_token_id: int | None = 0 + scale_embedding: bool = False + pad_token_id: int | None = 0 + eos_token_id: int | None = 1 + forced_eos_token_id: int | None = 1 + is_decoder: bool = False + tie_word_embeddings: bool = True __all__ = ["PegasusConfig"] diff --git a/src/transformers/models/pegasus_x/configuration_pegasus_x.py 
b/src/transformers/models/pegasus_x/configuration_pegasus_x.py index a2e28e63c4c3..7b056bb4658b 100644 --- a/src/transformers/models/pegasus_x/configuration_pegasus_x.py +++ b/src/transformers/models/pegasus_x/configuration_pegasus_x.py @@ -13,14 +13,14 @@ # limitations under the License. """PEGASUS-X model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="google/pegasus-x-large") +@strict(accept_kwargs=True) class PegasusXConfig(PreTrainedConfig): r""" num_global_tokens (`int`, *optional*, defaults to 128): @@ -48,72 +48,39 @@ class PegasusXConfig(PreTrainedConfig): model_type = "pegasus_x" keys_to_ignore_at_inference = ["past_key_values"] - attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"} - - def __init__( - self, - vocab_size=96103, - max_position_embeddings=16384, - encoder_layers=16, - encoder_ffn_dim=4096, - encoder_attention_heads=16, - decoder_layers=16, - decoder_ffn_dim=4096, - decoder_attention_heads=16, - encoder_layerdrop=0.0, - decoder_layerdrop=0.0, - use_cache=True, - is_encoder_decoder=True, - activation_function="gelu", - d_model=1024, - dropout=0.1, - attention_dropout=0.0, - activation_dropout=0.0, - init_std=0.02, - decoder_start_token_id=0, - scale_embedding=True, - pad_token_id=0, - eos_token_id=1, - forced_eos_token_id=1, - num_global_tokens=32, - block_size=512, - stagger_local_blocks=True, - tie_word_embeddings=True, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.d_model = d_model - self.encoder_ffn_dim = encoder_ffn_dim - self.encoder_layers = encoder_layers - self.encoder_attention_heads = encoder_attention_heads - self.decoder_ffn_dim = decoder_ffn_dim - self.decoder_layers = decoder_layers - self.decoder_attention_heads = decoder_attention_heads - self.dropout = dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.activation_function = activation_function - self.init_std = init_std - self.encoder_layerdrop = encoder_layerdrop - self.decoder_layerdrop = decoder_layerdrop - self.use_cache = use_cache - self.num_hidden_layers = encoder_layers - self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True - - self.num_global_tokens = num_global_tokens - self.block_size = block_size - self.stagger_local_blocks = stagger_local_blocks - self.pad_token_id = pad_token_id - self.eos_token_id = eos_token_id - self.decoder_start_token_id = decoder_start_token_id - self.tie_word_embeddings = tie_word_embeddings - - super().__init__( - is_encoder_decoder=is_encoder_decoder, - forced_eos_token_id=forced_eos_token_id, - **kwargs, - ) + attribute_map = { + "num_attention_heads": "encoder_attention_heads", + "hidden_size": "d_model", + "num_hidden_layers": "encoder_layers", + } + + vocab_size: int = 96103 + max_position_embeddings: int = 16384 + encoder_layers: int = 16 + encoder_ffn_dim: int = 4096 + encoder_attention_heads: int = 16 + decoder_layers: int = 16 + decoder_ffn_dim: int = 4096 + decoder_attention_heads: int = 16 + encoder_layerdrop: float | int = 0.0 + decoder_layerdrop: float | int = 0.0 + use_cache: bool = True + is_encoder_decoder: bool = True + activation_function: str = "gelu" + d_model: int = 1024 + dropout: 
float | int = 0.1 + attention_dropout: float | int = 0.0 + activation_dropout: float | int = 0.0 + init_std: float = 0.02 + decoder_start_token_id: int | None = 0 + scale_embedding: bool = True + pad_token_id: int | None = 0 + eos_token_id: int | None = 1 + forced_eos_token_id: int | None = 1 + num_global_tokens: int = 32 + block_size: int = 512 + stagger_local_blocks: bool = True + tie_word_embeddings: bool = True __all__ = ["PegasusXConfig"] diff --git a/src/transformers/models/perceiver/configuration_perceiver.py b/src/transformers/models/perceiver/configuration_perceiver.py index 15c9d5eee639..0f192742af5e 100644 --- a/src/transformers/models/perceiver/configuration_perceiver.py +++ b/src/transformers/models/perceiver/configuration_perceiver.py @@ -13,14 +13,14 @@ # limitations under the License. """Perceiver model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="deepmind/language-perceiver") +@strict(accept_kwargs=True) class PerceiverConfig(PreTrainedConfig): r""" num_latents (`int`, *optional*, defaults to 256): @@ -84,70 +84,33 @@ class PerceiverConfig(PreTrainedConfig): model_type = "perceiver" - def __init__( - self, - num_latents=256, - d_latents=1280, - d_model=768, - num_blocks=1, - num_self_attends_per_block=26, - num_self_attention_heads=8, - num_cross_attention_heads=8, - qk_channels=None, - v_channels=None, - cross_attention_shape_for_attention="kv", - self_attention_widening_factor=1, - cross_attention_widening_factor=1, - hidden_act="gelu", - attention_probs_dropout_prob=0.1, - initializer_range=0.02, - layer_norm_eps=1e-12, - use_query_residual=True, - vocab_size=262, - max_position_embeddings=2048, - image_size=56, - train_size=[368, 496], - num_frames=16, - audio_samples_per_frame=1920, - samples_per_patch=16, - output_shape=[1, 16, 224, 224], - output_num_channels=512, - _label_trainable_num_channels=1024, - **kwargs, - ): - super().__init__(**kwargs) - - self.num_latents = num_latents - self.d_latents = d_latents - self.d_model = d_model - self.num_blocks = num_blocks - self.num_self_attends_per_block = num_self_attends_per_block - self.num_self_attention_heads = num_self_attention_heads - self.num_cross_attention_heads = num_cross_attention_heads - self.qk_channels = qk_channels - self.v_channels = v_channels - self.cross_attention_shape_for_attention = cross_attention_shape_for_attention - self.self_attention_widening_factor = self_attention_widening_factor - self.cross_attention_widening_factor = cross_attention_widening_factor - self.hidden_act = hidden_act - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.use_query_residual = use_query_residual - # masked language modeling attributes - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - # image classification attributes - self.image_size = image_size - # flow attributes - self.train_size = train_size - # multimodal autoencoding attributes - self.num_frames = num_frames - self.audio_samples_per_frame = audio_samples_per_frame - self.samples_per_patch = samples_per_patch - self.output_shape = output_shape - self.output_num_channels = output_num_channels - self._label_trainable_num_channels = 
_label_trainable_num_channels + num_latents: int = 256 + d_latents: int = 1280 + d_model: int = 768 + num_blocks: int = 1 + num_self_attends_per_block: int = 26 + num_self_attention_heads: int = 8 + num_cross_attention_heads: int = 8 + qk_channels: int | None = None + v_channels: int | None = None + cross_attention_shape_for_attention: str = "kv" + self_attention_widening_factor: int = 1 + cross_attention_widening_factor: int = 1 + hidden_act: str = "gelu" + attention_probs_dropout_prob: float = 0.1 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + use_query_residual: bool = True + vocab_size: int = 262 + max_position_embeddings: int = 2048 + image_size: int | list[int] | tuple[int, int] = 56 + train_size: list[int] | tuple[int, ...] = (368, 496) + num_frames: int = 16 + audio_samples_per_frame: int = 1920 + samples_per_patch: int = 16 + output_shape: list[int] | tuple[int, ...] = (1, 16, 224, 224) + output_num_channels: int = 512 + _label_trainable_num_channels: int = 1024 __all__ = ["PerceiverConfig"] diff --git a/src/transformers/models/perceiver/modeling_perceiver.py b/src/transformers/models/perceiver/modeling_perceiver.py index 02f67aca7908..531d5c364805 100755 --- a/src/transformers/models/perceiver/modeling_perceiver.py +++ b/src/transformers/models/perceiver/modeling_perceiver.py @@ -724,7 +724,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if self.input_preprocessor is not None: inputs, modality_sizes, inputs_without_pos = self.input_preprocessor( @@ -914,7 +914,7 @@ def forward( elif inputs is None and input_ids is not None: inputs = input_ids - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.perceiver( inputs=inputs, @@ -1012,7 +1012,7 @@ def forward( elif inputs is None and input_ids is not None: inputs = input_ids - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.perceiver( inputs=inputs, @@ -1156,7 +1156,7 @@ def forward( elif inputs is None and pixel_values is not None: inputs = pixel_values - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.perceiver( inputs=inputs, @@ -1280,7 +1280,7 @@ def forward( raise ValueError("You cannot use both `inputs` and `pixel_values`") elif inputs is None and pixel_values is not None: inputs = pixel_values - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.perceiver( inputs=inputs, @@ -1404,7 +1404,7 @@ def forward( raise ValueError("You cannot use both `inputs` and `pixel_values`") elif inputs is None and pixel_values is not None: inputs = pixel_values - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.perceiver( inputs=inputs, @@ -1530,7 +1530,7 @@ def forward( >>> list(logits.shape) [1, 368, 
496, 2] ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict loss = None if labels is not None: @@ -1764,7 +1764,7 @@ def forward( >>> list(logits["label"].shape) [1, 700] ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict loss = None if labels is not None: diff --git a/src/transformers/models/perception_lm/configuration_perception_lm.py b/src/transformers/models/perception_lm/configuration_perception_lm.py index 5ea15d37e55e..cc4488abb633 100644 --- a/src/transformers/models/perception_lm/configuration_perception_lm.py +++ b/src/transformers/models/perception_lm/configuration_perception_lm.py @@ -12,16 +12,16 @@ # limitations under the License. """PerceptionLM model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging +from ...utils import auto_docstring from ..auto import CONFIG_MAPPING, AutoConfig from ..timm_wrapper.configuration_timm_wrapper import TimmWrapperConfig -logger = logging.get_logger(__name__) - - @auto_docstring(checkpoint="facebook/Perception-LM-1B") +@strict(accept_kwargs=True) class PerceptionLMConfig(PreTrainedConfig): r""" vision_use_cls_token (`bool`, *optional*, defaults to `True`): @@ -33,36 +33,28 @@ class PerceptionLMConfig(PreTrainedConfig): model_type = "perception_lm" sub_configs = {"text_config": AutoConfig, "vision_config": TimmWrapperConfig} - def __init__( - self, - vision_config=None, - text_config=None, - vision_use_cls_token=True, - projector_pooling_ratio=1, - image_token_id=128002, - video_token_id=128003, - **kwargs, - ): - self.image_token_id = image_token_id - self.video_token_id = video_token_id - if isinstance(vision_config, dict): - vision_config = TimmWrapperConfig(**vision_config) - elif isinstance(vision_config, TimmWrapperConfig): + vision_config: dict | PreTrainedConfig | None = None + text_config: dict | PreTrainedConfig | None = None + vision_use_cls_token: bool = True + projector_pooling_ratio: int = 1 + image_token_id: int = 128002 + video_token_id: int = 128003 + + def __post_init__(self, **kwargs): + if isinstance(self.vision_config, dict): + self.vision_config = TimmWrapperConfig(**self.vision_config) + elif isinstance(self.vision_config, TimmWrapperConfig): pass - elif vision_config is None: - vision_config = TimmWrapperConfig() - self.vision_config = vision_config - self.vision_use_cls_token = vision_use_cls_token + elif self.vision_config is None: + self.vision_config = TimmWrapperConfig() - if isinstance(text_config, dict): - text_config["model_type"] = text_config.get("model_type", "llama") - text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) - elif text_config is None: - text_config = CONFIG_MAPPING["llama"]() + if isinstance(self.text_config, dict): + self.text_config["model_type"] = self.text_config.get("model_type", "llama") + self.text_config = CONFIG_MAPPING[self.text_config["model_type"]](**self.text_config) + elif self.text_config is None: + self.text_config = CONFIG_MAPPING["llama"]() - self.text_config = text_config - self.projector_pooling_ratio = projector_pooling_ratio - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["PerceptionLMConfig"] diff --git a/src/transformers/models/persimmon/configuration_persimmon.py 
b/src/transformers/models/persimmon/configuration_persimmon.py index 35f09b14472e..d562ffc35b11 100644 --- a/src/transformers/models/persimmon/configuration_persimmon.py +++ b/src/transformers/models/persimmon/configuration_persimmon.py @@ -13,15 +13,15 @@ # limitations under the License. """Persimmon model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters -from ...utils import auto_docstring, logging - - -logger = logging.get_logger(__name__) +from ...utils import auto_docstring @auto_docstring(checkpoint="adept/persimmon-8b-base") +@strict(accept_kwargs=True) class PersimmonConfig(PreTrainedConfig): r""". qk_layernorm (`bool`, *optional*, default to `True`): @@ -39,49 +39,28 @@ class PersimmonConfig(PreTrainedConfig): model_type = "persimmon" keys_to_ignore_at_inference = ["past_key_values"] - def __init__( - self, - vocab_size: int | None = 262144, - hidden_size: int | None = 4096, - intermediate_size: int | None = 16384, - num_hidden_layers: int | None = 36, - num_attention_heads: int | None = 64, - hidden_act: str | None = "relu2", - max_position_embeddings: int | None = 16384, - initializer_range: float | None = 0.02, - layer_norm_eps: int | None = 1e-5, - use_cache: bool | None = True, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - qk_layernorm: bool | None = True, - hidden_dropout: float | None = 0.0, - attention_dropout: float | None = 0.0, - pad_token_id: int | None = None, - bos_token_id: int | None = 1, - eos_token_id: int | None = 2, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.use_cache = use_cache - self.qk_layernorm = qk_layernorm - self.hidden_dropout = hidden_dropout - self.attention_dropout = attention_dropout - self.rope_parameters = rope_parameters - kwargs.setdefault("partial_rotary_factor", 0.5) # assign default for BC + vocab_size: int = 262144 + hidden_size: int = 4096 + intermediate_size: int = 16384 + num_hidden_layers: int = 36 + num_attention_heads: int = 64 + hidden_act: str = "relu2" + max_position_embeddings: int = 16384 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-5 + use_cache: bool = True + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + qk_layernorm: bool = True + hidden_dropout: float | int = 0.0 + attention_dropout: float | int = 0.0 + pad_token_id: int | None = None + bos_token_id: int | None = 1 + eos_token_id: int | list[int] | None = 2 - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - super().__init__(**kwargs) + def __post_init__(self, **kwargs): + kwargs.setdefault("partial_rotary_factor", 0.5) # assign default for BC + super().__post_init__(**kwargs) __all__ = ["PersimmonConfig"] diff --git a/src/transformers/models/phi/configuration_phi.py b/src/transformers/models/phi/configuration_phi.py index 16e75cd96419..46eaff04648d 100644 --- a/src/transformers/models/phi/configuration_phi.py +++ b/src/transformers/models/phi/configuration_phi.py @@ 
-14,15 +14,15 @@ """Phi model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters -from ...utils import auto_docstring, logging - - -logger = logging.get_logger(__name__) +from ...utils import auto_docstring @auto_docstring(checkpoint="microsoft/phi-1") +@strict(accept_kwargs=True) class PhiConfig(PreTrainedConfig): r""" qk_layernorm (`bool`, *optional*, defaults to `False`): @@ -60,57 +60,33 @@ class PhiConfig(PreTrainedConfig): "final_layernorm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size: int | None = 51200, - hidden_size: int | None = 2048, - intermediate_size: int | None = 8192, - num_hidden_layers: int | None = 24, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = None, - resid_pdrop: float | None = 0.0, - embd_pdrop: float | None = 0.0, - attention_dropout: float | None = 0.0, - hidden_act: str | None = "gelu_new", - max_position_embeddings: int | None = 2048, - initializer_range: float | None = 0.02, - layer_norm_eps: int | None = 1e-5, - use_cache: bool | None = True, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - qk_layernorm: bool | None = False, - bos_token_id: int | None = 1, - eos_token_id: int | None = 2, - pad_token_id: int | None = None, - **kwargs, - ): - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.resid_pdrop = resid_pdrop - self.embd_pdrop = embd_pdrop - self.attention_dropout = attention_dropout - self.hidden_act = hidden_act - self.max_position_embeddings = max_position_embeddings - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.use_cache = use_cache - self.qk_layernorm = qk_layernorm - self.rope_parameters = rope_parameters - kwargs.setdefault("partial_rotary_factor", 0.5) # assign default for BC + vocab_size: int = 51200 + hidden_size: int = 2048 + intermediate_size: int = 8192 + num_hidden_layers: int = 24 + num_attention_heads: int = 32 + num_key_value_heads: int | None = None + resid_pdrop: float = 0.0 + embd_pdrop: float = 0.0 + attention_dropout: float | int | None = 0.0 + hidden_act: str = "gelu_new" + max_position_embeddings: int = 2048 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-5 + use_cache: bool = True + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + qk_layernorm: bool = False + bos_token_id: int | None = 1 + eos_token_id: int | list[int] | None = 2 + pad_token_id: int | None = None + + def __post_init__(self, **kwargs): + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.tie_word_embeddings = tie_word_embeddings - super().__init__(**kwargs) + kwargs.setdefault("partial_rotary_factor", 0.5) # assign default for BC + super().__post_init__(**kwargs) __all__ = ["PhiConfig"] diff --git a/src/transformers/models/phi3/configuration_phi3.py b/src/transformers/models/phi3/configuration_phi3.py index 328915c1cdcf..b4fa5fdefe04 100644 --- 
a/src/transformers/models/phi3/configuration_phi3.py +++ b/src/transformers/models/phi3/configuration_phi3.py @@ -14,15 +14,15 @@ """Phi-3 model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters -from ...utils import auto_docstring, logging - - -logger = logging.get_logger(__name__) +from ...utils import auto_docstring @auto_docstring(checkpoint="microsoft/Phi-3-mini-4k-instruct") +@strict(accept_kwargs=True) class Phi3Config(PreTrainedConfig): r""" original_max_position_embeddings (`int`, *optional*, defaults to 4096): @@ -58,59 +58,33 @@ class Phi3Config(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size: int | None = 32064, - hidden_size: int | None = 3072, - intermediate_size: int | None = 8192, - num_hidden_layers: int | None = 32, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = None, - resid_pdrop: float | None = 0.0, - embd_pdrop: float | None = 0.0, - attention_dropout: float | None = 0.0, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 4096, - original_max_position_embeddings: int | None = 4096, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-5, - use_cache: bool | None = True, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - bos_token_id: int | None = 1, - eos_token_id: int | None = 32000, - pad_token_id: int | None = 32000, - sliding_window: int | None = None, - **kwargs, - ): - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.resid_pdrop = resid_pdrop - self.embd_pdrop = embd_pdrop - self.attention_dropout = attention_dropout - self.hidden_act = hidden_act - self.max_position_embeddings = max_position_embeddings - self.original_max_position_embeddings = original_max_position_embeddings - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.rope_parameters = rope_parameters - kwargs.setdefault("partial_rotary_factor", 1.0) # assign default for BC - self.sliding_window = sliding_window - - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.tie_word_embeddings = tie_word_embeddings - super().__init__(**kwargs) + vocab_size: int = 32064 + hidden_size: int = 3072 + intermediate_size: int = 8192 + num_hidden_layers: int = 32 + num_attention_heads: int = 32 + num_key_value_heads: int | None = None + resid_pdrop: float = 0.0 + embd_pdrop: float = 0.0 + attention_dropout: float | int = 0.0 + hidden_act: str = "silu" + max_position_embeddings: int = 4096 + original_max_position_embeddings: int = 4096 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-5 + use_cache: bool = True + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + bos_token_id: int | None = 1 + eos_token_id: int | list[int] | None = 32000 + pad_token_id: int | None = 32000 + sliding_window: int | None = None + + def __post_init__(self, **kwargs): + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads + + 
super().__post_init__(**kwargs) def convert_rope_params_to_dict( self, default_theta: int | float = 10_000.0, ignore_keys: set | None = None, **kwargs @@ -121,21 +95,20 @@ def convert_rope_params_to_dict( # Standardize and validate the correctness of rotary position embeddings parameters self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", default_theta)) - self.rope_parameters.setdefault("partial_rotary_factor", kwargs["partial_rotary_factor"]) + self.rope_parameters.setdefault("partial_rotary_factor", kwargs.get("partial_rotary_factor", 1.0)) self.standardize_rope_params() # For backward compatibility if previous version used "su" or "yarn" rope_parameters_type = self.rope_parameters.get("rope_type", None) if rope_parameters_type is not None and rope_parameters_type in ["su", "yarn"]: self.rope_parameters["rope_type"] = "longrope" - self.validate_rope(ignore_keys=ignore_keys) return kwargs - def validate_rope(self, ignore_keys: set | None = None): + def validate_rope(self): """ Validate the `rope_parameters` configuration. """ - super().validate_rope(ignore_keys=ignore_keys) + super().validate_rope() # Run Phi3 specific validation if not isinstance(self.rope_parameters, dict): diff --git a/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py index fadf07d434ee..373ea67ac297 100644 --- a/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +++ b/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py @@ -20,12 +20,15 @@ import math +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring @auto_docstring(checkpoint="microsoft/Phi-4-multimodal-instruct") +@strict(accept_kwargs=True) class Phi4MultimodalVisionConfig(PreTrainedConfig): r""" crop_size (`int`, *optional*, defaults to 448): @@ -45,41 +48,23 @@ class Phi4MultimodalVisionConfig(PreTrainedConfig): model_type = "phi4_multimodal_vision" base_config_key = "vision_config" - def __init__( - self, - hidden_size=1152, - intermediate_size=4304, - num_hidden_layers=27, - num_attention_heads=16, - num_channels=3, - image_size=448, - patch_size=14, - hidden_act="gelu_pytorch_tanh", - layer_norm_eps=1e-6, - attention_dropout=0.0, - crop_size: int = 448, - image_token_id: int = 200010, - feature_layer: int = -2, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_channels = num_channels - self.patch_size = patch_size - self.image_size = image_size - self.attention_dropout = attention_dropout - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act - self.crop_size = crop_size - self.image_token_id = image_token_id - self.feature_layer = feature_layer + hidden_size: int = 1152 + intermediate_size: int = 4304 + num_hidden_layers: int = 27 + num_attention_heads: int = 16 + num_channels: int = 3 + image_size: int | list[int] | tuple[int, int] = 448 + patch_size: int | list[int] | tuple[int, int] = 14 + hidden_act: str = "gelu_pytorch_tanh" + layer_norm_eps: float = 1e-6 + attention_dropout: float | int = 0.0 + crop_size: int = 448 + image_token_id: int = 200010 + feature_layer: int = -2 @auto_docstring(checkpoint="microsoft/Phi-4-multimodal-instruct") +@strict(accept_kwargs=True) 
class Phi4MultimodalAudioConfig(PreTrainedConfig): r""" num_blocks (`int`, *optional*, defaults to 24): @@ -132,69 +117,46 @@ class Phi4MultimodalAudioConfig(PreTrainedConfig): model_type = "phi4_multimodal_audio" - def __init__( - self, - hidden_size: int = 1024, - intermediate_size: int = 1536, - num_blocks: int = 24, - num_attention_heads: int = 16, - activation: str = "swish", - chunk_size: int = -1, - left_chunk: int = 18, - dropout_rate: float = 0.0, - ext_pw_out_channel: int = 1024, - depthwise_separable_out_channel: int = 1024, - depthwise_multiplier: int = 1, - kernel_size: int = 3, - conv_activation: str = "swish", - input_size: int = 80, - conv_glu_type: str = "swish", - time_reduction: int = 8, - bias_max_distance: int = 1000, - bias_symmetric: bool = False, - nemo_activation: str = "relu", - nemo_conv_channels: int = 1024, - downsample_rate: int = 1, - initializer_range: float = 0.02, - audio_token_id: int = 200011, - feature_layer: int = -2, - **kwargs, - ): - super().__init__(**kwargs) - self.hidden_size = hidden_size - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.activation = activation - self.chunk_size = chunk_size - self.left_chunk = left_chunk - self.num_blocks = num_blocks - self.dropout_rate = dropout_rate - self.ext_pw_out_channel = ext_pw_out_channel - self.depthwise_separable_out_channel = depthwise_separable_out_channel - self.depthwise_multiplier = depthwise_multiplier - self.kernel_size = kernel_size - self.conv_activation = conv_activation - self.input_size = input_size - self.conv_glu_type = conv_glu_type - self.time_reduction = time_reduction - self.bias_max_distance = bias_max_distance - self.bias_symmetric = bias_symmetric - self.nemo_activation = nemo_activation - self.nemo_conv_channels = nemo_conv_channels - self.downsample_rate = downsample_rate - self.audio_token_id = audio_token_id - self.initializer_range = initializer_range - self.feature_layer = feature_layer - - if time_reduction % 2 != 0: + hidden_size: int = 1024 + intermediate_size: int = 1536 + num_blocks: int = 24 + num_attention_heads: int = 16 + activation: str = "swish" + chunk_size: int = -1 + left_chunk: int = 18 + dropout_rate: float = 0.0 + ext_pw_out_channel: int = 1024 + depthwise_separable_out_channel: int = 1024 + depthwise_multiplier: int = 1 + kernel_size: int = 3 + conv_activation: str = "swish" + input_size: int = 80 + conv_glu_type: str = "swish" + time_reduction: int = 8 + bias_max_distance: int = 1000 + bias_symmetric: bool = False + nemo_activation: str = "relu" + nemo_conv_channels: int = 1024 + downsample_rate: int = 1 + initializer_range: float = 0.02 + audio_token_id: int = 200011 + feature_layer: int = -2 + + def __post_init__(self, **kwargs): + nemo_final_size = self.input_size + for _ in range(int(math.log2(self.time_reduction))): + nemo_final_size = math.floor((nemo_final_size - 1) / 2 + 1) + self.nemo_final_size = nemo_final_size + super().__post_init__(**kwargs) + + def validate_architecture(self): + """Part of `@strict`-powered validation. 
Validates the architecture of the config.""" + if self.time_reduction % 2 != 0: raise ValueError("`time_reduction` should be a multiple of 2!") - length = input_size - for _ in range(int(math.log2(time_reduction))): - length = math.floor((length - 1) / 2 + 1) - self.nemo_final_size = length @auto_docstring(checkpoint="microsoft/Phi-4-multimodal-instruct") +@strict(accept_kwargs=True) class Phi4MultimodalConfig(PreTrainedConfig): r""" original_max_position_embeddings (`int`, *optional*, defaults to 4096): @@ -230,74 +192,47 @@ class Phi4MultimodalConfig(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } + vocab_size: int = 200064 + hidden_size: int = 3072 + intermediate_size: int = 8192 + num_hidden_layers: int = 32 + num_attention_heads: int = 32 + num_key_value_heads: int | None = 8 + resid_pdrop: float = 0.0 + embd_pdrop: float = 0.0 + attention_dropout: float | int = 0.0 + hidden_act: str = "silu" + max_position_embeddings: int = 131072 + original_max_position_embeddings: int | None = 4096 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-5 + use_cache: bool = True + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + bos_token_id: int | None = 199999 + eos_token_id: int | list[int] | None = None + pad_token_id: int | None = 199999 + sliding_window: int | None = None + sub_configs = {"audio_config": Phi4MultimodalAudioConfig, "vision_config": Phi4MultimodalVisionConfig} + vision_config: dict | PreTrainedConfig | None = None + audio_config: dict | PreTrainedConfig | None = None - def __init__( - self, - vocab_size: int | None = 200064, - hidden_size: int | None = 3072, - intermediate_size: int | None = 8192, - num_hidden_layers: int | None = 32, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = 8, - resid_pdrop: float | None = 0.0, - embd_pdrop: float | None = 0.0, - attention_dropout: float | None = 0.0, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 131072, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-5, - use_cache: bool | None = True, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - bos_token_id: int | None = 199999, - eos_token_id: list[int] | None = [199999, 200020], - pad_token_id: int | None = 199999, - original_max_position_embeddings: int | None = 4096, - sliding_window: int | None = None, - vision_config: dict | None = None, - audio_config: dict | None = None, - **kwargs, - ): - if isinstance(vision_config, dict): - vision_config = Phi4MultimodalVisionConfig(**vision_config) - elif vision_config is None: - vision_config = Phi4MultimodalVisionConfig() - self.vision_config = vision_config - - if isinstance(audio_config, dict): - audio_config = Phi4MultimodalAudioConfig(**audio_config) - elif audio_config is None: - audio_config = Phi4MultimodalAudioConfig() - self.audio_config = audio_config - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.resid_pdrop = resid_pdrop - self.embd_pdrop = embd_pdrop - self.attention_dropout = attention_dropout - self.hidden_act = hidden_act - self.max_position_embeddings = max_position_embeddings - self.original_max_position_embeddings = 
original_max_position_embeddings - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.rope_parameters = rope_parameters - kwargs.setdefault("partial_rotary_factor", 1.0) # assign default for BC - self.sliding_window = sliding_window - - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.tie_word_embeddings = tie_word_embeddings - super().__init__(**kwargs) + def __post_init__(self, **kwargs): + self.eos_token_id = self.eos_token_id or [199999, 200020] + if isinstance(self.vision_config, dict): + self.vision_config = Phi4MultimodalVisionConfig(**self.vision_config) + elif self.vision_config is None: + self.vision_config = Phi4MultimodalVisionConfig() + + if isinstance(self.audio_config, dict): + self.audio_config = Phi4MultimodalAudioConfig(**self.audio_config) + elif self.audio_config is None: + self.audio_config = Phi4MultimodalAudioConfig() + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads + + super().__post_init__(**kwargs) def convert_rope_params_to_dict( self, default_theta: int | float = 10_000.0, ignore_keys: set | None = None, **kwargs @@ -308,21 +243,20 @@ def convert_rope_params_to_dict( # Standardize and validate the correctness of rotary position embeddings parameters self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", default_theta)) - self.rope_parameters.setdefault("partial_rotary_factor", kwargs["partial_rotary_factor"]) + self.rope_parameters.setdefault("partial_rotary_factor", kwargs.get("partial_rotary_factor", 1.0)) self.standardize_rope_params() # For backward compatibility if previous version used "su" or "yarn" rope_parameters_type = self.rope_parameters.get("rope_type", None) if rope_parameters_type is not None and rope_parameters_type in ["su", "yarn"]: self.rope_parameters["rope_type"] = "longrope" - self.validate_rope(ignore_keys=ignore_keys) return kwargs - def validate_rope(self, ignore_keys: set | None = None): + def validate_rope(self): """ Validate the `rope_parameters` configuration. """ - super().validate_rope(ignore_keys=ignore_keys) + super().validate_rope() # Run Phi4Multimodal specific validation if not isinstance(self.rope_parameters, dict): diff --git a/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py index 27d02b9ada7f..9e6c0339098d 100644 --- a/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +++ b/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py @@ -72,7 +72,7 @@ def simple_eager_attention_forward( value_states: torch.Tensor, attention_mask: torch.Tensor | None, scaling: float, - dropout: float = 0.0, + dropout: float | int = 0.0, **kwargs: Unpack[TransformersKwargs], ): attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * scaling diff --git a/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py index ef6bf1588c47..ac56450700ce 100644 --- a/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py +++ b/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py @@ -18,6 +18,7 @@ import numpy as np import torch import torch.nn.functional as F +from huggingface_hub.dataclasses import strict from torch import nn from ... 
import initialization as init @@ -31,7 +32,6 @@ BaseModelOutputWithPooling, CausalLMOutputWithPast, ) -from ...modeling_rope_utils import RopeParameters from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack from ...utils import auto_docstring, logging @@ -65,6 +65,7 @@ @auto_docstring(checkpoint="microsoft/Phi-4-multimodal-instruct") +@strict(accept_kwargs=True) class Phi4MultimodalVisionConfig(SiglipVisionConfig): r""" crop_size (`int`, *optional*, defaults to 448): @@ -83,42 +84,19 @@ class Phi4MultimodalVisionConfig(SiglipVisionConfig): model_type = "phi4_multimodal_vision" - def __init__( - self, - hidden_size=1152, - intermediate_size=4304, - num_hidden_layers=27, - num_attention_heads=16, - num_channels=3, - image_size=448, - patch_size=14, - hidden_act="gelu_pytorch_tanh", - layer_norm_eps=1e-6, - attention_dropout=0.0, - crop_size: int = 448, - image_token_id: int = 200010, - feature_layer: int = -2, - **kwargs, - ): - super().__init__( - hidden_size=hidden_size, - intermediate_size=intermediate_size, - num_hidden_layers=num_hidden_layers, - num_attention_heads=num_attention_heads, - num_channels=num_channels, - image_size=image_size, - patch_size=patch_size, - hidden_act=hidden_act, - layer_norm_eps=layer_norm_eps, - attention_dropout=attention_dropout, - **kwargs, - ) - self.crop_size = crop_size - self.image_token_id = image_token_id - self.feature_layer = feature_layer + hidden_size: int = 1152 + intermediate_size: int = 4304 + num_hidden_layers: int = 27 + num_attention_heads: int = 16 + image_size: int | list[int] | tuple[int, int] = 448 + patch_size: int | list[int] | tuple[int, int] = 14 + crop_size: int = 448 + image_token_id: int = 200010 + feature_layer: int = -2 @auto_docstring(checkpoint="microsoft/Phi-4-multimodal-instruct") +@strict(accept_kwargs=True) class Phi4MultimodalAudioConfig(PreTrainedConfig): r""" num_blocks (`int`, *optional*, defaults to 24): @@ -171,69 +149,46 @@ class Phi4MultimodalAudioConfig(PreTrainedConfig): model_type = "phi4_multimodal_audio" - def __init__( - self, - hidden_size: int = 1024, - intermediate_size: int = 1536, - num_blocks: int = 24, - num_attention_heads: int = 16, - activation: str = "swish", - chunk_size: int = -1, - left_chunk: int = 18, - dropout_rate: float = 0.0, - ext_pw_out_channel: int = 1024, - depthwise_separable_out_channel: int = 1024, - depthwise_multiplier: int = 1, - kernel_size: int = 3, - conv_activation: str = "swish", - input_size: int = 80, - conv_glu_type: str = "swish", - time_reduction: int = 8, - bias_max_distance: int = 1000, - bias_symmetric: bool = False, - nemo_activation: str = "relu", - nemo_conv_channels: int = 1024, - downsample_rate: int = 1, - initializer_range: float = 0.02, - audio_token_id: int = 200011, - feature_layer: int = -2, - **kwargs, - ): - super().__init__(**kwargs) - self.hidden_size = hidden_size - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.activation = activation - self.chunk_size = chunk_size - self.left_chunk = left_chunk - self.num_blocks = num_blocks - self.dropout_rate = dropout_rate - self.ext_pw_out_channel = ext_pw_out_channel - self.depthwise_separable_out_channel = depthwise_separable_out_channel - self.depthwise_multiplier = depthwise_multiplier - self.kernel_size = kernel_size - self.conv_activation = conv_activation - self.input_size = input_size - self.conv_glu_type = conv_glu_type - self.time_reduction = time_reduction - self.bias_max_distance = 
bias_max_distance - self.bias_symmetric = bias_symmetric - self.nemo_activation = nemo_activation - self.nemo_conv_channels = nemo_conv_channels - self.downsample_rate = downsample_rate - self.audio_token_id = audio_token_id - self.initializer_range = initializer_range - self.feature_layer = feature_layer - - if time_reduction % 2 != 0: + hidden_size: int = 1024 + intermediate_size: int = 1536 + num_blocks: int = 24 + num_attention_heads: int = 16 + activation: str = "swish" + chunk_size: int = -1 + left_chunk: int = 18 + dropout_rate: float = 0.0 + ext_pw_out_channel: int = 1024 + depthwise_separable_out_channel: int = 1024 + depthwise_multiplier: int = 1 + kernel_size: int = 3 + conv_activation: str = "swish" + input_size: int = 80 + conv_glu_type: str = "swish" + time_reduction: int = 8 + bias_max_distance: int = 1000 + bias_symmetric: bool = False + nemo_activation: str = "relu" + nemo_conv_channels: int = 1024 + downsample_rate: int = 1 + initializer_range: float = 0.02 + audio_token_id: int = 200011 + feature_layer: int = -2 + + def __post_init__(self, **kwargs): + nemo_final_size = self.input_size + for _ in range(int(math.log2(self.time_reduction))): + nemo_final_size = math.floor((nemo_final_size - 1) / 2 + 1) + self.nemo_final_size = nemo_final_size + super().__post_init__(**kwargs) + + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + if self.time_reduction % 2 != 0: raise ValueError("`time_reduction` should be a multiple of 2!") - length = input_size - for _ in range(int(math.log2(time_reduction))): - length = math.floor((length - 1) / 2 + 1) - self.nemo_final_size = length @auto_docstring(checkpoint="microsoft/Phi-4-multimodal-instruct") +@strict(accept_kwargs=True) class Phi4MultimodalConfig(Phi3Config): r""" original_max_position_embeddings (`int`, *optional*, defaults to 4096): @@ -257,69 +212,28 @@ class Phi4MultimodalConfig(Phi3Config): sub_configs = {"audio_config": Phi4MultimodalAudioConfig, "vision_config": Phi4MultimodalVisionConfig} - def __init__( - self, - vocab_size: int | None = 200064, - hidden_size: int | None = 3072, - intermediate_size: int | None = 8192, - num_hidden_layers: int | None = 32, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = 8, - resid_pdrop: float | None = 0.0, - embd_pdrop: float | None = 0.0, - attention_dropout: float | None = 0.0, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 131072, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-5, - use_cache: bool | None = True, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - bos_token_id: int | None = 199999, - eos_token_id: list[int] | None = [199999, 200020], - pad_token_id: int | None = 199999, - original_max_position_embeddings: int | None = 4096, - sliding_window: int | None = None, - vision_config: dict | None = None, - audio_config: dict | None = None, - **kwargs, - ): - if isinstance(vision_config, dict): - vision_config = Phi4MultimodalVisionConfig(**vision_config) - elif vision_config is None: - vision_config = Phi4MultimodalVisionConfig() - self.vision_config = vision_config - - if isinstance(audio_config, dict): - audio_config = Phi4MultimodalAudioConfig(**audio_config) - elif audio_config is None: - audio_config = Phi4MultimodalAudioConfig() - self.audio_config = audio_config - - super().__init__( - vocab_size=vocab_size, - hidden_size=hidden_size, - 
intermediate_size=intermediate_size, - num_hidden_layers=num_hidden_layers, - num_attention_heads=num_attention_heads, - num_key_value_heads=num_key_value_heads, - resid_pdrop=resid_pdrop, - embd_pdrop=embd_pdrop, - attention_dropout=attention_dropout, - hidden_act=hidden_act, - max_position_embeddings=max_position_embeddings, - initializer_range=initializer_range, - rms_norm_eps=rms_norm_eps, - use_cache=use_cache, - tie_word_embeddings=tie_word_embeddings, - rope_parameters=rope_parameters, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - pad_token_id=pad_token_id, - original_max_position_embeddings=original_max_position_embeddings, - sliding_window=sliding_window, - **kwargs, - ) + vocab_size: int = 200064 + num_key_value_heads: int | None = 8 + max_position_embeddings: int = 131072 + bos_token_id: int | None = 199999 + eos_token_id: int | list[int] | None = None + pad_token_id: int | None = 199999 + original_max_position_embeddings: int | None = 4096 + vision_config: dict | PreTrainedConfig | None = None + audio_config: dict | PreTrainedConfig | None = None + + def __post_init__(self, **kwargs): + self.eos_token_id = self.eos_token_id or [199999, 200020] + if isinstance(self.vision_config, dict): + self.vision_config = Phi4MultimodalVisionConfig(**self.vision_config) + elif self.vision_config is None: + self.vision_config = Phi4MultimodalVisionConfig() + + if isinstance(self.audio_config, dict): + self.audio_config = Phi4MultimodalAudioConfig(**self.audio_config) + elif self.audio_config is None: + self.audio_config = Phi4MultimodalAudioConfig() + super().__post_init__(**kwargs) class Phi4MultimodalVisionMLP(SiglipMLP): @@ -333,7 +247,7 @@ def simple_eager_attention_forward( value_states: torch.Tensor, attention_mask: torch.Tensor | None, scaling: float, - dropout: float = 0.0, + dropout: float | int = 0.0, **kwargs: Unpack[TransformersKwargs], ): attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * scaling diff --git a/src/transformers/models/phimoe/configuration_phimoe.py b/src/transformers/models/phimoe/configuration_phimoe.py index ef3ed1b1e50c..2930bd4c8b79 100644 --- a/src/transformers/models/phimoe/configuration_phimoe.py +++ b/src/transformers/models/phimoe/configuration_phimoe.py @@ -14,15 +14,15 @@ """PyTorch Phi-MoE model.""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters -from ...utils import auto_docstring, logging - - -logger = logging.get_logger(__name__) +from ...utils import auto_docstring @auto_docstring(checkpoint="microsoft/Phi-3.5-MoE-instruct") +@strict(accept_kwargs=True) class PhimoeConfig(PreTrainedConfig): r""" num_local_experts (`int`, *optional*, defaults to 16): @@ -46,75 +46,43 @@ class PhimoeConfig(PreTrainedConfig): keys_to_ignore_at_inference = ["past_key_values"] default_theta = 1000000.0 - def __init__( - self, - vocab_size: int | None = 32064, - hidden_size: int | None = 4096, - intermediate_size: int | None = 6400, - num_hidden_layers: int | None = 32, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = 8, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 4096 * 32, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-5, - use_cache: bool | None = True, - pad_token_id: int | None = None, - bos_token_id: int | None = 1, - eos_token_id: int | None = 2, - tie_word_embeddings: int | None = False, - rope_parameters: RopeParameters | dict[str, 
RopeParameters] | None = None, - sliding_window: int | None = None, - attention_dropout: float | None = 0.0, - num_experts_per_tok: int | None = 2, - num_local_experts: int | None = 16, - output_router_logits: bool | None = False, - router_aux_loss_coef: float | None = 0.001, - router_jitter_noise: float | None = 0.01, - input_jitter_noise: float | None = 0.0, - attention_bias: bool | None = False, - lm_head_bias: bool | None = False, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.sliding_window = sliding_window - self.attention_bias = attention_bias - self.lm_head_bias = lm_head_bias - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_dropout = attention_dropout - - self.num_experts_per_tok = num_experts_per_tok - self.num_local_experts = num_local_experts - self.output_router_logits = output_router_logits - self.router_aux_loss_coef = router_aux_loss_coef - self.router_jitter_noise = router_jitter_noise - self.input_jitter_noise = input_jitter_noise - self.rope_parameters = rope_parameters - - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - super().__init__(**kwargs) - - def validate_rope(self, ignore_keys=None): + vocab_size: int = 32064 + hidden_size: int = 4096 + intermediate_size: int = 6400 + num_hidden_layers: int = 32 + num_attention_heads: int = 32 + num_key_value_heads: int = 8 + hidden_act: str = "silu" + max_position_embeddings: int = 4096 * 32 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-5 + use_cache: bool = True + pad_token_id: int | None = None + bos_token_id: int | None = 1 + eos_token_id: int | list[int] | None = 2 + tie_word_embeddings: int = False + rope_parameters: RopeParameters | dict | None = None + sliding_window: int | None = None + attention_dropout: float | int = 0.0 + num_experts_per_tok: int = 2 + num_local_experts: int = 16 + output_router_logits: bool = False + router_aux_loss_coef: float = 0.001 + router_jitter_noise: float = 0.01 + input_jitter_noise: float = 0.0 + attention_bias: bool = False + lm_head_bias: bool = False + + def __post_init__(self, **kwargs): + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads + super().__post_init__(**kwargs) + + def validate_rope(self): """ Validate the `rope_parameters` configuration. """ - super().validate_rope(ignore_keys=ignore_keys) + super().validate_rope() # Run model-specific rope validation if self.rope_parameters["rope_type"] != "default": diff --git a/src/transformers/models/pi0/configuration_pi0.py b/src/transformers/models/pi0/configuration_pi0.py index fa4406a48713..a6b064989fb1 100644 --- a/src/transformers/models/pi0/configuration_pi0.py +++ b/src/transformers/models/pi0/configuration_pi0.py @@ -17,12 +17,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
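The Phimoe rewrite above is the template repeated across this PR: the old `__init__` signature becomes typed class-level defaults, and anything the constructor used to back-fill (such as `num_key_value_heads`) moves into `__post_init__`. A minimal sketch of that shape with a made-up `ToyConfig`, assuming the `@strict`/`__post_init__` machinery this branch adds to `PreTrainedConfig` and a `huggingface_hub` release that ships `dataclasses.strict`:

from huggingface_hub.dataclasses import strict

from transformers.configuration_utils import PreTrainedConfig


@strict(accept_kwargs=True)
class ToyConfig(PreTrainedConfig):
    model_type = "toy"

    hidden_size: int = 1024
    num_attention_heads: int = 16
    num_key_value_heads: int | None = None  # back-filled below, as the old __init__ did

    def __post_init__(self, **kwargs):
        # Derived / backward-compatibility values that used to be computed in __init__ live here now.
        if self.num_key_value_heads is None:
            self.num_key_value_heads = self.num_attention_heads
        super().__post_init__(**kwargs)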
+from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring from ..auto import CONFIG_MAPPING, AutoConfig @auto_docstring(checkpoint="lerobot/pi0_base") +@strict(accept_kwargs=True) class PI0Config(PreTrainedConfig): r""" vlm_config (`dict`, *optional*): @@ -66,30 +69,26 @@ class PI0Config(PreTrainedConfig): model_type = "pi0" sub_configs = {"vlm_config": AutoConfig, "dit_config": AutoConfig} - def __init__( - self, - vlm_config=None, - dit_config=None, - image_token_id=257152, - vlm_projection_dim=2048, - chunk_size=50, - max_state_dim=32, - max_action_dim=32, - num_inference_steps=10, - time_sampling_beta_alpha=1.5, - time_sampling_beta_beta=1.0, - time_sampling_scale=0.999, - time_sampling_offset=0.001, - min_period=4e-3, - max_period=4.0, - loss_reduction="mean", - **kwargs, - ): - if isinstance(vlm_config, dict): - vlm_model_type = vlm_config.get("model_type", "paligemma") - vlm_config = CONFIG_MAPPING[vlm_model_type](**vlm_config) - elif vlm_config is None: - vlm_config = CONFIG_MAPPING["paligemma"]( + vlm_config: dict | PreTrainedConfig | None = None + dit_config: dict | PreTrainedConfig | None = None + chunk_size: int = 50 + max_state_dim: int = 32 + max_action_dim: int = 32 + num_inference_steps: int = 10 + time_sampling_beta_alpha: float = 1.5 + time_sampling_beta_beta: float = 1.0 + time_sampling_scale: float = 0.999 + time_sampling_offset: float = 0.001 + min_period: float = 4e-3 + max_period: float = 4.0 + loss_reduction: str = "mean" + + def __post_init__(self, **kwargs): + if isinstance(self.vlm_config, dict): + vlm_model_type = self.vlm_config.get("model_type", "paligemma") + self.vlm_config = CONFIG_MAPPING[vlm_model_type](**self.vlm_config) + elif self.vlm_config is None: + self.vlm_config = CONFIG_MAPPING["paligemma"]( text_config={ "model_type": "gemma", "hidden_size": 2048, @@ -110,47 +109,34 @@ def __init__( "vocab_size": 257152, "vision_use_head": False, }, - projection_dim=vlm_projection_dim, - image_token_id=image_token_id, + projection_dim=2048, + image_token_id=257152, ) - if isinstance(dit_config, dict): - dit_model_type = dit_config.get("model_type", "gemma") - dit_config = CONFIG_MAPPING[dit_model_type](**dit_config) - elif dit_config is None: - dit_config = CONFIG_MAPPING["gemma"]( + if isinstance(self.dit_config, dict): + dit_model_type = self.dit_config.get("model_type", "gemma") + self.dit_config = CONFIG_MAPPING[dit_model_type](**self.dit_config) + elif self.dit_config is None: + self.dit_config = CONFIG_MAPPING["gemma"]( hidden_size=1024, num_hidden_layers=18, intermediate_size=4096, num_attention_heads=8, num_key_value_heads=1, head_dim=256, - vocab_size=vlm_config.text_config.vocab_size, + vocab_size=self.vlm_config.text_config.vocab_size, ) - self.dit_config = dit_config - self.vlm_config = vlm_config - # Force bidirectional attention self.dit_config.is_causal = False self.dit_config.use_bidirectional_attention = True self.vlm_config.text_config.use_bidirectional_attention = True + super().__post_init__(**kwargs) + def validate_architecture(self): + """Part of `@strict`-powered validation. 
Validates the architecture of the config.""" if self.dit_config.hidden_size % 2 != 0: raise ValueError(f"DiT hidden dim=({self.config.dit_config.hidden_size}) must be divisible by 2") - self.chunk_size = chunk_size - self.max_state_dim = max_state_dim - self.max_action_dim = max_action_dim - self.num_inference_steps = num_inference_steps - self.time_sampling_beta_alpha = time_sampling_beta_alpha - self.time_sampling_beta_beta = time_sampling_beta_beta - self.time_sampling_scale = time_sampling_scale - self.time_sampling_offset = time_sampling_offset - self.min_period = min_period - self.max_period = max_period - self.loss_reduction = loss_reduction - super().__init__(**kwargs) - __all__ = ["PI0Config"] diff --git a/src/transformers/models/pi0/modular_pi0.py b/src/transformers/models/pi0/modular_pi0.py index 89f76cfcef6b..8c93c2522d68 100644 --- a/src/transformers/models/pi0/modular_pi0.py +++ b/src/transformers/models/pi0/modular_pi0.py @@ -19,6 +19,7 @@ import numpy as np import torch import torch.nn.functional as F +from huggingface_hub.dataclasses import strict from torch import nn from ... import initialization as init @@ -171,6 +172,7 @@ def model_input_names(self): @auto_docstring(checkpoint="lerobot/pi0_base") +@strict(accept_kwargs=True) class PI0Config(PreTrainedConfig): r""" vlm_config (`dict`, *optional*): @@ -214,30 +216,26 @@ class PI0Config(PreTrainedConfig): model_type = "pi0" sub_configs = {"vlm_config": AutoConfig, "dit_config": AutoConfig} - def __init__( - self, - vlm_config=None, - dit_config=None, - image_token_id=257152, - vlm_projection_dim=2048, - chunk_size=50, - max_state_dim=32, - max_action_dim=32, - num_inference_steps=10, - time_sampling_beta_alpha=1.5, - time_sampling_beta_beta=1.0, - time_sampling_scale=0.999, - time_sampling_offset=0.001, - min_period=4e-3, - max_period=4.0, - loss_reduction="mean", - **kwargs, - ): - if isinstance(vlm_config, dict): - vlm_model_type = vlm_config.get("model_type", "paligemma") - vlm_config = CONFIG_MAPPING[vlm_model_type](**vlm_config) - elif vlm_config is None: - vlm_config = CONFIG_MAPPING["paligemma"]( + vlm_config: dict | PreTrainedConfig | None = None + dit_config: dict | PreTrainedConfig | None = None + chunk_size: int = 50 + max_state_dim: int = 32 + max_action_dim: int = 32 + num_inference_steps: int = 10 + time_sampling_beta_alpha: float = 1.5 + time_sampling_beta_beta: float = 1.0 + time_sampling_scale: float = 0.999 + time_sampling_offset: float = 0.001 + min_period: float = 4e-3 + max_period: float = 4.0 + loss_reduction: str = "mean" + + def __post_init__(self, **kwargs): + if isinstance(self.vlm_config, dict): + vlm_model_type = self.vlm_config.get("model_type", "paligemma") + self.vlm_config = CONFIG_MAPPING[vlm_model_type](**self.vlm_config) + elif self.vlm_config is None: + self.vlm_config = CONFIG_MAPPING["paligemma"]( text_config={ "model_type": "gemma", "hidden_size": 2048, @@ -258,48 +256,35 @@ def __init__( "vocab_size": 257152, "vision_use_head": False, }, - projection_dim=vlm_projection_dim, - image_token_id=image_token_id, + projection_dim=2048, + image_token_id=257152, ) - if isinstance(dit_config, dict): - dit_model_type = dit_config.get("model_type", "gemma") - dit_config = CONFIG_MAPPING[dit_model_type](**dit_config) - elif dit_config is None: - dit_config = CONFIG_MAPPING["gemma"]( + if isinstance(self.dit_config, dict): + dit_model_type = self.dit_config.get("model_type", "gemma") + self.dit_config = CONFIG_MAPPING[dit_model_type](**self.dit_config) + elif self.dit_config is None: + 
self.dit_config = CONFIG_MAPPING["gemma"]( hidden_size=1024, num_hidden_layers=18, intermediate_size=4096, num_attention_heads=8, num_key_value_heads=1, head_dim=256, - vocab_size=vlm_config.text_config.vocab_size, + vocab_size=self.vlm_config.text_config.vocab_size, ) - self.dit_config = dit_config - self.vlm_config = vlm_config - # Force bidirectional attention self.dit_config.is_causal = False self.dit_config.use_bidirectional_attention = True self.vlm_config.text_config.use_bidirectional_attention = True + super().__post_init__(**kwargs) + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" if self.dit_config.hidden_size % 2 != 0: raise ValueError(f"DiT hidden dim=({self.config.dit_config.hidden_size}) must be divisible by 2") - self.chunk_size = chunk_size - self.max_state_dim = max_state_dim - self.max_action_dim = max_action_dim - self.num_inference_steps = num_inference_steps - self.time_sampling_beta_alpha = time_sampling_beta_alpha - self.time_sampling_beta_beta = time_sampling_beta_beta - self.time_sampling_scale = time_sampling_scale - self.time_sampling_offset = time_sampling_offset - self.min_period = min_period - self.max_period = max_period - self.loss_reduction = loss_reduction - super().__init__(**kwargs) - def blockwise_bidirectional_mask(block_boundaries: torch.Tensor) -> Callable: def inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool: diff --git a/src/transformers/models/pix2struct/configuration_pix2struct.py b/src/transformers/models/pix2struct/configuration_pix2struct.py index d6cd507ed807..7064891e0c4e 100644 --- a/src/transformers/models/pix2struct/configuration_pix2struct.py +++ b/src/transformers/models/pix2struct/configuration_pix2struct.py @@ -13,6 +13,8 @@ # limitations under the License. 
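Composite configs such as `PI0Config` and `Phi4MultimodalConfig` keep accepting a dict, a ready-made config object, or `None` for each sub-config; the normalization that used to open `__init__` now happens in `__post_init__`, and cross-field checks move into the `validate_architecture` hook. A hypothetical sketch of both pieces (the `ToyComposite*` names are illustrative, not part of the PR):

from huggingface_hub.dataclasses import strict

from transformers.configuration_utils import PreTrainedConfig


@strict(accept_kwargs=True)
class ToyCompositeSubConfig(PreTrainedConfig):
    hidden_size: int = 256


@strict(accept_kwargs=True)
class ToyCompositeConfig(PreTrainedConfig):
    sub_configs = {"vision_config": ToyCompositeSubConfig}

    vision_config: dict | PreTrainedConfig | None = None

    def __post_init__(self, **kwargs):
        # Accept a plain dict (e.g. deserialized JSON), an existing config, or nothing at all.
        if isinstance(self.vision_config, dict):
            self.vision_config = ToyCompositeSubConfig(**self.vision_config)
        elif self.vision_config is None:
            self.vision_config = ToyCompositeSubConfig()
        super().__post_init__(**kwargs)

    def validate_architecture(self):
        """Part of `@strict`-powered validation. Runs cross-field checks after __post_init__."""
        if self.vision_config.hidden_size % 2 != 0:
            raise ValueError("`vision_config.hidden_size` must be divisible by 2")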
"""Pix2Struct model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring, logging @@ -21,6 +23,7 @@ @auto_docstring(checkpoint="google/pix2struct-base") +@strict(accept_kwargs=True) class Pix2StructTextConfig(PreTrainedConfig): r""" relative_attention_num_buckets (`int`, *optional*, defaults to 32): @@ -57,60 +60,30 @@ class Pix2StructTextConfig(PreTrainedConfig): "decoder_layers": "num_layers", } - def __init__( - self, - vocab_size=50244, - hidden_size=768, - d_kv=64, - d_ff=2048, - num_layers=12, - num_heads=12, - relative_attention_num_buckets=32, - relative_attention_max_distance=128, - dropout_rate=0.1, - layer_norm_epsilon=1e-6, - initializer_factor=1.0, - dense_act_fn="gelu_new", - decoder_start_token_id=0, - use_cache=False, - pad_token_id=0, - eos_token_id=1, - bos_token_id=None, - tie_word_embeddings=False, - is_decoder=True, - add_cross_attention=False, - **kwargs, - ): - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.d_kv = d_kv - self.d_ff = d_ff - self.num_layers = num_layers - self.num_heads = num_heads - self.relative_attention_num_buckets = relative_attention_num_buckets - self.relative_attention_max_distance = relative_attention_max_distance - self.dropout_rate = dropout_rate - self.layer_norm_epsilon = layer_norm_epsilon - self.initializer_factor = initializer_factor - self.use_cache = use_cache - - self.eos_token_id = eos_token_id - self.bos_token_id = bos_token_id - self.decoder_start_token_id = decoder_start_token_id - - # for backwards compatibility - self.dense_act_fn = dense_act_fn - - self.pad_token_id = pad_token_id - self.eos_token_id = eos_token_id - self.decoder_start_token_id = decoder_start_token_id - self.tie_word_embeddings = tie_word_embeddings - self.is_decoder = is_decoder - self.add_cross_attention = add_cross_attention - super().__init__(**kwargs) + vocab_size: int = 50244 + hidden_size: int = 768 + d_kv: int = 64 + d_ff: int = 2048 + num_layers: int = 12 + num_heads: int = 12 + relative_attention_num_buckets: int = 32 + relative_attention_max_distance: int = 128 + dropout_rate: float = 0.1 + layer_norm_epsilon: float = 1e-6 + initializer_factor: float = 1.0 + dense_act_fn: str = "gelu_new" + decoder_start_token_id: int = 0 + use_cache: bool = False + pad_token_id: int | None = 0 + eos_token_id: int | None = 1 + bos_token_id: int | None = None + tie_word_embeddings: bool = False + is_decoder: bool = True + add_cross_attention: bool = False @auto_docstring(checkpoint="google/pix2struct-base") +@strict(accept_kwargs=True) class Pix2StructVisionConfig(PreTrainedConfig): r""" dense_act_fn (`Union[Callable, str]`, *optional*, defaults to `"gelu_new"`): @@ -147,45 +120,25 @@ class Pix2StructVisionConfig(PreTrainedConfig): model_type = "pix2struct_vision_model" - def __init__( - self, - hidden_size=768, - patch_embed_hidden_size=768, - d_ff=2048, - d_kv=64, - num_hidden_layers=12, - num_attention_heads=12, - dense_act_fn="gelu_new", - layer_norm_eps=1e-6, - dropout_rate=0.0, - attention_dropout=0.0, - initializer_range=1e-10, - initializer_factor=1.0, - seq_len=4096, - relative_attention_num_buckets=32, - relative_attention_max_distance=128, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.patch_embed_hidden_size = patch_embed_hidden_size - self.d_ff = d_ff - self.dropout_rate = dropout_rate - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - 
self.initializer_range = initializer_range - self.initializer_factor = initializer_factor - self.attention_dropout = attention_dropout - self.layer_norm_eps = layer_norm_eps - self.dense_act_fn = dense_act_fn - self.seq_len = seq_len - self.relative_attention_num_buckets = relative_attention_num_buckets - self.relative_attention_max_distance = relative_attention_max_distance - self.d_kv = d_kv + hidden_size: int = 768 + patch_embed_hidden_size: int = 768 + d_ff: int = 2048 + d_kv: int = 64 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + dense_act_fn: str = "gelu_new" + layer_norm_eps: float = 1e-6 + dropout_rate: float = 0.0 + attention_dropout: float | int = 0.0 + initializer_range: float = 1e-10 + initializer_factor: float = 1.0 + seq_len: int = 4096 + relative_attention_num_buckets: int = 32 + relative_attention_max_distance: int = 128 @auto_docstring(checkpoint="google/pix2struct-base") +@strict(accept_kwargs=True) class Pix2StructConfig(PreTrainedConfig): r""" is_vqa (`bool`, *optional*, defaults to `False`): @@ -217,49 +170,40 @@ class Pix2StructConfig(PreTrainedConfig): model_type = "pix2struct" sub_configs = {"text_config": Pix2StructTextConfig, "vision_config": Pix2StructVisionConfig} - def __init__( - self, - text_config=None, - vision_config=None, - initializer_factor=1.0, - initializer_range=0.02, - is_vqa=False, - tie_word_embeddings=False, - is_encoder_decoder=True, - **kwargs, - ): - if text_config is None: - text_config = Pix2StructTextConfig( - {"is_encoder_decoder": is_encoder_decoder, "tie_word_embeddings": tie_word_embeddings} + text_config: dict | PreTrainedConfig | None = None + vision_config: dict | PreTrainedConfig | None = None + initializer_factor: float = 1.0 + initializer_range: float = 0.02 + is_vqa: bool = False + tie_word_embeddings: bool = False + is_encoder_decoder: bool = True + + def __post_init__(self, **kwargs): + if self.text_config is None: + self.text_config = Pix2StructTextConfig( + is_encoder_decoder=self.is_encoder_decoder, + tie_word_embeddings=self.tie_word_embeddings, ) logger.info("`text_config` is `None`. initializing the `Pix2StructTextConfig` with default values.") - elif isinstance(text_config, dict): - text_config["is_encoder_decoder"] = is_encoder_decoder - text_config["tie_word_embeddings"] = tie_word_embeddings - text_config = Pix2StructTextConfig(**text_config) + elif isinstance(self.text_config, dict): + self.text_config["is_encoder_decoder"] = self.is_encoder_decoder + self.text_config["tie_word_embeddings"] = self.tie_word_embeddings + self.text_config = Pix2StructTextConfig(**self.text_config) - if vision_config is None: - vision_config = Pix2StructVisionConfig() + if self.vision_config is None: + self.vision_config = Pix2StructVisionConfig() logger.info("`vision_config` is `None`. 
initializing the `Pix2StructVisionConfig` with default values.") - elif isinstance(vision_config, dict): - vision_config = Pix2StructVisionConfig(**vision_config) - - self.text_config = text_config - self.vision_config = vision_config + elif isinstance(self.vision_config, dict): + self.vision_config = Pix2StructVisionConfig(**self.vision_config) self.decoder_start_token_id = self.text_config.decoder_start_token_id self.pad_token_id = self.text_config.pad_token_id self.eos_token_id = self.text_config.eos_token_id - self.initializer_factor = initializer_factor - self.initializer_range = initializer_range - self.text_config.initializer_range = self.initializer_range self.vision_config.initializer_range = self.initializer_range - self.is_vqa = is_vqa - self.tie_word_embeddings = tie_word_embeddings - super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + super().__post_init__(**kwargs) __all__ = ["Pix2StructConfig", "Pix2StructTextConfig", "Pix2StructVisionConfig"] diff --git a/src/transformers/models/pix2struct/modeling_pix2struct.py b/src/transformers/models/pix2struct/modeling_pix2struct.py index 8f7bcee2c46c..896a48d87234 100644 --- a/src/transformers/models/pix2struct/modeling_pix2struct.py +++ b/src/transformers/models/pix2struct/modeling_pix2struct.py @@ -508,7 +508,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if flattened_patches is None: raise ValueError("You have to specify flattened_patches") @@ -1004,7 +1004,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if self.gradient_checkpointing and self.training and use_cache: logger.warning( @@ -1280,7 +1280,7 @@ def forward( 5.94282 ```""" use_cache = use_cache if use_cache is not None else self.config.text_config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict # Encode if needed (training, first prediction pass) if encoder_outputs is None: diff --git a/src/transformers/models/pixio/configuration_pixio.py b/src/transformers/models/pixio/configuration_pixio.py index ecaf8841229c..58a8840dd4c5 100644 --- a/src/transformers/models/pixio/configuration_pixio.py +++ b/src/transformers/models/pixio/configuration_pixio.py @@ -17,12 +17,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
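From the call-site perspective the Pix2Struct conversion is meant to be invisible: keyword construction still works, the former constructor arguments are now type-checked dataclass fields, and sub-configs may still be given as dicts. A small usage sketch (argument values invented, behavior as described by the conversion above):

from transformers import Pix2StructConfig, Pix2StructTextConfig

# Plain keyword construction works as before; declared fields are type-checked by @strict.
text_config = Pix2StructTextConfig(hidden_size=512, num_layers=6)

# Sub-configs may still be passed as dicts; __post_init__ builds the config objects and
# copies the shared is_encoder_decoder / tie_word_embeddings flags into the text config.
config = Pix2StructConfig(text_config={"d_kv": 32}, vision_config=None)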
+from huggingface_hub.dataclasses import strict + from ...backbone_utils import BackboneConfigMixin from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring @auto_docstring(checkpoint="facebook/pixio-huge") +@strict(accept_kwargs=True) class PixioConfig(BackboneConfigMixin, PreTrainedConfig): r""" n_cls_tokens (`int`, *optional*, defaults to 8): @@ -51,51 +54,32 @@ class PixioConfig(BackboneConfigMixin, PreTrainedConfig): model_type = "pixio" - def __init__( - self, - hidden_size=1280, - num_hidden_layers=32, - num_attention_heads=16, - mlp_ratio=4, - n_cls_tokens=8, - hidden_act="gelu", - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - initializer_range=0.02, - layer_norm_eps=1e-6, - image_size=256, - patch_size=16, - num_channels=3, - qkv_bias=True, - drop_path_rate=0.0, - out_features=None, - out_indices=None, - apply_layernorm=True, - reshape_hidden_states=True, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.mlp_ratio = mlp_ratio - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.qkv_bias = qkv_bias - self.drop_path_rate = drop_path_rate - self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, num_hidden_layers + 1)] - self.set_output_features_output_indices(out_indices=out_indices, out_features=out_features) - self.apply_layernorm = apply_layernorm - self.reshape_hidden_states = reshape_hidden_states + hidden_size: int = 1280 + num_hidden_layers: int = 32 + num_attention_heads: int = 16 + mlp_ratio: int = 4 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.0 + attention_probs_dropout_prob: float = 0.0 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-6 + image_size: int | list[int] | tuple[int, int] = 256 + patch_size: int | list[int] | tuple[int, int] = 16 + num_channels: int = 3 + qkv_bias: bool = True + drop_path_rate: float = 0.0 + _out_features: list[str] | None = None + _out_indices: list[int] | None = None + apply_layernorm: bool = True + reshape_hidden_states: bool = True + n_cls_tokens: int = 8 - self.n_cls_tokens = n_cls_tokens + def __post_init__(self, **kwargs): + self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, self.num_hidden_layers + 1)] + self.set_output_features_output_indices( + out_indices=kwargs.pop("out_indices", None), out_features=kwargs.pop("out_features", None) + ) + super().__post_init__(**kwargs) __all__ = ["PixioConfig"] diff --git a/src/transformers/models/pixio/modular_pixio.py b/src/transformers/models/pixio/modular_pixio.py index 3446ed222283..709db52ae87b 100644 --- a/src/transformers/models/pixio/modular_pixio.py +++ b/src/transformers/models/pixio/modular_pixio.py @@ -14,12 +14,13 @@ """PyTorch Pixio model.""" import torch +from huggingface_hub.dataclasses import strict from torch import nn from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import BackboneOutput, BaseModelOutput, BaseModelOutputWithPooling from ...processing_utils import Unpack -from ...utils import TransformersKwargs, auto_docstring, is_tracing, logging +from ...utils import TransformersKwargs, auto_docstring, is_tracing from ...utils.generic 
import can_return_tuple, merge_with_config_defaults from ...utils.output_capturing import capture_outputs from ..dinov2.configuration_dinov2 import Dinov2Config @@ -31,10 +32,8 @@ from ..vit.modeling_vit import ViTAttention, ViTPatchEmbeddings, ViTPreTrainedModel, ViTSelfAttention -logger = logging.get_logger(__name__) - - @auto_docstring(checkpoint="facebook/pixio-huge") +@strict(accept_kwargs=True) class PixioConfig(Dinov2Config): r""" n_cls_tokens (`int`, *optional*, defaults to 8): @@ -63,53 +62,16 @@ class PixioConfig(Dinov2Config): model_type = "pixio" - def __init__( - self, - hidden_size=1280, - num_hidden_layers=32, - num_attention_heads=16, - mlp_ratio=4, - n_cls_tokens=8, - hidden_act="gelu", - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - initializer_range=0.02, - layer_norm_eps=1e-6, - image_size=256, - patch_size=16, - num_channels=3, - qkv_bias=True, - drop_path_rate=0.0, - out_features=None, - out_indices=None, - apply_layernorm=True, - reshape_hidden_states=True, - **kwargs, - ): - super().__init__( - hidden_size=hidden_size, - num_hidden_layers=num_hidden_layers, - num_attention_heads=num_attention_heads, - mlp_ratio=mlp_ratio, - hidden_act=hidden_act, - hidden_dropout_prob=hidden_dropout_prob, - attention_probs_dropout_prob=attention_probs_dropout_prob, - initializer_range=initializer_range, - layer_norm_eps=layer_norm_eps, - image_size=image_size, - patch_size=patch_size, - num_channels=num_channels, - qkv_bias=qkv_bias, - drop_path_rate=drop_path_rate, - apply_layernorm=apply_layernorm, - reshape_hidden_states=reshape_hidden_states, - ) - - self.n_cls_tokens = n_cls_tokens + hidden_size: int = 1280 + num_hidden_layers: int = 32 + num_attention_heads: int = 16 + n_cls_tokens: int = 8 + image_size: int | list[int] | tuple[int, int] = 256 + patch_size: int | list[int] | tuple[int, int] = 16 - del self.layerscale_value - del self.use_swiglu_ffn - del self.use_mask_token + layerscale_value = AttributeError() + use_swiglu_ffn = AttributeError() + use_mask_token = AttributeError() class PixioPatchEmbeddings(ViTPatchEmbeddings): diff --git a/src/transformers/models/pixtral/configuration_pixtral.py b/src/transformers/models/pixtral/configuration_pixtral.py index 3c1913a69ab9..66b6ac40763e 100644 --- a/src/transformers/models/pixtral/configuration_pixtral.py +++ b/src/transformers/models/pixtral/configuration_pixtral.py @@ -12,15 +12,15 @@ # limitations under the License. 
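Because the decorator is applied with `accept_kwargs=True`, keyword arguments that are no longer declared fields still reach `__post_init__(**kwargs)`; this is how `PixioConfig` above keeps accepting `out_features`/`out_indices` while storing them as `_out_features`/`_out_indices`, and how it rebuilds derived, non-field attributes such as `stage_names`. A reduced, hypothetical sketch of that idiom:

from huggingface_hub.dataclasses import strict

from transformers.configuration_utils import PreTrainedConfig


@strict(accept_kwargs=True)
class ToyBackboneConfig(PreTrainedConfig):
    num_hidden_layers: int = 32
    _out_indices: list[int] | None = None  # canonical storage behind the public out_indices argument

    def __post_init__(self, **kwargs):
        # Derived attribute that is not a declared field.
        self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, self.num_hidden_layers + 1)]
        # Legacy constructor argument consumed from the leftover kwargs.
        out_indices = kwargs.pop("out_indices", None)
        if out_indices is not None:
            self._out_indices = list(out_indices)
        super().__post_init__(**kwargs)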
"""Pixtral model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters -from ...utils import auto_docstring, logging - - -logger = logging.get_logger(__name__) +from ...utils import auto_docstring @auto_docstring(checkpoint="mistral-labs/pixtral-12b") +@strict(accept_kwargs=True) class PixtralVisionConfig(PreTrainedConfig): r""" Example: @@ -40,35 +40,21 @@ class PixtralVisionConfig(PreTrainedConfig): model_type = "pixtral" - def __init__( - self, - hidden_size: int | None = 1024, - intermediate_size: int | None = 4096, - num_hidden_layers: int | None = 24, - num_attention_heads: int | None = 16, - num_channels: int | None = 3, - image_size: int | None = 1024, - patch_size: int | None = 16, - hidden_act: str | None = "gelu", - attention_dropout: float | None = 0.0, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - initializer_range: float | None = 0.02, - **kwargs, - ): - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_channels = num_channels - self.patch_size = patch_size - self.image_size = image_size - self.attention_dropout = attention_dropout - self.hidden_act = hidden_act - self.head_dim = hidden_size // num_attention_heads - self.initializer_range = initializer_range - self.rope_parameters = rope_parameters - - super().__init__(**kwargs) + hidden_size: int = 1024 + intermediate_size: int = 4096 + num_hidden_layers: int = 24 + num_attention_heads: int = 16 + num_channels: int = 3 + image_size: int | list[int] | tuple[int, int] = 1024 + patch_size: int | list[int] | tuple[int, int] = 16 + hidden_act: str = "gelu" + attention_dropout: float | int = 0.0 + rope_parameters: RopeParameters | dict | None = None + initializer_range: float = 0.02 + + def __post_init__(self, **kwargs): + self.head_dim = self.hidden_size // self.num_attention_heads + super().__post_init__(**kwargs) __all__ = ["PixtralVisionConfig"] diff --git a/src/transformers/models/plbart/configuration_plbart.py b/src/transformers/models/plbart/configuration_plbart.py index 6b7680055263..01bb243d3bce 100644 --- a/src/transformers/models/plbart/configuration_plbart.py +++ b/src/transformers/models/plbart/configuration_plbart.py @@ -13,14 +13,14 @@ # limitations under the License. 
"""PLBART model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="uclanlp/plbart-base") +@strict(accept_kwargs=True) class PLBartConfig(PreTrainedConfig): r""" Example: @@ -44,68 +44,35 @@ class PLBartConfig(PreTrainedConfig): "num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model", "initializer_range": "init_std", + "num_hidden_layers": "encoder_layers", } - def __init__( - self, - vocab_size=50005, - max_position_embeddings=1024, - encoder_layers=6, - encoder_ffn_dim=3072, - encoder_attention_heads=12, - decoder_layers=6, - decoder_ffn_dim=3072, - decoder_attention_heads=12, - encoder_layerdrop=0.0, - decoder_layerdrop=0.0, - use_cache=True, - is_encoder_decoder=True, - activation_function="gelu", - d_model=768, - dropout=0.1, - attention_dropout=0.1, - activation_dropout=0.0, - init_std=0.02, - classifier_dropout=0.0, - scale_embedding=True, - pad_token_id=1, - bos_token_id=0, - eos_token_id=2, - forced_eos_token_id=2, - is_decoder=False, - tie_word_embeddings=True, - **kwargs, - ): - self.is_decoder = is_decoder - self.tie_word_embeddings = tie_word_embeddings - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.d_model = d_model - self.encoder_ffn_dim = encoder_ffn_dim - self.encoder_layers = encoder_layers - self.encoder_attention_heads = encoder_attention_heads - self.decoder_ffn_dim = decoder_ffn_dim - self.decoder_layers = decoder_layers - self.decoder_attention_heads = decoder_attention_heads - self.dropout = dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.activation_function = activation_function - self.init_std = init_std - self.encoder_layerdrop = encoder_layerdrop - self.decoder_layerdrop = decoder_layerdrop - self.classifier_dropout = classifier_dropout - self.use_cache = use_cache - self.num_hidden_layers = encoder_layers - self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.forced_eos_token_id = forced_eos_token_id - super().__init__( - is_encoder_decoder=is_encoder_decoder, - **kwargs, - ) + vocab_size: int = 50005 + max_position_embeddings: int = 1024 + encoder_layers: int = 6 + encoder_ffn_dim: int = 3072 + encoder_attention_heads: int = 12 + decoder_layers: int = 6 + decoder_ffn_dim: int = 3072 + decoder_attention_heads: int = 12 + encoder_layerdrop: float | int = 0.0 + decoder_layerdrop: float | int = 0.0 + use_cache: bool = True + is_encoder_decoder: bool = True + activation_function: str = "gelu" + d_model: int = 768 + dropout: float | int = 0.1 + attention_dropout: float | int = 0.1 + activation_dropout: float | int = 0.0 + init_std: float = 0.02 + classifier_dropout: float | int = 0.0 + scale_embedding: bool = True + pad_token_id: int | None = 1 + bos_token_id: int | None = 0 + eos_token_id: int | None = 2 + forced_eos_token_id: int | None = 2 + is_decoder: bool = False + tie_word_embeddings: bool = True __all__ = ["PLBartConfig"] diff --git a/src/transformers/models/poolformer/configuration_poolformer.py b/src/transformers/models/poolformer/configuration_poolformer.py index 37009398bd3e..83e63d14d092 100644 --- 
a/src/transformers/models/poolformer/configuration_poolformer.py +++ b/src/transformers/models/poolformer/configuration_poolformer.py @@ -13,14 +13,14 @@ # limitations under the License. """PoolFormer model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="sail/poolformer_s12") +@strict(accept_kwargs=True) class PoolFormerConfig(PreTrainedConfig): r""" stride (`int`, *optional*, defaults to 16): @@ -56,43 +56,22 @@ class PoolFormerConfig(PreTrainedConfig): model_type = "poolformer" - def __init__( - self, - num_channels=3, - patch_size=16, - stride=16, - pool_size=3, - mlp_ratio=4.0, - depths=[2, 2, 6, 2], - hidden_sizes=[64, 128, 320, 512], - patch_sizes=[7, 3, 3, 3], - strides=[4, 2, 2, 2], - padding=[2, 1, 1, 1], - num_encoder_blocks=4, - drop_path_rate=0.0, - hidden_act="gelu", - use_layer_scale=True, - layer_scale_init_value=1e-5, - initializer_range=0.02, - **kwargs, - ): - self.num_channels = num_channels - self.patch_size = patch_size - self.stride = stride - self.padding = padding - self.pool_size = pool_size - self.hidden_sizes = hidden_sizes - self.mlp_ratio = mlp_ratio - self.depths = depths - self.patch_sizes = patch_sizes - self.strides = strides - self.num_encoder_blocks = num_encoder_blocks - self.drop_path_rate = drop_path_rate - self.hidden_act = hidden_act - self.use_layer_scale = use_layer_scale - self.layer_scale_init_value = layer_scale_init_value - self.initializer_range = initializer_range - super().__init__(**kwargs) + num_channels: int = 3 + patch_size: int | list[int] | tuple[int, int] = 16 + stride: int = 16 + pool_size: int = 3 + mlp_ratio: float = 4.0 + depths: list[int] | tuple[int, ...] = (2, 2, 6, 2) + hidden_sizes: list[int] | tuple[int, ...] = (64, 128, 320, 512) + patch_sizes: list[int] | tuple[int, ...] = (7, 3, 3, 3) + strides: list[int] | tuple[int, ...] = (4, 2, 2, 2) + padding: list[int] | tuple[int, ...] = (2, 1, 1, 1) + num_encoder_blocks: int = 4 + drop_path_rate: float = 0.0 + hidden_act: str = "gelu" + use_layer_scale: bool = True + layer_scale_init_value: float = 1e-5 + initializer_range: float = 0.02 __all__ = ["PoolFormerConfig"] diff --git a/src/transformers/models/poolformer/modeling_poolformer.py b/src/transformers/models/poolformer/modeling_poolformer.py index 52f8e4843ab9..4dd2b52fffa2 100755 --- a/src/transformers/models/poolformer/modeling_poolformer.py +++ b/src/transformers/models/poolformer/modeling_poolformer.py @@ -283,7 +283,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -356,7 +356,7 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.poolformer( pixel_values, diff --git a/src/transformers/models/pop2piano/configuration_pop2piano.py b/src/transformers/models/pop2piano/configuration_pop2piano.py index 2a98116921f0..1e063ba5015b 100644 --- a/src/transformers/models/pop2piano/configuration_pop2piano.py +++ b/src/transformers/models/pop2piano/configuration_pop2piano.py @@ -13,14 +13,14 @@ # limitations under the License. """Pop2Piano model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="sweetcocoa/pop2piano") +@strict(accept_kwargs=True) class Pop2PianoConfig(PreTrainedConfig): r""" composer_vocab_size (`int`, *optional*, defaults to 21): @@ -37,58 +37,34 @@ class Pop2PianoConfig(PreTrainedConfig): model_type = "pop2piano" keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = {"num_hidden_layers": "num_layers", "hidden_size": "d_model", "num_attention_heads": "num_heads"} - def __init__( - self, - vocab_size=2400, - composer_vocab_size=21, - d_model=512, - d_kv=64, - d_ff=2048, - num_layers=6, - num_decoder_layers=None, - num_heads=8, - relative_attention_num_buckets=32, - relative_attention_max_distance=128, - dropout_rate=0.1, - layer_norm_epsilon=1e-6, - initializer_factor=1.0, - feed_forward_proj="gated-gelu", - is_encoder_decoder=True, - use_cache=True, - pad_token_id=0, - eos_token_id=1, - dense_act_fn="relu", - is_decoder=False, - tie_word_embeddings=True, - **kwargs, - ): - self.is_decoder = is_decoder - self.tie_word_embeddings = tie_word_embeddings - self.vocab_size = vocab_size - self.composer_vocab_size = composer_vocab_size - self.d_model = d_model - self.d_kv = d_kv - self.d_ff = d_ff - self.num_layers = num_layers - self.num_decoder_layers = num_decoder_layers if num_decoder_layers is not None else self.num_layers - self.num_heads = num_heads - self.relative_attention_num_buckets = relative_attention_num_buckets - self.relative_attention_max_distance = relative_attention_max_distance - self.dropout_rate = dropout_rate - self.layer_norm_epsilon = layer_norm_epsilon - self.initializer_factor = initializer_factor - self.feed_forward_proj = feed_forward_proj - self.use_cache = use_cache - self.dense_act_fn = dense_act_fn - self.is_gated_act = self.feed_forward_proj.split("-")[0] == "gated" - self.hidden_size = self.d_model - self.num_attention_heads = num_heads - self.num_hidden_layers = num_layers - self.pad_token_id = pad_token_id - self.eos_token_id = eos_token_id + vocab_size: int = 2400 + composer_vocab_size: int = 21 + d_model: int = 512 + d_kv: int = 64 + d_ff: int = 2048 + num_layers: int = 6 + num_decoder_layers: int | None = None + num_heads: int = 8 + relative_attention_num_buckets: int = 32 + relative_attention_max_distance: int = 128 + dropout_rate: float = 0.1 + layer_norm_epsilon: float = 1e-6 + initializer_factor: float = 1.0 + feed_forward_proj: str = "gated-gelu" + is_encoder_decoder: bool = True + use_cache: bool = True + pad_token_id: int | None = 0 + eos_token_id: int | None = 1 + dense_act_fn: str = "relu" + is_decoder: bool = False + tie_word_embeddings: bool = True - 
super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + def __post_init__(self, **kwargs): + self.num_decoder_layers = self.num_decoder_layers if self.num_decoder_layers is not None else self.num_layers + self.is_gated_act = self.feed_forward_proj.split("-")[0] == "gated" + super().__post_init__(**kwargs) __all__ = ["Pop2PianoConfig"] diff --git a/src/transformers/models/pop2piano/modeling_pop2piano.py b/src/transformers/models/pop2piano/modeling_pop2piano.py index 39f577893d86..08595168ab01 100644 --- a/src/transformers/models/pop2piano/modeling_pop2piano.py +++ b/src/transformers/models/pop2piano/modeling_pop2piano.py @@ -624,7 +624,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if input_ids is not None and inputs_embeds is not None: err_msg_prefix = "decoder_" if self.is_decoder else "" @@ -914,7 +914,7 @@ def forward( labels in `[0, ..., config.vocab_size]` """ use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if inputs_embeds is not None and input_features is not None: raise ValueError("Both `inputs_embeds` and `input_features` received! Please provide only one of them") diff --git a/src/transformers/models/pp_doclayout_v2/configuration_pp_doclayout_v2.py b/src/transformers/models/pp_doclayout_v2/configuration_pp_doclayout_v2.py index 260430767a09..494cf8def378 100644 --- a/src/transformers/models/pp_doclayout_v2/configuration_pp_doclayout_v2.py +++ b/src/transformers/models/pp_doclayout_v2/configuration_pp_doclayout_v2.py @@ -18,6 +18,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from huggingface_hub.dataclasses import strict + from ...backbone_utils import consolidate_backbone_kwargs_to_config from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring @@ -25,6 +27,7 @@ @auto_docstring(checkpoint="PaddlePaddle/PP-DocLayoutV2_safetensors") +@strict(accept_kwargs=True) class PPDocLayoutV2ReadingOrderConfig(PreTrainedConfig): r""" has_relative_attention_bias (`bool`, *optional*, defaults to `True`): @@ -66,77 +69,41 @@ class PPDocLayoutV2ReadingOrderConfig(PreTrainedConfig): The dropout probability in the global pointer head. 
""" - def __init__( - self, - hidden_size=512, - num_attention_heads=8, - attention_probs_dropout_prob=0.1, - has_relative_attention_bias=False, - has_spatial_attention_bias=True, - layer_norm_eps=1e-5, - hidden_dropout_prob=0.1, - intermediate_size=2048, - hidden_act="gelu", - num_hidden_layers=6, - rel_pos_bins=32, - max_rel_pos=128, - rel_2d_pos_bins=64, - max_rel_2d_pos=256, - max_position_embeddings=514, - max_2d_position_embeddings=1024, - type_vocab_size=1, - vocab_size=4, - initializer_range=0.01, - start_token_id=0, - pad_token_id=1, - end_token_id=2, - pred_token_id=3, - coordinate_size=171, - shape_size=170, - num_classes=20, - relation_bias_embed_dim=16, - relation_bias_theta=10000, - relation_bias_scale=100, - global_pointer_head_size=64, - gp_dropout_value=0.0, - **kwargs, - ): - self.hidden_size = hidden_size - self.num_attention_heads = num_attention_heads - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.has_relative_attention_bias = has_relative_attention_bias - self.has_spatial_attention_bias = has_spatial_attention_bias - self.layer_norm_eps = layer_norm_eps - self.hidden_dropout_prob = hidden_dropout_prob - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.num_hidden_layers = num_hidden_layers - self.rel_pos_bins = rel_pos_bins - self.max_rel_pos = max_rel_pos - self.rel_2d_pos_bins = rel_2d_pos_bins - self.max_rel_2d_pos = max_rel_2d_pos - self.max_position_embeddings = max_position_embeddings - self.max_2d_position_embeddings = max_2d_position_embeddings - self.type_vocab_size = type_vocab_size - self.vocab_size = vocab_size - self.initializer_range = initializer_range - self.start_token_id = start_token_id - self.pad_token_id = pad_token_id - self.end_token_id = end_token_id - self.pred_token_id = pred_token_id - self.coordinate_size = coordinate_size - self.shape_size = shape_size - self.num_classes = num_classes - self.relation_bias_embed_dim = relation_bias_embed_dim - self.relation_bias_theta = relation_bias_theta - self.relation_bias_scale = relation_bias_scale - self.global_pointer_head_size = global_pointer_head_size - self.gp_dropout_value = gp_dropout_value - - super().__init__(**kwargs) + hidden_size: int = 512 + num_attention_heads: int = 8 + attention_probs_dropout_prob: float = 0.1 + has_relative_attention_bias: bool = False + has_spatial_attention_bias: bool = True + layer_norm_eps: float = 1e-5 + hidden_dropout_prob: float = 0.1 + intermediate_size: int = 2048 + hidden_act: str = "gelu" + num_hidden_layers: int = 6 + rel_pos_bins: int = 32 + max_rel_pos: int = 128 + rel_2d_pos_bins: int = 64 + max_rel_2d_pos: int = 256 + max_position_embeddings: int = 514 + max_2d_position_embeddings: int = 1024 + type_vocab_size: int = 1 + vocab_size: int = 4 + initializer_range: float = 0.01 + start_token_id: int = 0 + pad_token_id: int | None = 1 + end_token_id: int = 2 + pred_token_id: int = 3 + coordinate_size: int = 171 + shape_size: int = 170 + num_classes: int = 20 + relation_bias_embed_dim: int = 16 + relation_bias_theta: int = 10000 + relation_bias_scale: int = 100 + global_pointer_head_size: int = 64 + gp_dropout_value: float = 0.0 @auto_docstring(checkpoint="PaddlePaddle/PP-DocLayoutV2_safetensors") +@strict(accept_kwargs=True) class PPDocLayoutV2Config(PreTrainedConfig): r""" initializer_bias_prior_prob (`float`, *optional*): @@ -223,67 +190,56 @@ class PPDocLayoutV2Config(PreTrainedConfig): "num_attention_heads": "encoder_attention_heads", } - def __init__( - self, - initializer_range=0.01, - 
initializer_bias_prior_prob=None, - layer_norm_eps=1e-5, - batch_norm_eps=1e-5, - # backbone - backbone_config=None, - freeze_backbone_batch_norms=True, - # encoder HybridEncoder - encoder_hidden_dim=256, - encoder_in_channels=[512, 1024, 2048], - feat_strides=[8, 16, 32], - encoder_layers=1, - encoder_ffn_dim=1024, - encoder_attention_heads=8, - dropout=0.0, - activation_dropout=0.0, - encode_proj_layers=[2], - positional_encoding_temperature=10000, - encoder_activation_function="gelu", - activation_function="silu", - eval_size=None, - normalize_before=False, - hidden_expansion=1.0, - # decoder PPDocLayoutV2Transformer - d_model=256, - num_queries=300, - decoder_in_channels=[256, 256, 256], - decoder_ffn_dim=1024, - num_feature_levels=3, - decoder_n_points=4, - decoder_layers=6, - decoder_attention_heads=8, - decoder_activation_function="relu", - attention_dropout=0.0, - num_denoising=100, - label_noise_ratio=0.5, - box_noise_scale=1.0, - learn_initial_query=False, - anchor_image_size=None, - disable_custom_kernels=True, - is_encoder_decoder=True, - # label - class_thresholds=None, - class_order=None, - reading_order_config=None, - **kwargs, - ): - self.initializer_range = initializer_range - self.initializer_bias_prior_prob = initializer_bias_prior_prob - self.layer_norm_eps = layer_norm_eps - self.batch_norm_eps = batch_norm_eps + initializer_range: float = 0.01 + initializer_bias_prior_prob: float | None = None + layer_norm_eps: float = 1e-5 + batch_norm_eps: float = 1e-5 + backbone_config: PreTrainedConfig | dict | None = None + freeze_backbone_batch_norms: bool = True + encoder_hidden_dim: int = 256 + encoder_in_channels: list[int] | tuple[int, ...] | None = (512, 1024, 2048) + feat_strides: list[int] | tuple[int, ...] | None = (8, 16, 32) + encoder_layers: int = 1 + encoder_ffn_dim: int = 1024 + encoder_attention_heads: int = 8 + dropout: float | int = 0.0 + activation_dropout: float | int = 0.0 + encode_proj_layers: list[int] | tuple[int, ...] | None = (2,) + positional_encoding_temperature: int = 10000 + encoder_activation_function: str = "gelu" + activation_function: str = "silu" + eval_size: list[int] | None = None + normalize_before: bool = False + hidden_expansion: float = 1.0 + d_model: int = 256 + num_queries: int = 300 + decoder_in_channels: list[int] | tuple[int, ...] 
| None = (256, 256, 256) + decoder_ffn_dim: int = 1024 + num_feature_levels: int = 3 + decoder_n_points: int = 4 + decoder_layers: int = 6 + decoder_attention_heads: int = 8 + decoder_activation_function: str = "relu" + attention_dropout: float | int = 0.0 + num_denoising: int = 100 + label_noise_ratio: float = 0.5 + box_noise_scale: float = 1.0 + learn_initial_query: bool = False + anchor_image_size: list[int] | None = None + disable_custom_kernels: bool = True + is_encoder_decoder: bool = True + class_thresholds: list[float] | None = None + class_order: list[int] | None = None + reading_order_config: PreTrainedConfig | dict | None = None - if isinstance(reading_order_config, dict): - self.reading_order_config = self.sub_configs["reading_order_config"](**reading_order_config) - elif reading_order_config is None: + def __post_init__(self, **kwargs): + if isinstance(self.reading_order_config, dict): + self.reading_order_config = self.sub_configs["reading_order_config"](**self.reading_order_config) + elif self.reading_order_config is None: self.reading_order_config = self.sub_configs["reading_order_config"]() - backbone_config, kwargs = consolidate_backbone_kwargs_to_config( - backbone_config=backbone_config, + self.backbone_config, kwargs = consolidate_backbone_kwargs_to_config( + backbone_config=self.backbone_config, default_config_type="hgnet_v2", default_config_kwargs={ "arch": "L", @@ -297,48 +253,14 @@ def __init__( **kwargs, ) - self.backbone_config = backbone_config - self.freeze_backbone_batch_norms = freeze_backbone_batch_norms - - # ---- encoder ---- - self.encoder_hidden_dim = encoder_hidden_dim - self.encoder_in_channels = list(encoder_in_channels) - self.feat_strides = list(feat_strides) - self.encoder_layers = encoder_layers - self.encoder_ffn_dim = encoder_ffn_dim - self.encoder_attention_heads = encoder_attention_heads - self.dropout = dropout - self.activation_dropout = activation_dropout - self.encode_proj_layers = list(encode_proj_layers) - self.positional_encoding_temperature = positional_encoding_temperature - self.encoder_activation_function = encoder_activation_function - self.activation_function = activation_function - self.eval_size = list(eval_size) if eval_size is not None else None - self.normalize_before = normalize_before - self.hidden_expansion = hidden_expansion - - # ---- decoder ---- - self.d_model = d_model - self.num_queries = num_queries - self.decoder_in_channels = list(decoder_in_channels) - self.decoder_ffn_dim = decoder_ffn_dim - self.num_feature_levels = num_feature_levels - self.decoder_n_points = decoder_n_points - self.decoder_layers = decoder_layers - self.decoder_attention_heads = decoder_attention_heads - self.decoder_activation_function = decoder_activation_function - self.attention_dropout = attention_dropout - self.num_denoising = num_denoising - self.label_noise_ratio = label_noise_ratio - self.box_noise_scale = box_noise_scale - self.learn_initial_query = learn_initial_query - self.anchor_image_size = list(anchor_image_size) if anchor_image_size is not None else None - self.disable_custom_kernels = disable_custom_kernels - - self.class_thresholds = class_thresholds - self.class_order = class_order + self.encoder_in_channels = list(self.encoder_in_channels) + self.feat_strides = list(self.feat_strides) + self.encode_proj_layers = list(self.encode_proj_layers) + self.eval_size = list(self.eval_size) if self.eval_size is not None else None + self.decoder_in_channels = list(self.decoder_in_channels) + self.anchor_image_size = 
list(self.anchor_image_size) if self.anchor_image_size is not None else None - super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + super().__post_init__(**kwargs) __all__ = ["PPDocLayoutV2Config"] diff --git a/src/transformers/models/pp_doclayout_v2/modular_pp_doclayout_v2.py b/src/transformers/models/pp_doclayout_v2/modular_pp_doclayout_v2.py index 6af11ca6ebbc..685e576579f9 100644 --- a/src/transformers/models/pp_doclayout_v2/modular_pp_doclayout_v2.py +++ b/src/transformers/models/pp_doclayout_v2/modular_pp_doclayout_v2.py @@ -17,6 +17,7 @@ from typing import Optional import torch +from huggingface_hub.dataclasses import strict from torch import nn from ... import initialization as init @@ -58,6 +59,7 @@ @auto_docstring(checkpoint="PaddlePaddle/PP-DocLayoutV2_safetensors") +@strict(accept_kwargs=True) class PPDocLayoutV2ReadingOrderConfig(PreTrainedConfig): r""" has_relative_attention_bias (`bool`, *optional*, defaults to `True`): @@ -99,77 +101,41 @@ class PPDocLayoutV2ReadingOrderConfig(PreTrainedConfig): The dropout probability in the global pointer head. """ - def __init__( - self, - hidden_size=512, - num_attention_heads=8, - attention_probs_dropout_prob=0.1, - has_relative_attention_bias=False, - has_spatial_attention_bias=True, - layer_norm_eps=1e-5, - hidden_dropout_prob=0.1, - intermediate_size=2048, - hidden_act="gelu", - num_hidden_layers=6, - rel_pos_bins=32, - max_rel_pos=128, - rel_2d_pos_bins=64, - max_rel_2d_pos=256, - max_position_embeddings=514, - max_2d_position_embeddings=1024, - type_vocab_size=1, - vocab_size=4, - initializer_range=0.01, - start_token_id=0, - pad_token_id=1, - end_token_id=2, - pred_token_id=3, - coordinate_size=171, - shape_size=170, - num_classes=20, - relation_bias_embed_dim=16, - relation_bias_theta=10000, - relation_bias_scale=100, - global_pointer_head_size=64, - gp_dropout_value=0.0, - **kwargs, - ): - self.hidden_size = hidden_size - self.num_attention_heads = num_attention_heads - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.has_relative_attention_bias = has_relative_attention_bias - self.has_spatial_attention_bias = has_spatial_attention_bias - self.layer_norm_eps = layer_norm_eps - self.hidden_dropout_prob = hidden_dropout_prob - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.num_hidden_layers = num_hidden_layers - self.rel_pos_bins = rel_pos_bins - self.max_rel_pos = max_rel_pos - self.rel_2d_pos_bins = rel_2d_pos_bins - self.max_rel_2d_pos = max_rel_2d_pos - self.max_position_embeddings = max_position_embeddings - self.max_2d_position_embeddings = max_2d_position_embeddings - self.type_vocab_size = type_vocab_size - self.vocab_size = vocab_size - self.initializer_range = initializer_range - self.start_token_id = start_token_id - self.pad_token_id = pad_token_id - self.end_token_id = end_token_id - self.pred_token_id = pred_token_id - self.coordinate_size = coordinate_size - self.shape_size = shape_size - self.num_classes = num_classes - self.relation_bias_embed_dim = relation_bias_embed_dim - self.relation_bias_theta = relation_bias_theta - self.relation_bias_scale = relation_bias_scale - self.global_pointer_head_size = global_pointer_head_size - self.gp_dropout_value = gp_dropout_value - - super().__init__(**kwargs) + hidden_size: int = 512 + num_attention_heads: int = 8 + attention_probs_dropout_prob: float = 0.1 + has_relative_attention_bias: bool = False + has_spatial_attention_bias: bool = True + layer_norm_eps: float = 1e-5 + hidden_dropout_prob: float = 
0.1 + intermediate_size: int = 2048 + hidden_act: str = "gelu" + num_hidden_layers: int = 6 + rel_pos_bins: int = 32 + max_rel_pos: int = 128 + rel_2d_pos_bins: int = 64 + max_rel_2d_pos: int = 256 + max_position_embeddings: int = 514 + max_2d_position_embeddings: int = 1024 + type_vocab_size: int = 1 + vocab_size: int = 4 + initializer_range: float = 0.01 + start_token_id: int = 0 + pad_token_id: int | None = 1 + end_token_id: int = 2 + pred_token_id: int = 3 + coordinate_size: int = 171 + shape_size: int = 170 + num_classes: int = 20 + relation_bias_embed_dim: int = 16 + relation_bias_theta: int = 10000 + relation_bias_scale: int = 100 + global_pointer_head_size: int = 64 + gp_dropout_value: float = 0.0 @auto_docstring(checkpoint="PaddlePaddle/PP-DocLayoutV2_safetensors") +@strict(accept_kwargs=True) class PPDocLayoutV2Config(PreTrainedConfig): r""" initializer_bias_prior_prob (`float`, *optional*): @@ -256,67 +222,56 @@ class PPDocLayoutV2Config(PreTrainedConfig): "num_attention_heads": "encoder_attention_heads", } - def __init__( - self, - initializer_range=0.01, - initializer_bias_prior_prob=None, - layer_norm_eps=1e-5, - batch_norm_eps=1e-5, - # backbone - backbone_config=None, - freeze_backbone_batch_norms=True, - # encoder HybridEncoder - encoder_hidden_dim=256, - encoder_in_channels=[512, 1024, 2048], - feat_strides=[8, 16, 32], - encoder_layers=1, - encoder_ffn_dim=1024, - encoder_attention_heads=8, - dropout=0.0, - activation_dropout=0.0, - encode_proj_layers=[2], - positional_encoding_temperature=10000, - encoder_activation_function="gelu", - activation_function="silu", - eval_size=None, - normalize_before=False, - hidden_expansion=1.0, - # decoder PPDocLayoutV2Transformer - d_model=256, - num_queries=300, - decoder_in_channels=[256, 256, 256], - decoder_ffn_dim=1024, - num_feature_levels=3, - decoder_n_points=4, - decoder_layers=6, - decoder_attention_heads=8, - decoder_activation_function="relu", - attention_dropout=0.0, - num_denoising=100, - label_noise_ratio=0.5, - box_noise_scale=1.0, - learn_initial_query=False, - anchor_image_size=None, - disable_custom_kernels=True, - is_encoder_decoder=True, - # label - class_thresholds=None, - class_order=None, - reading_order_config=None, - **kwargs, - ): - self.initializer_range = initializer_range - self.initializer_bias_prior_prob = initializer_bias_prior_prob - self.layer_norm_eps = layer_norm_eps - self.batch_norm_eps = batch_norm_eps - - if isinstance(reading_order_config, dict): - self.reading_order_config = self.sub_configs["reading_order_config"](**reading_order_config) - elif reading_order_config is None: + initializer_range: float = 0.01 + initializer_bias_prior_prob: float | None = None + layer_norm_eps: float = 1e-5 + batch_norm_eps: float = 1e-5 + backbone_config: PreTrainedConfig | dict | None = None + freeze_backbone_batch_norms: bool = True + encoder_hidden_dim: int = 256 + encoder_in_channels: list[int] | tuple[int, ...] | None = (512, 1024, 2048) + feat_strides: list[int] | tuple[int, ...] | None = (8, 16, 32) + encoder_layers: int = 1 + encoder_ffn_dim: int = 1024 + encoder_attention_heads: int = 8 + dropout: float | int = 0.0 + activation_dropout: float | int = 0.0 + encode_proj_layers: list[int] | tuple[int, ...] 
| None = (2,) + positional_encoding_temperature: int = 10000 + encoder_activation_function: str = "gelu" + activation_function: str = "silu" + eval_size: list[int] | None = None + normalize_before: bool = False + hidden_expansion: float = 1.0 + d_model: int = 256 + num_queries: int = 300 + decoder_in_channels: list[int] | tuple[int, ...] | None = (256, 256, 256) + decoder_ffn_dim: int = 1024 + num_feature_levels: int = 3 + decoder_n_points: int = 4 + decoder_layers: int = 6 + decoder_attention_heads: int = 8 + decoder_activation_function: str = "relu" + attention_dropout: float | int = 0.0 + num_denoising: int = 100 + label_noise_ratio: float = 0.5 + box_noise_scale: float = 1.0 + learn_initial_query: bool = False + anchor_image_size: list[int] | None = None + disable_custom_kernels: bool = True + is_encoder_decoder: bool = True + class_thresholds: list[float] | None = None + class_order: list[int] | None = None + reading_order_config: PreTrainedConfig | dict | None = None + + def __post_init__(self, **kwargs): + if isinstance(self.reading_order_config, dict): + self.reading_order_config = self.sub_configs["reading_order_config"](**self.reading_order_config) + elif self.reading_order_config is None: self.reading_order_config = self.sub_configs["reading_order_config"]() - backbone_config, kwargs = consolidate_backbone_kwargs_to_config( - backbone_config=backbone_config, + self.backbone_config, kwargs = consolidate_backbone_kwargs_to_config( + backbone_config=self.backbone_config, default_config_type="hgnet_v2", default_config_kwargs={ "arch": "L", @@ -330,48 +285,14 @@ def __init__( **kwargs, ) - self.backbone_config = backbone_config - self.freeze_backbone_batch_norms = freeze_backbone_batch_norms - - # ---- encoder ---- - self.encoder_hidden_dim = encoder_hidden_dim - self.encoder_in_channels = list(encoder_in_channels) - self.feat_strides = list(feat_strides) - self.encoder_layers = encoder_layers - self.encoder_ffn_dim = encoder_ffn_dim - self.encoder_attention_heads = encoder_attention_heads - self.dropout = dropout - self.activation_dropout = activation_dropout - self.encode_proj_layers = list(encode_proj_layers) - self.positional_encoding_temperature = positional_encoding_temperature - self.encoder_activation_function = encoder_activation_function - self.activation_function = activation_function - self.eval_size = list(eval_size) if eval_size is not None else None - self.normalize_before = normalize_before - self.hidden_expansion = hidden_expansion - - # ---- decoder ---- - self.d_model = d_model - self.num_queries = num_queries - self.decoder_in_channels = list(decoder_in_channels) - self.decoder_ffn_dim = decoder_ffn_dim - self.num_feature_levels = num_feature_levels - self.decoder_n_points = decoder_n_points - self.decoder_layers = decoder_layers - self.decoder_attention_heads = decoder_attention_heads - self.decoder_activation_function = decoder_activation_function - self.attention_dropout = attention_dropout - self.num_denoising = num_denoising - self.label_noise_ratio = label_noise_ratio - self.box_noise_scale = box_noise_scale - self.learn_initial_query = learn_initial_query - self.anchor_image_size = list(anchor_image_size) if anchor_image_size is not None else None - self.disable_custom_kernels = disable_custom_kernels - - self.class_thresholds = class_thresholds - self.class_order = class_order - - super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + self.encoder_in_channels = list(self.encoder_in_channels) + self.feat_strides = list(self.feat_strides) + 
self.encode_proj_layers = list(self.encode_proj_layers) + self.eval_size = list(self.eval_size) if self.eval_size is not None else None + self.decoder_in_channels = list(self.decoder_in_channels) + self.anchor_image_size = list(self.anchor_image_size) if self.anchor_image_size is not None else None + + super().__post_init__(**kwargs) class PPDocLayoutV2ImageProcessorFast(PPDocLayoutV3ImageProcessorFast): diff --git a/src/transformers/models/pp_doclayout_v3/configuration_pp_doclayout_v3.py b/src/transformers/models/pp_doclayout_v3/configuration_pp_doclayout_v3.py index 4736273e7bdf..4404f3808ac5 100644 --- a/src/transformers/models/pp_doclayout_v3/configuration_pp_doclayout_v3.py +++ b/src/transformers/models/pp_doclayout_v3/configuration_pp_doclayout_v3.py @@ -18,6 +18,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from huggingface_hub.dataclasses import strict + from ...backbone_utils import consolidate_backbone_kwargs_to_config from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring @@ -25,6 +27,7 @@ @auto_docstring(checkpoint="PaddlePaddle/PP-DocLayoutV3_safetensors") +@strict(accept_kwargs=True) class PPDocLayoutV3Config(PreTrainedConfig): r""" initializer_bias_prior_prob (`float`, *optional*): @@ -117,66 +120,55 @@ class PPDocLayoutV3Config(PreTrainedConfig): "num_attention_heads": "encoder_attention_heads", } - def __init__( - self, - initializer_range=0.01, - initializer_bias_prior_prob=None, - layer_norm_eps=1e-5, - batch_norm_eps=1e-5, - tie_word_embeddings=True, - # backbone - backbone_config=None, - freeze_backbone_batch_norms=True, - # encoder PPDocLayoutV3HybridEncoder - encoder_hidden_dim=256, - encoder_in_channels=[512, 1024, 2048], - feat_strides=[8, 16, 32], - encoder_layers=1, - encoder_ffn_dim=1024, - encoder_attention_heads=8, - dropout=0.0, - activation_dropout=0.0, - encode_proj_layers=[2], - positional_encoding_temperature=10000, - encoder_activation_function="gelu", - activation_function="silu", - eval_size=None, - normalize_before=False, - hidden_expansion=1.0, - mask_feature_channels=[64, 64], - x4_feat_dim=128, - # decoder PPDocLayoutV3Transformer - d_model=256, - num_prototypes=32, - label_noise_ratio=0.4, - box_noise_scale=0.4, - mask_enhanced=True, - num_queries=300, - decoder_in_channels=[256, 256, 256], - decoder_ffn_dim=1024, - num_feature_levels=3, - decoder_n_points=4, - decoder_layers=6, - decoder_attention_heads=8, - decoder_activation_function="relu", - attention_dropout=0.0, - num_denoising=100, - learn_initial_query=False, - anchor_image_size=None, - disable_custom_kernels=True, - is_encoder_decoder=True, - global_pointer_head_size=64, - gp_dropout_value=0.1, - **kwargs, - ): - self.initializer_range = initializer_range - self.initializer_bias_prior_prob = initializer_bias_prior_prob - self.layer_norm_eps = layer_norm_eps - self.batch_norm_eps = batch_norm_eps - self.tie_word_embeddings = tie_word_embeddings - - backbone_config, kwargs = consolidate_backbone_kwargs_to_config( - backbone_config=backbone_config, + initializer_range: float = 0.01 + initializer_bias_prior_prob: float | None = None + layer_norm_eps: float = 1e-5 + batch_norm_eps: float = 1e-5 + tie_word_embeddings: bool = True + backbone_config: dict | PreTrainedConfig | None = None + freeze_backbone_batch_norms: bool = True + encoder_hidden_dim: int = 256 + encoder_in_channels: list[int] | tuple[int, ...] = (512, 1024, 2048) + feat_strides: list[int] | tuple[int, ...] 
= (8, 16, 32) + encoder_layers: int = 1 + encoder_ffn_dim: int = 1024 + encoder_attention_heads: int = 8 + dropout: float | int = 0.0 + activation_dropout: float | int = 0.0 + encode_proj_layers: list[int] | tuple[int, ...] = (2,) + positional_encoding_temperature: int = 10000 + encoder_activation_function: str = "gelu" + activation_function: str = "silu" + eval_size: int | None = None + normalize_before: bool = False + hidden_expansion: float = 1.0 + mask_feature_channels: list[int] | tuple[int, ...] = (64, 64) + x4_feat_dim: int = 128 + d_model: int = 256 + num_prototypes: int = 32 + label_noise_ratio: float = 0.4 + box_noise_scale: float = 0.4 + mask_enhanced: bool = True + num_queries: int = 300 + decoder_in_channels: list[int] | tuple[int, ...] = (256, 256, 256) + decoder_ffn_dim: int = 1024 + num_feature_levels: int = 3 + decoder_n_points: int = 4 + decoder_layers: int = 6 + decoder_attention_heads: int = 8 + decoder_activation_function: str = "relu" + attention_dropout: float | int = 0.0 + num_denoising: int = 100 + learn_initial_query: bool = False + anchor_image_size: int | None = None + disable_custom_kernels: bool = True + is_encoder_decoder: bool = True + global_pointer_head_size: int = 64 + gp_dropout_value: float = 0.1 + + def __post_init__(self, **kwargs): + self.backbone_config, kwargs = consolidate_backbone_kwargs_to_config( + backbone_config=self.backbone_config, default_config_type="hgnet_v2", default_config_kwargs={ "arch": "L", @@ -190,51 +182,13 @@ def __init__( **kwargs, ) - self.backbone_config = backbone_config - self.freeze_backbone_batch_norms = freeze_backbone_batch_norms - - # ---- encoder ---- - self.encoder_hidden_dim = encoder_hidden_dim - self.encoder_in_channels = list(encoder_in_channels) - self.feat_strides = list(feat_strides) - self.encoder_layers = encoder_layers - self.encoder_ffn_dim = encoder_ffn_dim - self.encoder_attention_heads = encoder_attention_heads - self.dropout = dropout - self.activation_dropout = activation_dropout - self.encode_proj_layers = list(encode_proj_layers) - self.positional_encoding_temperature = positional_encoding_temperature - self.encoder_activation_function = encoder_activation_function - self.activation_function = activation_function - self.eval_size = list(eval_size) if eval_size is not None else None - self.normalize_before = normalize_before - self.hidden_expansion = hidden_expansion - self.mask_feature_channels = mask_feature_channels - self.x4_feat_dim = x4_feat_dim - - # ---- decoder ---- - self.d_model = d_model - self.num_queries = num_queries - self.num_prototypes = num_prototypes - self.decoder_in_channels = list(decoder_in_channels) - self.decoder_ffn_dim = decoder_ffn_dim - self.num_feature_levels = num_feature_levels - self.decoder_n_points = decoder_n_points - self.decoder_layers = decoder_layers - self.decoder_attention_heads = decoder_attention_heads - self.decoder_activation_function = decoder_activation_function - self.attention_dropout = attention_dropout - self.num_denoising = num_denoising - self.label_noise_ratio = label_noise_ratio - self.mask_enhanced = mask_enhanced - self.box_noise_scale = box_noise_scale - self.learn_initial_query = learn_initial_query - self.anchor_image_size = list(anchor_image_size) if anchor_image_size is not None else None - self.disable_custom_kernels = disable_custom_kernels - self.global_pointer_head_size = global_pointer_head_size - self.gp_dropout_value = gp_dropout_value - - super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + self.encoder_in_channels 
= list(self.encoder_in_channels) + self.feat_strides = list(self.feat_strides) + self.encode_proj_layers = list(self.encode_proj_layers) + self.eval_size = list(self.eval_size) if self.eval_size is not None else None + self.decoder_in_channels = list(self.decoder_in_channels) + self.anchor_image_size = list(self.anchor_image_size) if self.anchor_image_size is not None else None + super().__post_init__(**kwargs) __all__ = ["PPDocLayoutV3Config"] diff --git a/src/transformers/models/pp_doclayout_v3/modular_pp_doclayout_v3.py b/src/transformers/models/pp_doclayout_v3/modular_pp_doclayout_v3.py index 9e82be44d988..f3f6be670649 100644 --- a/src/transformers/models/pp_doclayout_v3/modular_pp_doclayout_v3.py +++ b/src/transformers/models/pp_doclayout_v3/modular_pp_doclayout_v3.py @@ -20,6 +20,7 @@ import torch import torch.nn.functional as F import torchvision.transforms.v2.functional as tvF +from huggingface_hub.dataclasses import strict from torch import nn from ... import initialization as init @@ -70,6 +71,7 @@ @auto_docstring(checkpoint="PaddlePaddle/PP-DocLayoutV3_safetensors") +@strict(accept_kwargs=True) class PPDocLayoutV3Config(PreTrainedConfig): r""" initializer_bias_prior_prob (`float`, *optional*): @@ -162,66 +164,55 @@ class PPDocLayoutV3Config(PreTrainedConfig): "num_attention_heads": "encoder_attention_heads", } - def __init__( - self, - initializer_range=0.01, - initializer_bias_prior_prob=None, - layer_norm_eps=1e-5, - batch_norm_eps=1e-5, - tie_word_embeddings=True, - # backbone - backbone_config=None, - freeze_backbone_batch_norms=True, - # encoder PPDocLayoutV3HybridEncoder - encoder_hidden_dim=256, - encoder_in_channels=[512, 1024, 2048], - feat_strides=[8, 16, 32], - encoder_layers=1, - encoder_ffn_dim=1024, - encoder_attention_heads=8, - dropout=0.0, - activation_dropout=0.0, - encode_proj_layers=[2], - positional_encoding_temperature=10000, - encoder_activation_function="gelu", - activation_function="silu", - eval_size=None, - normalize_before=False, - hidden_expansion=1.0, - mask_feature_channels=[64, 64], - x4_feat_dim=128, - # decoder PPDocLayoutV3Transformer - d_model=256, - num_prototypes=32, - label_noise_ratio=0.4, - box_noise_scale=0.4, - mask_enhanced=True, - num_queries=300, - decoder_in_channels=[256, 256, 256], - decoder_ffn_dim=1024, - num_feature_levels=3, - decoder_n_points=4, - decoder_layers=6, - decoder_attention_heads=8, - decoder_activation_function="relu", - attention_dropout=0.0, - num_denoising=100, - learn_initial_query=False, - anchor_image_size=None, - disable_custom_kernels=True, - is_encoder_decoder=True, - global_pointer_head_size=64, - gp_dropout_value=0.1, - **kwargs, - ): - self.initializer_range = initializer_range - self.initializer_bias_prior_prob = initializer_bias_prior_prob - self.layer_norm_eps = layer_norm_eps - self.batch_norm_eps = batch_norm_eps - self.tie_word_embeddings = tie_word_embeddings - - backbone_config, kwargs = consolidate_backbone_kwargs_to_config( - backbone_config=backbone_config, + initializer_range: float = 0.01 + initializer_bias_prior_prob: float | None = None + layer_norm_eps: float = 1e-5 + batch_norm_eps: float = 1e-5 + tie_word_embeddings: bool = True + backbone_config: dict | PreTrainedConfig | None = None + freeze_backbone_batch_norms: bool = True + encoder_hidden_dim: int = 256 + encoder_in_channels: list[int] | tuple[int, ...] = (512, 1024, 2048) + feat_strides: list[int] | tuple[int, ...] 
= (8, 16, 32) + encoder_layers: int = 1 + encoder_ffn_dim: int = 1024 + encoder_attention_heads: int = 8 + dropout: float | int = 0.0 + activation_dropout: float | int = 0.0 + encode_proj_layers: list[int] | tuple[int, ...] = (2,) + positional_encoding_temperature: int = 10000 + encoder_activation_function: str = "gelu" + activation_function: str = "silu" + eval_size: int | None = None + normalize_before: bool = False + hidden_expansion: float = 1.0 + mask_feature_channels: list[int] | tuple[int, ...] = (64, 64) + x4_feat_dim: int = 128 + d_model: int = 256 + num_prototypes: int = 32 + label_noise_ratio: float = 0.4 + box_noise_scale: float = 0.4 + mask_enhanced: bool = True + num_queries: int = 300 + decoder_in_channels: list[int] | tuple[int, ...] = (256, 256, 256) + decoder_ffn_dim: int = 1024 + num_feature_levels: int = 3 + decoder_n_points: int = 4 + decoder_layers: int = 6 + decoder_attention_heads: int = 8 + decoder_activation_function: str = "relu" + attention_dropout: float | int = 0.0 + num_denoising: int = 100 + learn_initial_query: bool = False + anchor_image_size: int | None = None + disable_custom_kernels: bool = True + is_encoder_decoder: bool = True + global_pointer_head_size: int = 64 + gp_dropout_value: float = 0.1 + + def __post_init__(self, **kwargs): + self.backbone_config, kwargs = consolidate_backbone_kwargs_to_config( + backbone_config=self.backbone_config, default_config_type="hgnet_v2", default_config_kwargs={ "arch": "L", @@ -235,51 +226,13 @@ def __init__( **kwargs, ) - self.backbone_config = backbone_config - self.freeze_backbone_batch_norms = freeze_backbone_batch_norms - - # ---- encoder ---- - self.encoder_hidden_dim = encoder_hidden_dim - self.encoder_in_channels = list(encoder_in_channels) - self.feat_strides = list(feat_strides) - self.encoder_layers = encoder_layers - self.encoder_ffn_dim = encoder_ffn_dim - self.encoder_attention_heads = encoder_attention_heads - self.dropout = dropout - self.activation_dropout = activation_dropout - self.encode_proj_layers = list(encode_proj_layers) - self.positional_encoding_temperature = positional_encoding_temperature - self.encoder_activation_function = encoder_activation_function - self.activation_function = activation_function - self.eval_size = list(eval_size) if eval_size is not None else None - self.normalize_before = normalize_before - self.hidden_expansion = hidden_expansion - self.mask_feature_channels = mask_feature_channels - self.x4_feat_dim = x4_feat_dim - - # ---- decoder ---- - self.d_model = d_model - self.num_queries = num_queries - self.num_prototypes = num_prototypes - self.decoder_in_channels = list(decoder_in_channels) - self.decoder_ffn_dim = decoder_ffn_dim - self.num_feature_levels = num_feature_levels - self.decoder_n_points = decoder_n_points - self.decoder_layers = decoder_layers - self.decoder_attention_heads = decoder_attention_heads - self.decoder_activation_function = decoder_activation_function - self.attention_dropout = attention_dropout - self.num_denoising = num_denoising - self.label_noise_ratio = label_noise_ratio - self.mask_enhanced = mask_enhanced - self.box_noise_scale = box_noise_scale - self.learn_initial_query = learn_initial_query - self.anchor_image_size = list(anchor_image_size) if anchor_image_size is not None else None - self.disable_custom_kernels = disable_custom_kernels - self.global_pointer_head_size = global_pointer_head_size - self.gp_dropout_value = gp_dropout_value - - super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + self.encoder_in_channels 
= list(self.encoder_in_channels) + self.feat_strides = list(self.feat_strides) + self.encode_proj_layers = list(self.encode_proj_layers) + self.eval_size = list(self.eval_size) if self.eval_size is not None else None + self.decoder_in_channels = list(self.decoder_in_channels) + self.anchor_image_size = list(self.anchor_image_size) if self.anchor_image_size is not None else None + super().__post_init__(**kwargs) @auto_docstring diff --git a/src/transformers/models/pp_lcnet/configuration_pp_lcnet.py b/src/transformers/models/pp_lcnet/configuration_pp_lcnet.py index 32b6dd1868fb..f905b0af4279 100644 --- a/src/transformers/models/pp_lcnet/configuration_pp_lcnet.py +++ b/src/transformers/models/pp_lcnet/configuration_pp_lcnet.py @@ -18,14 +18,17 @@ # See the License for the specific language governing permissions and # limitations under the License. +from huggingface_hub.dataclasses import strict + from ...backbone_utils import BackboneConfigMixin from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring -@auto_docstring( - checkpoint="PaddlePaddle/PP-LCNet_x1_0_doc_ori_safetensors", - custom_args=r""" +@auto_docstring(checkpoint="PaddlePaddle/PP-LCNet_x1_0_doc_ori_safetensors") +@strict(accept_kwargs=True) +class PPLCNetConfig(BackboneConfigMixin, PreTrainedConfig): + r""" scale (`float`, *optional*, defaults to 1.0): The scaling factor for the model's channel dimensions, used to adjust the model size and computational cost without changing the overall architecture (e.g., 0.25, 0.5, 1.0, 1.5). @@ -46,36 +49,23 @@ divisor (`int`, *optional*, defaults to 8): The divisor used to ensure that various model parameters (e.g., channel dimensions, kernel sizes) are multiples of this value, promoting efficient model implementation and resource utilization. 
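The hunks above repeat a single idiom: list-valued constructor arguments become class-level fields with immutable tuple defaults, and `__post_init__` converts whatever was supplied back into plain lists. A minimal, self-contained sketch of just that idiom using standard dataclasses — `DemoEncoderConfig` and its fields are illustrative stand-ins, not classes from this patch:

# Toy analogue of the field-plus-__post_init__ pattern used throughout this diff.
from dataclasses import dataclass


@dataclass
class DemoEncoderConfig:  # illustrative name, not part of the PR
    encoder_in_channels: tuple[int, ...] | list[int] = (512, 1024, 2048)
    feat_strides: tuple[int, ...] | list[int] = (8, 16, 32)

    def __post_init__(self):
        # tuple defaults keep the class attributes immutable; instances expose lists
        self.encoder_in_channels = list(self.encoder_in_channels)
        self.feat_strides = list(self.feat_strides)


cfg = DemoEncoderConfig()
assert cfg.encoder_in_channels == [512, 1024, 2048]  # converted on instantiation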
- """, -) -class PPLCNetConfig(BackboneConfigMixin, PreTrainedConfig): + """ + model_type = "pp_lcnet" - def __init__( - self, - scale: float = 1.0, - block_configs: list | None = None, - stem_channels: int = 16, - stem_stride: int = 2, - reduction: int = 4, - class_expand: int = 1280, - divisor: int = 8, - hidden_act: str | None = "hardswish", - out_features: list | None = None, - out_indices: list | None = None, - hidden_dropout_prob: float = 0.2, - **kwargs, - ): - super().__init__(**kwargs) - self.scale = scale - self.hidden_act = hidden_act - self.stem_channels = stem_channels - self.stem_stride = stem_stride - self.reduction = reduction - self.hidden_dropout_prob = hidden_dropout_prob - self.class_expand = class_expand - self.divisor = divisor + scale: float | int = 1.0 + block_configs: list | None = None + stem_channels: int = 16 + stem_stride: int = 2 + reduction: int = 4 + class_expand: int = 1280 + divisor: int = 8 + hidden_act: str = "hardswish" + _out_features: list[str] | None = None + _out_indices: list[int] | None = None + hidden_dropout_prob: float = 0.2 + def __post_init__(self, **kwargs): # Default block configs for PP-LCNet # Each tuple: (kernel_size, in_channels, out_channels, stride, use_squeeze_excitation) self.block_configs = ( @@ -98,14 +88,21 @@ def __init__( # Stage 5 (blocks6) [[5, 256, 512, 2, True], [5, 512, 512, 1, True]], ] - if block_configs is None - else block_configs + if self.block_configs is None + else self.block_configs ) - if len(self.block_configs) != 5: - raise ValueError(f"block_configs must have 5 stages, but got {len(self.block_configs)}") + self.depths = [len(blocks) for blocks in self.block_configs] self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(self.block_configs) + 1)] - self.set_output_features_output_indices(out_indices=out_indices, out_features=out_features) + self.set_output_features_output_indices( + out_indices=kwargs.pop("out_indices", None), out_features=kwargs.pop("out_features", None) + ) + super().__post_init__(**kwargs) + + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + if len(self.block_configs) != 5: + raise ValueError(f"block_configs must have 5 stages, but got {len(self.block_configs)}") __all__ = ["PPLCNetConfig"] diff --git a/src/transformers/models/pp_lcnet/modular_pp_lcnet.py b/src/transformers/models/pp_lcnet/modular_pp_lcnet.py index 8144fa5e1239..6dd2ede050a5 100644 --- a/src/transformers/models/pp_lcnet/modular_pp_lcnet.py +++ b/src/transformers/models/pp_lcnet/modular_pp_lcnet.py @@ -18,6 +18,7 @@ import torch import torch.nn as nn import torchvision.transforms.v2.functional as tvF +from huggingface_hub.dataclasses import strict from ...activations import ACT2FN from ...backbone_utils import BackboneConfigMixin, BackboneMixin, filter_output_hidden_states @@ -46,9 +47,10 @@ from ..resnet.modeling_resnet import ResNetConvLayer -@auto_docstring( - checkpoint="PaddlePaddle/PP-LCNet_x1_0_doc_ori_safetensors", - custom_args=r""" +@auto_docstring(checkpoint="PaddlePaddle/PP-LCNet_x1_0_doc_ori_safetensors") +@strict(accept_kwargs=True) +class PPLCNetConfig(BackboneConfigMixin, PreTrainedConfig): + r""" scale (`float`, *optional*, defaults to 1.0): The scaling factor for the model's channel dimensions, used to adjust the model size and computational cost without changing the overall architecture (e.g., 0.25, 0.5, 1.0, 1.5). 
@@ -69,36 +71,23 @@ divisor (`int`, *optional*, defaults to 8): The divisor used to ensure that various model parameters (e.g., channel dimensions, kernel sizes) are multiples of this value, promoting efficient model implementation and resource utilization. - """, -) -class PPLCNetConfig(BackboneConfigMixin, PreTrainedConfig): - model_type = "pp_lcnet" + """ - def __init__( - self, - scale: float = 1.0, - block_configs: list | None = None, - stem_channels: int = 16, - stem_stride: int = 2, - reduction: int = 4, - class_expand: int = 1280, - divisor: int = 8, - hidden_act: str | None = "hardswish", - out_features: list | None = None, - out_indices: list | None = None, - hidden_dropout_prob: float = 0.2, - **kwargs, - ): - super().__init__(**kwargs) - self.scale = scale - self.hidden_act = hidden_act - self.stem_channels = stem_channels - self.stem_stride = stem_stride - self.reduction = reduction - self.hidden_dropout_prob = hidden_dropout_prob - self.class_expand = class_expand - self.divisor = divisor + model_type = "pp_lcnet" + scale: float | int = 1.0 + block_configs: list | None = None + stem_channels: int = 16 + stem_stride: int = 2 + reduction: int = 4 + class_expand: int = 1280 + divisor: int = 8 + hidden_act: str = "hardswish" + _out_features: list[str] | None = None + _out_indices: list[int] | None = None + hidden_dropout_prob: float = 0.2 + + def __post_init__(self, **kwargs): # Default block configs for PP-LCNet # Each tuple: (kernel_size, in_channels, out_channels, stride, use_squeeze_excitation) self.block_configs = ( @@ -121,14 +110,21 @@ def __init__( # Stage 5 (blocks6) [[5, 256, 512, 2, True], [5, 512, 512, 1, True]], ] - if block_configs is None - else block_configs + if self.block_configs is None + else self.block_configs ) - if len(self.block_configs) != 5: - raise ValueError(f"block_configs must have 5 stages, but got {len(self.block_configs)}") + self.depths = [len(blocks) for blocks in self.block_configs] self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(self.block_configs) + 1)] - self.set_output_features_output_indices(out_indices=out_indices, out_features=out_features) + self.set_output_features_output_indices( + out_indices=kwargs.pop("out_indices", None), out_features=kwargs.pop("out_features", None) + ) + super().__post_init__(**kwargs) + + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + if len(self.block_configs) != 5: + raise ValueError(f"block_configs must have 5 stages, but got {len(self.block_configs)}") class PPLCNetImageProcessorKwargs(ImagesKwargs, total=False): diff --git a/src/transformers/models/pp_lcnet_v3/configuration_pp_lcnet_v3.py b/src/transformers/models/pp_lcnet_v3/configuration_pp_lcnet_v3.py index 2e47bd612db9..02e764fdde71 100644 --- a/src/transformers/models/pp_lcnet_v3/configuration_pp_lcnet_v3.py +++ b/src/transformers/models/pp_lcnet_v3/configuration_pp_lcnet_v3.py @@ -18,14 +18,18 @@ # See the License for the specific language governing permissions and # limitations under the License. 
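Note that `out_features`/`out_indices` are no longer declared parameters in either version of `PPLCNetConfig`: they arrive through `**kwargs` (tolerated by `accept_kwargs=True`), are popped inside `__post_init__`, and are stored through `set_output_features_output_indices` into the private `_out_features`/`_out_indices` fields. Callers should therefore still be able to pass them as keywords — a hedged sketch, assuming the usual `BackboneConfigMixin` properties resolve the private fields:

from transformers import PPLCNetConfig  # assumption: this PR's branch is installed

cfg = PPLCNetConfig(out_indices=[3, 4, 5])  # accepted via **kwargs, not a dataclass field
print(cfg.out_indices)                      # resolved by the backbone mixin from _out_indices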
+ +from huggingface_hub.dataclasses import strict + from ...backbone_utils import BackboneConfigMixin from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring -@auto_docstring( - checkpoint="PaddlePaddle/Not_yet_released", - custom_args=r""" +@auto_docstring(checkpoint="PaddlePaddle/Not_yet_released") +@strict(accept_kwargs=True) +class PPLCNetV3Config(BackboneConfigMixin, PreTrainedConfig): + r""" scale (`float`, *optional*, defaults to 1.0): The scaling factor for the model's channel dimensions, used to adjust the model size and computational cost without changing the overall architecture (e.g., 0.25, 0.5, 1.0, 1.5). @@ -50,32 +54,23 @@ The number of kxk convolution branches in the learnable reparameterization layer, used to enhance feature extraction capability through multi-branch architecture during training while enabling efficient inference via structural reparameterization. - """, -) -class PPLCNetV3Config(BackboneConfigMixin, PreTrainedConfig): + """ + model_type = "pp_lcnet_v3" - def __init__( - self, - scale: float = 1.0, - hidden_act: str | None = "hardswish", - out_features: list | None = None, - out_indices: list | None = None, - stem_channels: int = 16, - stem_stride: int = 2, - block_configs: list | None = None, - reduction: int = 4, - divisor: int = 8, - conv_symmetric_num: int = 4, - **kwargs, - ): - super().__init__(**kwargs) - self.scale = scale - self.hidden_act = hidden_act - self.stem_channels = stem_channels - self.stem_stride = stem_stride - self.reduction = reduction - self.divisor = divisor + scale: float | int = 1.0 + block_configs: list | None = None + stem_channels: int = 16 + stem_stride: int = 2 + reduction: int = 4 + divisor: int = 8 + hidden_act: str = "hardswish" + _out_features: list[str] | None = None + _out_indices: list[int] | None = None + + conv_symmetric_num: int = 4 + + def __post_init__(self, **kwargs): # Default block configs for PP-LCNetV3 # Each tuple: (kernel_size, in_channels, out_channels, stride, use_squeeze_excitation) self.block_configs = ( @@ -97,15 +92,20 @@ def __init__( # Stage 5 (blocks6) [[5, 256, 512, 2, True], [5, 512, 512, 1, True], [5, 512, 512, 1, False], [5, 512, 512, 1, False]], ] - if block_configs is None - else block_configs + if self.block_configs is None + else self.block_configs ) - if len(self.block_configs) != 5: - raise ValueError(f"block_configs must have 5 stages, but got {len(self.block_configs)}") self.depths = [len(blocks) for blocks in self.block_configs] self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(self.block_configs) + 1)] - self.set_output_features_output_indices(out_indices=out_indices, out_features=out_features) - self.conv_symmetric_num = conv_symmetric_num + self.set_output_features_output_indices( + out_indices=kwargs.pop("out_indices", None), out_features=kwargs.pop("out_features", None) + ) + super().__post_init__(**kwargs) + + def validate_architecture(self): + """Part of `@strict`-powered validation. 
Validates the architecture of the config.""" + if len(self.block_configs) != 5: + raise ValueError(f"block_configs must have 5 stages, but got {len(self.block_configs)}") __all__ = ["PPLCNetV3Config"] diff --git a/src/transformers/models/pp_lcnet_v3/modeling_pp_lcnet_v3.py b/src/transformers/models/pp_lcnet_v3/modeling_pp_lcnet_v3.py index 4c023dbf8d37..22e0dff2bad3 100644 --- a/src/transformers/models/pp_lcnet_v3/modeling_pp_lcnet_v3.py +++ b/src/transformers/models/pp_lcnet_v3/modeling_pp_lcnet_v3.py @@ -18,6 +18,7 @@ # See the License for the specific language governing permissions and # limitations under the License. + import torch import torch.nn as nn from torch import Tensor diff --git a/src/transformers/models/pp_lcnet_v3/modular_pp_lcnet_v3.py b/src/transformers/models/pp_lcnet_v3/modular_pp_lcnet_v3.py index e8f8b3aaae90..007366de70b3 100644 --- a/src/transformers/models/pp_lcnet_v3/modular_pp_lcnet_v3.py +++ b/src/transformers/models/pp_lcnet_v3/modular_pp_lcnet_v3.py @@ -12,10 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. + import torch import torch.nn as nn +from huggingface_hub.dataclasses import strict from ...activations import ACT2FN +from ...configuration_utils import PreTrainedConfig from ...modeling_utils import PreTrainedModel from ...utils import ( auto_docstring, @@ -32,9 +35,10 @@ ) -@auto_docstring( - checkpoint="PaddlePaddle/Not_yet_released", - custom_args=r""" +@auto_docstring(checkpoint="PaddlePaddle/Not_yet_released") +@strict(accept_kwargs=True) +class PPLCNetV3Config(PPLCNetConfig): + r""" scale (`float`, *optional*, defaults to 1.0): The scaling factor for the model's channel dimensions, used to adjust the model size and computational cost without changing the overall architecture (e.g., 0.25, 0.5, 1.0, 1.5). @@ -59,29 +63,15 @@ The number of kxk convolution branches in the learnable reparameterization layer, used to enhance feature extraction capability through multi-branch architecture during training while enabling efficient inference via structural reparameterization. 
- """, -) -class PPLCNetV3Config(PPLCNetConfig): + """ + model_type = "pp_lcnet_v3" - def __init__( - self, - scale: float = 1.0, - hidden_act: str | None = "hardswish", - out_features: list | None = None, - out_indices: list | None = None, - stem_channels: int = 16, - stem_stride: int = 2, - block_configs: list | None = None, - reduction: int = 4, - divisor: int = 8, - conv_symmetric_num: int = 4, - **kwargs, - ): - super().__init__(**kwargs) - del self.hidden_dropout_prob - del self.class_expand - self.conv_symmetric_num = conv_symmetric_num + conv_symmetric_num: int = 4 + hidden_dropout_prob = AttributeError() + class_expand = AttributeError() + + def __post_init__(self, **kwargs): # Default block configs for PP-LCNetV3 # Each tuple: (kernel_size, in_channels, out_channels, stride, use_squeeze_excitation) self.block_configs = ( @@ -103,9 +93,15 @@ def __init__( # Stage 5 (blocks6) [[5, 256, 512, 2, True], [5, 512, 512, 1, True], [5, 512, 512, 1, False], [5, 512, 512, 1, False]], ] - if block_configs is None - else block_configs + if self.block_configs is None + else self.block_configs + ) + self.depths = [len(blocks) for blocks in self.block_configs] + self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(self.block_configs) + 1)] + self.set_output_features_output_indices( + out_indices=kwargs.pop("out_indices", None), out_features=kwargs.pop("out_features", None) ) + PreTrainedConfig.__post_init__(**kwargs) class PPLCNetV3ConvLayer(PPLCNetConvLayer): diff --git a/src/transformers/models/pp_ocrv5_mobile_det/configuration_pp_ocrv5_mobile_det.py b/src/transformers/models/pp_ocrv5_mobile_det/configuration_pp_ocrv5_mobile_det.py index 812531156d3e..26c8aa6481a2 100644 --- a/src/transformers/models/pp_ocrv5_mobile_det/configuration_pp_ocrv5_mobile_det.py +++ b/src/transformers/models/pp_ocrv5_mobile_det/configuration_pp_ocrv5_mobile_det.py @@ -19,15 +19,18 @@ # limitations under the License. +from huggingface_hub.dataclasses import strict + from ...backbone_utils import consolidate_backbone_kwargs_to_config from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring from ..auto import AutoConfig -@auto_docstring( - checkpoint="PaddlePaddle/PP-OCRv5_mobile_det_safetensors", - custom_args=r""" +@auto_docstring(checkpoint="PaddlePaddle/PP-OCRv5_mobile_det_safetensors") +@strict(accept_kwargs=True) +class PPOCRV5MobileDetConfig(PreTrainedConfig): + r""" reduction (`int`, *optional*, defaults to 4): The reduction factor for feature channel dimensions, used to reduce the number of model parameters and computational complexity while maintaining feature representability. @@ -43,25 +46,21 @@ layer_list_out_channels (`List[int]`, *optional*, defaults to `[12, 18, 42, 360]`): The list of output channels for each backbone stage, used to configure the input channels of the RSE layers in the neck network for multi-scale feature fusion. 
- """, -) -class PPOCRV5MobileDetConfig(PreTrainedConfig): + """ + model_type = "pp_ocrv5_mobile_det" sub_configs = {"backbone_config": AutoConfig} - def __init__( - self, - backbone_config=None, - reduction=4, - neck_out_channels=96, - interpolate_mode="nearest", - kernel_list=[3, 2, 2], - layer_list_out_channels=[12, 18, 42, 360], - **kwargs, - ): - # ---- Backbone ---- - backbone_config, kwargs = consolidate_backbone_kwargs_to_config( - backbone_config=backbone_config, + backbone_config: dict | PreTrainedConfig | None = None + reduction: int = 4 + neck_out_channels: int = 96 + interpolate_mode: str = "nearest" + kernel_list: list[int] | tuple[int, ...] = (3, 2, 2) + layer_list_out_channels: list[int] | tuple[int, ...] = (12, 18, 42, 360) + + def __post_init__(self, **kwargs): + self.backbone_config, kwargs = consolidate_backbone_kwargs_to_config( + backbone_config=self.backbone_config, default_config_type="pp_lcnet_v3", default_config_kwargs={ "scale": 0.75, @@ -71,18 +70,7 @@ def __init__( }, **kwargs, ) - self.backbone_config = backbone_config - self.reduction = reduction - - # ---- Neck ---- - self.neck_out_channels = neck_out_channels - self.interpolate_mode = interpolate_mode - - # ---- Head ---- - self.kernel_list = kernel_list - self.layer_list_out_channels = layer_list_out_channels - - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["PPOCRV5MobileDetConfig"] diff --git a/src/transformers/models/pp_ocrv5_mobile_det/modular_pp_ocrv5_mobile_det.py b/src/transformers/models/pp_ocrv5_mobile_det/modular_pp_ocrv5_mobile_det.py index 739cc94e616a..fe1c78df940b 100644 --- a/src/transformers/models/pp_ocrv5_mobile_det/modular_pp_ocrv5_mobile_det.py +++ b/src/transformers/models/pp_ocrv5_mobile_det/modular_pp_ocrv5_mobile_det.py @@ -16,6 +16,7 @@ import torch import torch.nn as nn import torch.nn.functional as F +from huggingface_hub.dataclasses import strict from ...activations import ACT2FN from ...backbone_utils import consolidate_backbone_kwargs_to_config, load_backbone @@ -40,9 +41,10 @@ logger = logging.get_logger(__name__) -@auto_docstring( - checkpoint="PaddlePaddle/PP-OCRv5_mobile_det_safetensors", - custom_args=r""" +@auto_docstring(checkpoint="PaddlePaddle/PP-OCRv5_mobile_det_safetensors") +@strict(accept_kwargs=True) +class PPOCRV5MobileDetConfig(PreTrainedConfig): + r""" reduction (`int`, *optional*, defaults to 4): The reduction factor for feature channel dimensions, used to reduce the number of model parameters and computational complexity while maintaining feature representability. @@ -58,25 +60,21 @@ layer_list_out_channels (`List[int]`, *optional*, defaults to `[12, 18, 42, 360]`): The list of output channels for each backbone stage, used to configure the input channels of the RSE layers in the neck network for multi-scale feature fusion. - """, -) -class PPOCRV5MobileDetConfig(PreTrainedConfig): + """ + model_type = "pp_ocrv5_mobile_det" sub_configs = {"backbone_config": AutoConfig} - def __init__( - self, - backbone_config=None, - reduction=4, - neck_out_channels=96, - interpolate_mode="nearest", - kernel_list=[3, 2, 2], - layer_list_out_channels=[12, 18, 42, 360], - **kwargs, - ): - # ---- Backbone ---- - backbone_config, kwargs = consolidate_backbone_kwargs_to_config( - backbone_config=backbone_config, + backbone_config: dict | PreTrainedConfig | None = None + reduction: int = 4 + neck_out_channels: int = 96 + interpolate_mode: str = "nearest" + kernel_list: list[int] | tuple[int, ...] 
= (3, 2, 2) + layer_list_out_channels: list[int] | tuple[int, ...] = (12, 18, 42, 360) + + def __post_init__(self, **kwargs): + self.backbone_config, kwargs = consolidate_backbone_kwargs_to_config( + backbone_config=self.backbone_config, default_config_type="pp_lcnet_v3", default_config_kwargs={ "scale": 0.75, @@ -86,18 +84,7 @@ def __init__( }, **kwargs, ) - self.backbone_config = backbone_config - self.reduction = reduction - - # ---- Neck ---- - self.neck_out_channels = neck_out_channels - self.interpolate_mode = interpolate_mode - - # ---- Head ---- - self.kernel_list = kernel_list - self.layer_list_out_channels = layer_list_out_channels - - super().__init__(**kwargs) + super().__post_init__(**kwargs) @auto_docstring diff --git a/src/transformers/models/pp_ocrv5_server_det/configuration_pp_ocrv5_server_det.py b/src/transformers/models/pp_ocrv5_server_det/configuration_pp_ocrv5_server_det.py index 9646490812fd..182d27032084 100644 --- a/src/transformers/models/pp_ocrv5_server_det/configuration_pp_ocrv5_server_det.py +++ b/src/transformers/models/pp_ocrv5_server_det/configuration_pp_ocrv5_server_det.py @@ -18,15 +18,18 @@ # See the License for the specific language governing permissions and # limitations under the License. +from huggingface_hub.dataclasses import strict + from ...backbone_utils import consolidate_backbone_kwargs_to_config from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring from ..auto import AutoConfig -@auto_docstring( - checkpoint="PaddlePaddle/PP-OCRv5-server-det", - custom_args=r""" +@auto_docstring(checkpoint="PaddlePaddle/PP-OCRv5-server-det") +@strict(accept_kwargs=True) +class PPOCRV5ServerDetConfig(PreTrainedConfig): + r""" interpolate_mode (`str`, *optional*, defaults to `"nearest"`): The interpolation mode used for upsampling or downsampling feature maps in the neck network. neck_out_channels (`int`, *optional*, defaults to 256): @@ -43,31 +46,26 @@ A list of scaling factors used for spatial resolution adjustments in the feature maps. kernel_list (`list[int]`, *optional*, defaults to `[3, 2, 2]`): The list of kernel sizes for convolutional layers in the head network for multi-scale feature extraction. 
- """, -) -class PPOCRV5ServerDetConfig(PreTrainedConfig): + """ + sub_configs = {"backbone_config": AutoConfig} model_type = "pp_ocrv5_server_det" - def __init__( - self, - interpolate_mode: str = "nearest", - backbone_config=None, - neck_out_channels: int = 256, - reduce_factor: int = 2, - intraclass_block_number: int = 4, - intraclass_block_config: dict | None = None, - scale_factor: int = 2, - scale_factor_list: list | None = None, - hidden_act: str = "relu", - kernel_list: list | None = None, - **kwargs, - ): - self.interpolate_mode = interpolate_mode + interpolate_mode: str = "nearest" + backbone_config: dict | PreTrainedConfig | None = None + neck_out_channels: int = 256 + reduce_factor: int = 2 + intraclass_block_number: int = 4 + intraclass_block_config: dict | None = None + scale_factor: int = 2 + scale_factor_list: list | None = None + hidden_act: str = "relu" + kernel_list: list | None = None + id2label: dict[int, str] | dict[str, str] | None = None - # ---- backbone ---- - backbone_config, kwargs = consolidate_backbone_kwargs_to_config( - backbone_config=backbone_config, + def __post_init__(self, **kwargs): + self.backbone_config, kwargs = consolidate_backbone_kwargs_to_config( + backbone_config=self.backbone_config, default_config_type="hgnet_v2", default_config_kwargs={ "arch": "L", @@ -80,25 +78,10 @@ def __init__( }, **kwargs, ) - self.backbone_config = backbone_config - - # ---- neck ---- - self.neck_out_channels = neck_out_channels - self.reduce_factor = reduce_factor - self.scale_factor_list = scale_factor_list - self.intraclass_block_number = intraclass_block_number - self.intraclass_block_config = intraclass_block_config - - # ---- head ---- - self.scale_factor = scale_factor - self.hidden_act = hidden_act - self.kernel_list = kernel_list # For object detection pipeline compatibility: single class "text" - self.id2label = {0: "text"} - self.num_labels = 1 - - super().__init__(**kwargs) + self.id2label = {0: "text"} if self.id2label is None else self.id2label + super().__post_init__(**kwargs) __all__ = ["PPOCRV5ServerDetConfig"] diff --git a/src/transformers/models/pp_ocrv5_server_det/modular_pp_ocrv5_server_det.py b/src/transformers/models/pp_ocrv5_server_det/modular_pp_ocrv5_server_det.py index b1ef84d774e2..b3a16be893b0 100644 --- a/src/transformers/models/pp_ocrv5_server_det/modular_pp_ocrv5_server_det.py +++ b/src/transformers/models/pp_ocrv5_server_det/modular_pp_ocrv5_server_det.py @@ -20,6 +20,7 @@ import torch.nn as nn import torch.nn.functional as F import torchvision.transforms.v2.functional as tvF +from huggingface_hub.dataclasses import strict from ...activations import ACT2FN from ...backbone_utils import consolidate_backbone_kwargs_to_config, load_backbone @@ -56,9 +57,10 @@ logger = logging.get_logger(__name__) -@auto_docstring( - checkpoint="PaddlePaddle/PP-OCRv5-server-det", - custom_args=r""" +@auto_docstring(checkpoint="PaddlePaddle/PP-OCRv5-server-det") +@strict(accept_kwargs=True) +class PPOCRV5ServerDetConfig(PreTrainedConfig): + r""" interpolate_mode (`str`, *optional*, defaults to `"nearest"`): The interpolation mode used for upsampling or downsampling feature maps in the neck network. neck_out_channels (`int`, *optional*, defaults to 256): @@ -75,31 +77,26 @@ A list of scaling factors used for spatial resolution adjustments in the feature maps. kernel_list (`list[int]`, *optional*, defaults to `[3, 2, 2]`): The list of kernel sizes for convolutional layers in the head network for multi-scale feature extraction. 
- """, -) -class PPOCRV5ServerDetConfig(PreTrainedConfig): + """ + sub_configs = {"backbone_config": AutoConfig} model_type = "pp_ocrv5_server_det" - def __init__( - self, - interpolate_mode: str = "nearest", - backbone_config=None, - neck_out_channels: int = 256, - reduce_factor: int = 2, - intraclass_block_number: int = 4, - intraclass_block_config: dict | None = None, - scale_factor: int = 2, - scale_factor_list: list | None = None, - hidden_act: str = "relu", - kernel_list: list | None = None, - **kwargs, - ): - self.interpolate_mode = interpolate_mode - - # ---- backbone ---- - backbone_config, kwargs = consolidate_backbone_kwargs_to_config( - backbone_config=backbone_config, + interpolate_mode: str = "nearest" + backbone_config: dict | PreTrainedConfig | None = None + neck_out_channels: int = 256 + reduce_factor: int = 2 + intraclass_block_number: int = 4 + intraclass_block_config: dict | None = None + scale_factor: int = 2 + scale_factor_list: list | None = None + hidden_act: str = "relu" + kernel_list: list | None = None + id2label: dict[int, str] | dict[str, str] | None = None + + def __post_init__(self, **kwargs): + self.backbone_config, kwargs = consolidate_backbone_kwargs_to_config( + backbone_config=self.backbone_config, default_config_type="hgnet_v2", default_config_kwargs={ "arch": "L", @@ -112,25 +109,10 @@ def __init__( }, **kwargs, ) - self.backbone_config = backbone_config - - # ---- neck ---- - self.neck_out_channels = neck_out_channels - self.reduce_factor = reduce_factor - self.scale_factor_list = scale_factor_list - self.intraclass_block_number = intraclass_block_number - self.intraclass_block_config = intraclass_block_config - - # ---- head ---- - self.scale_factor = scale_factor - self.hidden_act = hidden_act - self.kernel_list = kernel_list # For object detection pipeline compatibility: single class "text" - self.id2label = {0: "text"} - self.num_labels = 1 - - super().__init__(**kwargs) + self.id2label = {0: "text"} if self.id2label is None else self.id2label + super().__post_init__(**kwargs) class PPOCRV5ServerDetImageProcessorKwargs(ImagesKwargs, total=False): diff --git a/src/transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py b/src/transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py index 7349e34c6afc..e385088f23dc 100644 --- a/src/transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +++ b/src/transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py @@ -17,6 +17,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from huggingface_hub.dataclasses import strict + from ...backbone_utils import consolidate_backbone_kwargs_to_config from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring @@ -24,6 +26,7 @@ @auto_docstring(checkpoint="LiheYoung/depth-anything-small-hf") +@strict(accept_kwargs=True) class PromptDepthAnythingConfig(PreTrainedConfig): r""" reassemble_hidden_size (`int`, *optional*, defaults to 384): @@ -62,23 +65,21 @@ class PromptDepthAnythingConfig(PreTrainedConfig): model_type = "prompt_depth_anything" sub_configs = {"backbone_config": AutoConfig} - def __init__( - self, - backbone_config=None, - patch_size=14, - initializer_range=0.02, - reassemble_hidden_size=384, - reassemble_factors=[4, 2, 1, 0.5], - neck_hidden_sizes=[48, 96, 192, 384], - fusion_hidden_size=64, - head_in_index=-1, - head_hidden_size=32, - depth_estimation_type="relative", - max_depth=None, - **kwargs, - ): - backbone_config, kwargs = consolidate_backbone_kwargs_to_config( - backbone_config=backbone_config, + backbone_config: dict | PreTrainedConfig | None = None + patch_size: int | list[int] | tuple[int, int] = 14 + initializer_range: float = 0.02 + reassemble_hidden_size: int = 384 + reassemble_factors: list[int | float] | tuple[int | float, ...] = (4, 2, 1, 0.5) + neck_hidden_sizes: list[int] | tuple[int, ...] = (48, 96, 192, 384) + fusion_hidden_size: int = 64 + head_in_index: int = -1 + head_hidden_size: int = 32 + depth_estimation_type: str = "relative" + max_depth: int | None = None + + def __post_init__(self, **kwargs): + self.backbone_config, kwargs = consolidate_backbone_kwargs_to_config( + backbone_config=self.backbone_config, default_config_type="dinov2", default_config_kwargs={ "image_size": 518, @@ -90,21 +91,13 @@ def __init__( **kwargs, ) - self.backbone_config = backbone_config - self.reassemble_hidden_size = reassemble_hidden_size - self.patch_size = patch_size - self.initializer_range = initializer_range - self.reassemble_factors = reassemble_factors - self.neck_hidden_sizes = neck_hidden_sizes - self.fusion_hidden_size = fusion_hidden_size - self.head_in_index = head_in_index - self.head_hidden_size = head_hidden_size - if depth_estimation_type not in ["relative", "metric"]: - raise ValueError("depth_estimation_type must be one of ['relative', 'metric']") - self.depth_estimation_type = depth_estimation_type - self.max_depth = max_depth if max_depth else 1 + self.max_depth = self.max_depth if self.max_depth else 1 + super().__post_init__(**kwargs) - super().__init__(**kwargs) + def validate_architecture(self): + """Part of `@strict`-powered validation. 
Validates the architecture of the config.""" + if self.depth_estimation_type not in ["relative", "metric"]: + raise ValueError("depth_estimation_type must be one of ['relative', 'metric']") __all__ = ["PromptDepthAnythingConfig"] diff --git a/src/transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py b/src/transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py index 8d150c4d1b5d..6658a240230b 100644 --- a/src/transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +++ b/src/transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py @@ -32,7 +32,7 @@ class PromptDepthAnythingConfig(DepthAnythingConfig): - model_type = "prompt_depth_anything" + pass class PromptDepthAnythingLayer(nn.Module): diff --git a/src/transformers/models/prophetnet/configuration_prophetnet.py b/src/transformers/models/prophetnet/configuration_prophetnet.py index c7f719987061..60484fb08e73 100644 --- a/src/transformers/models/prophetnet/configuration_prophetnet.py +++ b/src/transformers/models/prophetnet/configuration_prophetnet.py @@ -13,16 +13,14 @@ # limitations under the License. """ProphetNet model configuration""" -from collections.abc import Callable +from huggingface_hub.dataclasses import strict from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - - -logger = logging.get_logger(__name__) +from ...utils import auto_docstring @auto_docstring(checkpoint="microsoft/prophetnet-large-uncased") +@strict(accept_kwargs=True) class ProphetNetConfig(PreTrainedConfig): r""" ngram (`int`, *optional*, defaults to 2): @@ -47,72 +45,34 @@ class ProphetNetConfig(PreTrainedConfig): "num_attention_heads": "num_encoder_attention_heads", } - def __init__( - self, - activation_dropout: float | None = 0.1, - activation_function: str | Callable | None = "gelu", - vocab_size: int | None = 30522, - hidden_size: int | None = 1024, - encoder_ffn_dim: int | None = 4096, - num_encoder_layers: int | None = 12, - num_encoder_attention_heads: int | None = 16, - decoder_ffn_dim: int | None = 4096, - num_decoder_layers: int | None = 12, - num_decoder_attention_heads: int | None = 16, - attention_dropout: float | None = 0.1, - dropout: float | None = 0.1, - max_position_embeddings: int | None = 512, - init_std: float | None = 0.02, - is_encoder_decoder: bool | None = True, - add_cross_attention: bool | None = True, - decoder_start_token_id: int | None = 0, - ngram: int | None = 2, - num_buckets: int | None = 32, - relative_max_distance: int | None = 128, - disable_ngram_loss: bool | None = False, - eps: float | None = 0.0, - use_cache: bool | None = True, - pad_token_id: int | None = 0, - bos_token_id: int | None = 1, - eos_token_id: int | None = 2, - is_decoder: bool | None = False, - tie_word_embeddings: bool | None = True, - **kwargs, - ): - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.encoder_ffn_dim = encoder_ffn_dim - self.num_encoder_layers = num_encoder_layers - self.num_encoder_attention_heads = num_encoder_attention_heads - self.decoder_ffn_dim = decoder_ffn_dim - self.num_decoder_layers = num_decoder_layers - self.num_decoder_attention_heads = num_decoder_attention_heads - self.max_position_embeddings = max_position_embeddings - self.init_std = init_std # Normal(0, this parameter) - self.activation_function = activation_function - - # parameters for prophetnet - self.ngram = ngram - self.num_buckets = num_buckets - self.relative_max_distance = relative_max_distance - self.disable_ngram_loss 
= disable_ngram_loss - self.eps = eps - - # 3 Types of Dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.dropout = dropout - - self.use_cache = use_cache - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.add_cross_attention = add_cross_attention - self.decoder_start_token_id = decoder_start_token_id - self.is_decoder = is_decoder - self.tie_word_embeddings = tie_word_embeddings - - super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + activation_dropout: float | int = 0.1 + activation_function: str = "gelu" + vocab_size: int = 30522 + hidden_size: int = 1024 + encoder_ffn_dim: int = 4096 + num_encoder_layers: int = 12 + num_encoder_attention_heads: int = 16 + decoder_ffn_dim: int = 4096 + num_decoder_layers: int = 12 + num_decoder_attention_heads: int = 16 + attention_dropout: float | int = 0.1 + dropout: float | int = 0.1 + max_position_embeddings: int = 512 + init_std: float = 0.02 + is_encoder_decoder: bool = True + add_cross_attention: bool = True + decoder_start_token_id: int | None = 0 + ngram: int = 2 + num_buckets: int = 32 + relative_max_distance: int = 128 + disable_ngram_loss: bool = False + eps: float = 0.0 + use_cache: bool = True + pad_token_id: int | None = 0 + bos_token_id: int | None = 1 + eos_token_id: int | list[int] | None = 2 + is_decoder: bool = False + tie_word_embeddings: bool = True @property def num_hidden_layers(self) -> int: diff --git a/src/transformers/models/prophetnet/modeling_prophetnet.py b/src/transformers/models/prophetnet/modeling_prophetnet.py index c8b8db547a7c..47a721bf16ab 100644 --- a/src/transformers/models/prophetnet/modeling_prophetnet.py +++ b/src/transformers/models/prophetnet/modeling_prophetnet.py @@ -993,7 +993,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if input_ids is None and inputs_embeds is None: raise ValueError("Either input_ids or inputs_embeds has to be passed.") @@ -1114,7 +1114,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if input_ids is None and inputs_embeds is None: raise ValueError("Either `decoder_input_ids` or `decoder_inputs_embeds` has to be passed.") @@ -1437,7 +1437,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if encoder_outputs is None: encoder_outputs = self.encoder( @@ -1561,7 +1561,7 @@ def forward( >>> logits_next_token = outputs.logits # logits to predict next token as usual >>> logits_ngram_next_tokens = outputs.logits_ngram # logits to predict 2nd, 3rd, ... 
next tokens ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None: # get decoder inputs from shifting lm labels to the right @@ -1749,7 +1749,7 @@ def forward( >>> loss = outputs.loss ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict # decoder outputs consists of (dec_features, past_key_values, dec_hidden, dec_attn) outputs = self.prophetnet.decoder( diff --git a/src/transformers/models/pvt/configuration_pvt.py b/src/transformers/models/pvt/configuration_pvt.py index 9a488ca1d85b..af03b4f36b83 100644 --- a/src/transformers/models/pvt/configuration_pvt.py +++ b/src/transformers/models/pvt/configuration_pvt.py @@ -15,16 +15,14 @@ # limitations under the License. """Pvt model configuration""" -from collections.abc import Callable, Mapping +from huggingface_hub.dataclasses import strict from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - - -logger = logging.get_logger(__name__) +from ...utils import auto_docstring @auto_docstring(checkpoint="Xrenya/pvt-tiny-224") +@strict(accept_kwargs=True) class PvtConfig(PreTrainedConfig): r""" num_encoder_blocks (`int`, *optional*, defaults to 4): @@ -62,48 +60,24 @@ class PvtConfig(PreTrainedConfig): model_type = "pvt" - def __init__( - self, - image_size: int = 224, - num_channels: int = 3, - num_encoder_blocks: int = 4, - depths: list[int] = [2, 2, 2, 2], - sequence_reduction_ratios: list[int] = [8, 4, 2, 1], - hidden_sizes: list[int] = [64, 128, 320, 512], - patch_sizes: list[int] = [4, 2, 2, 2], - strides: list[int] = [4, 2, 2, 2], - num_attention_heads: list[int] = [1, 2, 5, 8], - mlp_ratios: list[int] = [8, 8, 4, 4], - hidden_act: Mapping[str, Callable] = "gelu", - hidden_dropout_prob: float = 0.0, - attention_probs_dropout_prob: float = 0.0, - initializer_range: float = 0.02, - drop_path_rate: float = 0.0, - layer_norm_eps: float = 1e-6, - qkv_bias: bool = True, - num_labels: int = 1000, - **kwargs, - ): - super().__init__(**kwargs) - - self.image_size = image_size - self.num_channels = num_channels - self.num_encoder_blocks = num_encoder_blocks - self.depths = depths - self.sequence_reduction_ratios = sequence_reduction_ratios - self.hidden_sizes = hidden_sizes - self.patch_sizes = patch_sizes - self.strides = strides - self.mlp_ratios = mlp_ratios - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.initializer_range = initializer_range - self.drop_path_rate = drop_path_rate - self.layer_norm_eps = layer_norm_eps - self.num_labels = num_labels - self.qkv_bias = qkv_bias + image_size: int | list[int] | tuple[int, int] = 224 + num_channels: int = 3 + num_encoder_blocks: int = 4 + depths: list[int] | tuple[int, ...] = (2, 2, 2, 2) + sequence_reduction_ratios: list[int] | tuple[int, ...] = (8, 4, 2, 1) + hidden_sizes: list[int] | tuple[int, ...] = (64, 128, 320, 512) + patch_sizes: list[int] | tuple[int, ...] = (4, 2, 2, 2) + strides: list[int] | tuple[int, ...] = (4, 2, 2, 2) + num_attention_heads: list[int] | tuple[int, ...] = (1, 2, 5, 8) + mlp_ratios: list[int] | tuple[int, ...] 
= (8, 8, 4, 4) + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.0 + attention_probs_dropout_prob: float = 0.0 + initializer_range: float = 0.02 + drop_path_rate: float = 0.0 + layer_norm_eps: float = 1e-6 + qkv_bias: bool = True + num_labels: int = 1000 __all__ = ["PvtConfig"] diff --git a/src/transformers/models/pvt/modeling_pvt.py b/src/transformers/models/pvt/modeling_pvt.py index dd22b7a1c5d7..77be8470212d 100755 --- a/src/transformers/models/pvt/modeling_pvt.py +++ b/src/transformers/models/pvt/modeling_pvt.py @@ -462,7 +462,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict encoder_outputs = self.encoder( pixel_values=pixel_values, @@ -519,7 +519,7 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.pvt( pixel_values=pixel_values, diff --git a/src/transformers/models/pvt_v2/configuration_pvt_v2.py b/src/transformers/models/pvt_v2/configuration_pvt_v2.py index 253465f84ebf..e880a9c22944 100644 --- a/src/transformers/models/pvt_v2/configuration_pvt_v2.py +++ b/src/transformers/models/pvt_v2/configuration_pvt_v2.py @@ -15,17 +15,15 @@ # limitations under the License. """Pvt V2 model configuration""" -from collections.abc import Callable +from huggingface_hub.dataclasses import strict from ...backbone_utils import BackboneConfigMixin from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - - -logger = logging.get_logger(__name__) +from ...utils import auto_docstring @auto_docstring(checkpoint="OpenGVLab/pvt_v2_b0") +@strict(accept_kwargs=True) class PvtV2Config(BackboneConfigMixin, PreTrainedConfig): r""" sr_ratios (`list[int]`, *optional*, defaults to `[8, 4, 2, 1]`): @@ -62,54 +60,34 @@ class PvtV2Config(BackboneConfigMixin, PreTrainedConfig): model_type = "pvt_v2" - def __init__( - self, - image_size: int | tuple[int, int] = 224, - num_channels: int = 3, - num_encoder_blocks: int = 4, - depths: list[int] = [2, 2, 2, 2], - sr_ratios: list[int] = [8, 4, 2, 1], - hidden_sizes: list[int] = [32, 64, 160, 256], - patch_sizes: list[int] = [7, 3, 3, 3], - strides: list[int] = [4, 2, 2, 2], - num_attention_heads: list[int] = [1, 2, 5, 8], - mlp_ratios: list[int] = [8, 8, 4, 4], - hidden_act: str | Callable = "gelu", - hidden_dropout_prob: float = 0.0, - attention_probs_dropout_prob: float = 0.0, - initializer_range: float = 0.02, - drop_path_rate: float = 0.0, - layer_norm_eps: float = 1e-6, - qkv_bias: bool = True, - linear_attention: bool = False, - out_features=None, - out_indices=None, - **kwargs, - ): - super().__init__(**kwargs) - - image_size = (image_size, image_size) if isinstance(image_size, int) else image_size - - self.image_size = image_size - self.num_channels = num_channels - self.num_encoder_blocks = num_encoder_blocks - self.depths = depths - self.sr_ratios = sr_ratios - self.hidden_sizes = hidden_sizes - self.patch_sizes = patch_sizes - self.strides = strides - self.mlp_ratios = mlp_ratios - self.num_attention_heads = num_attention_heads 
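# Editor's illustration (not part of this PR's diff): a minimal sketch of the refactor
# pattern applied throughout these files, using a hypothetical `ToyConfig`. Hand-written
# `__init__` bodies become class-level typed fields on a config decorated with
# `@strict(accept_kwargs=True)`, and mutable list defaults such as `[2, 2, 2, 2]` become
# tuples so they are safe as class attributes. Assumes the refactored branch of
# transformers where `PreTrainedConfig` is itself a strict dataclass.
from huggingface_hub.dataclasses import strict

from transformers.configuration_utils import PreTrainedConfig


@strict(accept_kwargs=True)
class ToyConfig(PreTrainedConfig):
    model_type = "toy"  # hypothetical model type, for illustration only

    hidden_size: int = 64
    depths: list[int] | tuple[int, ...] = (2, 2, 2, 2)  # tuple default instead of a mutable list
    hidden_act: str = "gelu"

# Construction is unchanged for callers: `ToyConfig(hidden_size=128)` still works, and
# unknown keyword arguments are still tolerated because of `accept_kwargs=True`, while
# the declared fields are type-checked by `@strict`.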
- self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.initializer_range = initializer_range - self.drop_path_rate = drop_path_rate - self.layer_norm_eps = layer_norm_eps - self.qkv_bias = qkv_bias - self.linear_attention = linear_attention - self.stage_names = [f"stage{idx}" for idx in range(1, len(depths) + 1)] - self.set_output_features_output_indices(out_indices=out_indices, out_features=out_features) + image_size: int | list[int] | tuple[int, int] | dict = 224 + num_channels: int = 3 + num_encoder_blocks: int = 4 + depths: list[int] | tuple[int, ...] = (2, 2, 2, 2) + sr_ratios: list[int] | tuple[int, ...] = (8, 4, 2, 1) + hidden_sizes: list[int] | tuple[int, ...] = (32, 64, 160, 256) + patch_sizes: list[int] | tuple[int, ...] = (7, 3, 3, 3) + strides: list[int] | tuple[int, ...] = (4, 2, 2, 2) + num_attention_heads: list[int] | tuple[int, ...] = (1, 2, 5, 8) + mlp_ratios: list[int] | tuple[int, ...] = (8, 8, 4, 4) + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.0 + attention_probs_dropout_prob: float = 0.0 + initializer_range: float = 0.02 + drop_path_rate: float = 0.0 + layer_norm_eps: float = 1e-6 + qkv_bias: bool = True + linear_attention: bool = False + _out_features: list[str] | None = None + _out_indices: list[int] | None = None + + def __post_init__(self, **kwargs): + self.image_size = (self.image_size, self.image_size) if isinstance(self.image_size, int) else self.image_size + self.stage_names = [f"stage{idx}" for idx in range(1, len(self.depths) + 1)] + self.set_output_features_output_indices( + out_indices=kwargs.pop("out_indices", None), out_features=kwargs.pop("out_features", None) + ) + super().__post_init__(**kwargs) __all__ = ["PvtV2Config"] diff --git a/src/transformers/models/pvt_v2/modeling_pvt_v2.py b/src/transformers/models/pvt_v2/modeling_pvt_v2.py index 8809b2b4b6ca..4ecf3a92b06d 100644 --- a/src/transformers/models/pvt_v2/modeling_pvt_v2.py +++ b/src/transformers/models/pvt_v2/modeling_pvt_v2.py @@ -411,7 +411,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict encoder_outputs = self.encoder( pixel_values=pixel_values, @@ -468,7 +468,7 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.pvt_v2( pixel_values=pixel_values, @@ -553,7 +553,7 @@ def forward( >>> list(feature_maps[-1].shape) [1, 256, 7, 7] ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) diff --git a/src/transformers/models/qwen2/configuration_qwen2.py b/src/transformers/models/qwen2/configuration_qwen2.py index f9726bf27a0c..f3f17cdc295b 100644 --- a/src/transformers/models/qwen2/configuration_qwen2.py +++ b/src/transformers/models/qwen2/configuration_qwen2.py @@ -13,15 +13,15 @@ # limitations under the License. """Qwen2 model configuration""" -from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...modeling_rope_utils import RopeParameters +from ...utils import auto_docstring @auto_docstring(checkpoint="Qwen/Qwen2-7B") +@strict(accept_kwargs=True) class Qwen2Config(PreTrainedConfig): r""" Example: @@ -58,53 +58,33 @@ class Qwen2Config(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size: int | None = 151936, - hidden_size: int | None = 4096, - intermediate_size: int | None = 22016, - num_hidden_layers: int | None = 32, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = 32, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 32768, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-6, - use_cache: bool | None = True, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - use_sliding_window: bool | None = False, - sliding_window: int | None = 4096, - max_window_layers: int | None = 28, - layer_types: list[str] | None = None, - attention_dropout: float | None = 0.0, - pad_token_id: int | None = None, - bos_token_id: int | None = None, - eos_token_id: int | None = None, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.use_sliding_window = use_sliding_window - self.sliding_window = sliding_window if self.use_sliding_window else None - self.max_window_layers = max_window_layers - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_dropout = attention_dropout - - self.layer_types = layer_types + vocab_size: int = 151936 + hidden_size: int = 4096 + intermediate_size: int = 22016 + num_hidden_layers: int = 32 + num_attention_heads: int = 32 + num_key_value_heads: int | None = 32 + hidden_act: str = "silu" + max_position_embeddings: int = 32768 
+ initializer_range: float = 0.02 + rms_norm_eps: float = 1e-6 + use_cache: bool = True + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + use_sliding_window: bool = False + sliding_window: int | None = 4096 + max_window_layers: int = 28 + layer_types: list[str] | None = None + attention_dropout: float | int = 0.0 + pad_token_id: int | None = None + bos_token_id: int | None = None + eos_token_id: int | list[int] | None = None + + def __post_init__(self, **kwargs): + self.sliding_window = self.sliding_window if self.use_sliding_window else None + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads + if self.layer_types is None: self.layer_types = [ "sliding_attention" @@ -112,15 +92,8 @@ def __init__( else "full_attention" for i in range(self.num_hidden_layers) ] - layer_type_validation(self.layer_types, self.num_hidden_layers) - - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - self.rope_parameters = rope_parameters - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["Qwen2Config"] diff --git a/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py index d46536659543..bf05358e54ae 100644 --- a/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py @@ -18,7 +18,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from ...configuration_utils import PreTrainedConfig, layer_type_validation +from huggingface_hub.dataclasses import strict + +from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring, logging @@ -27,6 +29,7 @@ @auto_docstring(checkpoint="Qwen/Qwen2.5-Omni-7B") +@strict(accept_kwargs=True) class Qwen2_5OmniVisionEncoderConfig(PreTrainedConfig): r""" fullatt_block_indexes (`int`, *optional*, defaults to `[7, 15, 23, 31]`): @@ -54,41 +57,23 @@ class Qwen2_5OmniVisionEncoderConfig(PreTrainedConfig): model_type = "qwen2_5_omni_vision_encoder" base_config_key = "vision_config" - def __init__( - self, - depth=32, - hidden_size=3584, - hidden_act="silu", - intermediate_size=3420, - num_heads=16, - in_channels=3, - patch_size=14, - spatial_merge_size=2, - temporal_patch_size=2, - window_size=112, - out_hidden_size=3584, - fullatt_block_indexes=[7, 15, 23, 31], - initializer_range=0.02, - **kwargs, - ): - super().__init__(**kwargs) - - self.depth = depth - self.hidden_size = hidden_size - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.num_heads = num_heads - self.in_channels = in_channels - self.patch_size = patch_size - self.spatial_merge_size = spatial_merge_size - self.temporal_patch_size = temporal_patch_size - self.window_size = window_size - self.fullatt_block_indexes = fullatt_block_indexes - self.out_hidden_size = out_hidden_size - self.initializer_range = initializer_range + depth: int = 32 + hidden_size: int = 3584 + hidden_act: str = "silu" + intermediate_size: int = 3420 + num_heads: int = 16 + in_channels: int = 3 + patch_size: int | list[int] | tuple[int, int] = 14 + spatial_merge_size: int = 2 + temporal_patch_size: int | list[int] | tuple[int, int] = 2 + window_size: 
int = 112 + out_hidden_size: int = 3584 + fullatt_block_indexes: list[int] | tuple[int, ...] = (7, 15, 23, 31) + initializer_range: float = 0.02 @auto_docstring(checkpoint="Qwen/Qwen2.5-Omni-7B") +@strict(accept_kwargs=True) class Qwen2_5OmniAudioEncoderConfig(PreTrainedConfig): r""" max_source_positions (`int`, *optional*, defaults to 1500): @@ -114,45 +99,27 @@ class Qwen2_5OmniAudioEncoderConfig(PreTrainedConfig): ```""" model_type = "qwen2_5_omni_audio_encoder" + attribute_map = {"num_hidden_layers": "encoder_layers"} + + num_mel_bins: int = 128 + encoder_layers: int = 32 + encoder_attention_heads: int = 20 + encoder_ffn_dim: int = 5120 + d_model: int = 1280 + dropout: float | int = 0.0 + attention_dropout: float | int = 0.0 + activation_function: str = "gelu" + activation_dropout: float | int = 0.0 + scale_embedding: bool = False + initializer_range: float = 0.02 + max_source_positions: int = 1500 - def __init__( - self, - num_mel_bins=128, - encoder_layers=32, - encoder_attention_heads=20, - encoder_ffn_dim=5120, - d_model=1280, - dropout=0, - attention_dropout=0, - activation_function="gelu", - activation_dropout=0, - scale_embedding=False, - initializer_range=0.02, - max_source_positions=1500, - n_window=100, - output_dim=3584, - **kwargs, - ): - super().__init__(**kwargs) - - self.num_mel_bins = num_mel_bins - self.d_model = d_model - self.encoder_layers = encoder_layers - self.encoder_attention_heads = encoder_attention_heads - self.encoder_ffn_dim = encoder_ffn_dim - self.dropout = dropout - self.attention_dropout = attention_dropout - self.activation_function = activation_function - self.activation_dropout = activation_dropout - self.num_hidden_layers = encoder_layers - self.initializer_range = initializer_range - self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True - self.max_source_positions = max_source_positions - self.n_window = n_window - self.output_dim = output_dim + n_window: int = 100 + output_dim: int = 3584 @auto_docstring(checkpoint="Qwen/Qwen2.5-Omni-7B") +@strict(accept_kwargs=True) class Qwen2_5OmniTextConfig(PreTrainedConfig): r""" max_window_layers (`int`, *optional*, defaults to 28): @@ -199,58 +166,35 @@ class Qwen2_5OmniTextConfig(PreTrainedConfig): "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), "norm": (["hidden_states"], ["hidden_states"]), } + ignore_keys_at_rope_validation = {"mrope_section"} + + vocab_size: int = 152064 + hidden_size: int = 3584 + intermediate_size: int = 18944 + num_hidden_layers: int = 28 + num_attention_heads: int = 28 + num_key_value_heads: int | None = 4 + hidden_act: str = "silu" + max_position_embeddings: int = 32768 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-6 + use_cache: bool = True + rope_parameters: RopeParameters | dict | None = None + use_sliding_window: bool = False + sliding_window: int | None = 32768 + max_window_layers: int = 28 + layer_types: list[str] | None = None + attention_dropout: float | int = 0.0 + pad_token_id: int | None = None + bos_token_id: int | None = None + eos_token_id: int | list[int] | None = None + tie_word_embeddings: bool = True + + def __post_init__(self, **kwargs): + self.sliding_window = self.sliding_window if self.use_sliding_window else None + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads - def __init__( - self, - vocab_size: int | None = 152064, - hidden_size: int | None = 3584, - intermediate_size: int | None = 18944, - num_hidden_layers: int | None = 28, - 
num_attention_heads: int | None = 28, - num_key_value_heads: int | None = 4, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 32768, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-6, - use_cache: bool | None = True, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - use_sliding_window: bool | None = False, - sliding_window: int | None = 32768, - max_window_layers: int | None = 28, - layer_types: list[str] | None = None, - attention_dropout: float | None = 0.0, - pad_token_id: int | None = None, - bos_token_id: int | None = None, - eos_token_id: int | None = None, - tie_word_embeddings: bool | None = True, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.use_sliding_window = use_sliding_window - self.sliding_window = sliding_window if self.use_sliding_window else None - self.max_window_layers = max_window_layers - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_dropout = attention_dropout - - self.layer_types = layer_types if self.layer_types is None: self.layer_types = [ "sliding_attention" @@ -258,16 +202,12 @@ def __init__( else "full_attention" for i in range(self.num_hidden_layers) ] - layer_type_validation(self.layer_types, self.num_hidden_layers) - self.rope_parameters = rope_parameters - super().__init__( - ignore_keys_at_rope_validation={"mrope_section"}, - **kwargs, - ) + super().__post_init__(**kwargs) @auto_docstring(checkpoint="Qwen/Qwen2.5-Omni-7B") +@strict(accept_kwargs=True) class Qwen2_5OmniThinkerConfig(PreTrainedConfig): r""" position_id_per_seconds (`int`, *optional*, defaults to 25): @@ -317,56 +257,41 @@ class Qwen2_5OmniThinkerConfig(PreTrainedConfig): "text_config": Qwen2_5OmniTextConfig, } - def __init__( - self, - audio_config=None, - vision_config=None, - text_config=None, - audio_token_index=151646, - image_token_index=151655, - video_token_index=151656, - position_id_per_seconds=25, - seconds_per_chunk=2, - audio_start_token_id=151647, - audio_end_token_id=151648, - user_token_id=872, - initializer_range=0.02, - tie_word_embeddings=False, - **kwargs, - ): - self.audio_token_index = audio_token_index - self.image_token_index = image_token_index - self.video_token_index = video_token_index - self.user_token_id = user_token_id - self.position_id_per_seconds = position_id_per_seconds - self.seconds_per_chunk = seconds_per_chunk - self.audio_start_token_id = audio_start_token_id - self.audio_end_token_id = audio_end_token_id - self.initializer_range = initializer_range - self.tie_word_embeddings = tie_word_embeddings - - if isinstance(vision_config, dict): - vision_config = Qwen2_5OmniVisionEncoderConfig(**vision_config) - elif vision_config is None: - vision_config = Qwen2_5OmniVisionEncoderConfig() - self.vision_config = vision_config - - if isinstance(audio_config, dict): - audio_config = Qwen2_5OmniAudioEncoderConfig(**audio_config) - 
elif audio_config is None: - audio_config = Qwen2_5OmniAudioEncoderConfig() - self.audio_config = audio_config - - if isinstance(text_config, dict): - text_config = Qwen2_5OmniTextConfig(**text_config) - elif text_config is None: - text_config = Qwen2_5OmniTextConfig() - self.text_config = text_config - - super().__init__(**kwargs) + audio_config: dict | PreTrainedConfig | None = None + vision_config: dict | PreTrainedConfig | None = None + text_config: dict | PreTrainedConfig | None = None + audio_token_index: int = 151646 + image_token_index: int = 151655 + video_token_index: int = 151656 + position_id_per_seconds: int = 25 + seconds_per_chunk: int = 2 + audio_start_token_id: int = 151647 + audio_end_token_id: int = 151648 + user_token_id: int = 872 + initializer_range: float = 0.02 + tie_word_embeddings: bool = False + + def __post_init__(self, **kwargs): + if isinstance(self.vision_config, dict): + self.vision_config = Qwen2_5OmniVisionEncoderConfig(**self.vision_config) + elif self.vision_config is None: + self.vision_config = Qwen2_5OmniVisionEncoderConfig() + + if isinstance(self.audio_config, dict): + self.audio_config = Qwen2_5OmniAudioEncoderConfig(**self.audio_config) + elif self.audio_config is None: + self.audio_config = Qwen2_5OmniAudioEncoderConfig() + + if isinstance(self.text_config, dict): + self.text_config = Qwen2_5OmniTextConfig(**self.text_config) + elif self.text_config is None: + self.text_config = Qwen2_5OmniTextConfig() + + super().__post_init__(**kwargs) @auto_docstring(checkpoint="Qwen/Qwen2.5-Omni-7B") +@strict(accept_kwargs=True) class Qwen2_5OmniTalkerConfig(PreTrainedConfig): r""" tts_text_start_token_id (`int`, *optional*, defaults to 151860): @@ -423,97 +348,53 @@ class Qwen2_5OmniTalkerConfig(PreTrainedConfig): "video_token_id": "video_token_index", "audio_token_id": "audio_token_index", } + ignore_keys_at_rope_validation = {"mrope_section"} + + audio_token_index: int = 151646 + image_token_index: int = 151655 + video_token_index: int = 151656 + vocab_size: int = 8448 + tts_text_start_token_id: int = 151860 + tts_text_end_token_id: int = 151861 + tts_text_pad_token_id: int = 151859 + tts_codec_start_token_id: int = 8293 + tts_codec_end_token_id: int = 8294 + tts_codec_pad_token_id: int = 8292 + tts_codec_mask_token_id: int = 8296 + vision_start_token_id: int = 151652 + vision_end_token_id: int = 151653 + embedding_size: int = 3584 + hidden_size: int = 3584 + intermediate_size: int = 18944 + num_hidden_layers: int = 28 + num_attention_heads: int = 28 + num_key_value_heads: int = 4 + hidden_act: str = "silu" + max_position_embeddings: int = 32768 + rms_norm_eps: float = 1e-06 + head_dim: int = 128 + use_cache: bool = True + tie_word_embeddings: bool = False + use_sliding_window: bool = False + sliding_window: int | None = 32768 + max_window_layers: int = 28 + attention_dropout: float | int = 0.0 + rope_parameters: RopeParameters | dict | None = None + position_id_per_seconds: int = 25 + seconds_per_chunk: int = 2 + audio_start_token_id: int = 151647 + audio_end_token_id: int = 151648 + initializer_range: float = 0.02 + spatial_merge_size: int = 2 + layer_types: list[str] | None = None + pad_token_id: int | None = None + + def __post_init__(self, **kwargs): + self.sliding_window = self.sliding_window if self.use_sliding_window else None + + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads - def __init__( - self, - audio_token_index=151646, - image_token_index=151655, - video_token_index=151656, - vocab_size=8448, - 
tts_text_start_token_id=151860, - tts_text_end_token_id=151861, - tts_text_pad_token_id=151859, - tts_codec_start_token_id=8293, - tts_codec_end_token_id=8294, - tts_codec_pad_token_id=8292, - tts_codec_mask_token_id=8296, - vision_start_token_id=151652, - vision_end_token_id=151653, - embedding_size=3584, - hidden_size=3584, - intermediate_size=18944, - num_hidden_layers=28, - num_attention_heads=28, - num_key_value_heads=4, - hidden_act="silu", - max_position_embeddings=32768, - rms_norm_eps=1e-06, - head_dim=128, - use_cache=True, - tie_word_embeddings=False, - use_sliding_window=False, - sliding_window=32768, - max_window_layers=28, - attention_dropout=0.0, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - position_id_per_seconds=25, - seconds_per_chunk=2, - audio_start_token_id=151647, - audio_end_token_id=151648, - initializer_range=0.02, - spatial_merge_size=2, - layer_types=None, - pad_token_id: int | None = None, - **kwargs, - ): - self.audio_token_index = audio_token_index - self.image_token_index = image_token_index - self.video_token_index = video_token_index - - self.tts_text_start_token_id = tts_text_start_token_id - self.tts_text_end_token_id = tts_text_end_token_id - self.tts_text_pad_token_id = tts_text_pad_token_id - self.tts_codec_start_token_id = tts_codec_start_token_id - self.tts_codec_end_token_id = tts_codec_end_token_id - self.tts_codec_pad_token_id = tts_codec_pad_token_id - - self.tts_codec_mask_token_id = tts_codec_mask_token_id - - self.vision_start_token_id = vision_start_token_id - self.vision_end_token_id = vision_end_token_id - - self.vocab_size = vocab_size - self.head_dim = head_dim - self.embedding_size = embedding_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.use_sliding_window = use_sliding_window - self.sliding_window = sliding_window if self.use_sliding_window else None - self.max_window_layers = max_window_layers - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_dropout = attention_dropout - self.position_id_per_seconds = position_id_per_seconds # zf - self.seconds_per_chunk = seconds_per_chunk # zf - self.audio_start_token_id = audio_start_token_id # zf - self.audio_end_token_id = audio_end_token_id # zf - self.pad_token_id = pad_token_id - - self.initializer_range = initializer_range - self.spatial_merge_size = spatial_merge_size - self.tie_word_embeddings = tie_word_embeddings - - self.layer_types = layer_types if self.layer_types is None: self.layer_types = [ "sliding_attention" @@ -521,13 +402,12 @@ def __init__( else "full_attention" for i in range(self.num_hidden_layers) ] - layer_type_validation(self.layer_types, self.num_hidden_layers) - self.rope_parameters = rope_parameters - super().__init__(ignore_keys_at_rope_validation={"mrope_section"}, **kwargs) + super().__post_init__(**kwargs) @auto_docstring(checkpoint="Qwen/Qwen2.5-Omni-7B") +@strict(accept_kwargs=True) class Qwen2_5OmniDiTConfig(PreTrainedConfig): r""" ff_mult (`int`, *optional*, defaults to 2): @@ -566,61 +446,33 @@ class Qwen2_5OmniDiTConfig(PreTrainedConfig): model_type = "qwen2_5_omni_dit" - def __init__( - self, - hidden_size=1024, - 
num_hidden_layers=22, - num_attention_heads=16, - ff_mult=2, - emb_dim=512, - head_dim=64, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - max_position_embeddings=32768, - block_size=24, - look_ahead_layers=[10], - look_backward_layers=[0, 20], - repeats=2, - num_embeds=8193, - mel_dim=80, - dropout=0.1, - enc_emb_dim=192, - enc_dim=128, - enc_channels=[256, 256, 256, 256, 768], - enc_kernel_sizes=[5, 3, 3, 3, 1], - enc_dilations=[1, 2, 3, 4, 1], - enc_attention_channels=64, - enc_res2net_scale=2, - enc_se_channels=64, - **kwargs, - ): - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.ff_mult = ff_mult - self.emb_dim = emb_dim - self.head_dim = head_dim - self.max_position_embeddings = max_position_embeddings - self.block_size = block_size - self.look_ahead_layers = look_ahead_layers - self.look_backward_layers = look_backward_layers - self.repeats = repeats - self.num_embeds = num_embeds - self.mel_dim = mel_dim - self.dropout = dropout - self.enc_emb_dim = enc_emb_dim - self.enc_dim = enc_dim - self.enc_channels = enc_channels - self.enc_kernel_sizes = enc_kernel_sizes - self.enc_dilations = enc_dilations - self.enc_attention_channels = enc_attention_channels - self.enc_res2net_scale = enc_res2net_scale - self.enc_se_channels = enc_se_channels - self.rope_parameters = rope_parameters - - super().__init__(**kwargs) + hidden_size: int = 1024 + num_hidden_layers: int = 22 + num_attention_heads: int = 16 + ff_mult: int = 2 + emb_dim: int = 512 + head_dim: int = 64 + rope_parameters: RopeParameters | dict | None = None + max_position_embeddings: int = 32768 + block_size: int = 24 + look_ahead_layers: list[int] | tuple[int, ...] = (10,) + look_backward_layers: list[int] | tuple[int, ...] = (0, 20) + repeats: int = 2 + num_embeds: int = 8193 + mel_dim: int = 80 + dropout: float | int = 0.1 + enc_emb_dim: int = 192 + enc_dim: int = 128 + enc_channels: list[int] | tuple[int, ...] = (256, 256, 256, 256, 768) + enc_kernel_sizes: list[int] | tuple[int, ...] = (5, 3, 3, 3, 1) + enc_dilations: list[int] | tuple[int, ...] = (1, 2, 3, 4, 1) + enc_attention_channels: int = 64 + enc_res2net_scale: int = 2 + enc_se_channels: int = 64 @auto_docstring(checkpoint="Qwen/Qwen2.5-Omni-7B") +@strict(accept_kwargs=True) class Qwen2_5OmniBigVGANConfig(PreTrainedConfig): r""" mel_dim (`int`, *optional*, defaults to 80): @@ -639,26 +491,16 @@ class Qwen2_5OmniBigVGANConfig(PreTrainedConfig): model_type = "qwen2_5_omni_bigvgan" - def __init__( - self, - mel_dim=80, - upsample_initial_channel=1536, - resblock_kernel_sizes=[3, 7, 11], - resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]], - upsample_rates=[5, 3, 2, 2, 2, 2], - upsample_kernel_sizes=[11, 7, 4, 4, 4, 4], - **kwargs, - ): - self.mel_dim = mel_dim - self.upsample_initial_channel = upsample_initial_channel - self.resblock_kernel_sizes = resblock_kernel_sizes - self.resblock_dilation_sizes = resblock_dilation_sizes - self.upsample_rates = upsample_rates - self.upsample_kernel_sizes = upsample_kernel_sizes - super().__init__(**kwargs) + mel_dim: int = 80 + upsample_initial_channel: int = 1536 + resblock_kernel_sizes: list[int] | tuple[int, ...] = (3, 7, 11) + resblock_dilation_sizes: list | tuple = ((1, 3, 5), (1, 3, 5), (1, 3, 5)) + upsample_rates: list[int] | tuple[int, ...] = (5, 3, 2, 2, 2, 2) + upsample_kernel_sizes: list[int] | tuple[int, ...] 
= (11, 7, 4, 4, 4, 4) @auto_docstring(checkpoint="Qwen/Qwen2.5-Omni-7B") +@strict(accept_kwargs=True) class Qwen2_5OmniToken2WavConfig(PreTrainedConfig): r""" dit_config ([`DiT_Args`], *optional*): @@ -702,17 +544,25 @@ class Qwen2_5OmniToken2WavConfig(PreTrainedConfig): "bigvgan_config": Qwen2_5OmniBigVGANConfig, } - def __init__(self, dit_config=None, bigvgan_config=None, **kwargs): - if dit_config is None: - dit_config = {} - if bigvgan_config is None: - bigvgan_config = {} - self.dit_config = Qwen2_5OmniDiTConfig(**dit_config) - self.bigvgan_config = Qwen2_5OmniBigVGANConfig(**bigvgan_config) - super().__init__(**kwargs) + dit_config: dict | PreTrainedConfig | None = None + bigvgan_config: dict | PreTrainedConfig | None = None + + def __post_init__(self, **kwargs): + if self.dit_config is None: + self.dit_config = Qwen2_5OmniDiTConfig() + elif isinstance(self.dit_config, dict): + self.dit_config = Qwen2_5OmniDiTConfig(**self.dit_config) + + if self.bigvgan_config is None: + self.bigvgan_config = Qwen2_5OmniBigVGANConfig() + elif isinstance(self.bigvgan_config, dict): + self.bigvgan_config = Qwen2_5OmniBigVGANConfig(**self.bigvgan_config) + + super().__post_init__(**kwargs) @auto_docstring(checkpoint="Qwen/Qwen2.5-Omni-7B") +@strict(accept_kwargs=True) class Qwen2_5OmniConfig(PreTrainedConfig): """ thinker_config (`dict`, *optional*): Configuration of the underlying thinker sub-model. @@ -757,32 +607,31 @@ class Qwen2_5OmniConfig(PreTrainedConfig): "token2wav_config": Qwen2_5OmniToken2WavConfig, } - def __init__( - self, - thinker_config=None, - talker_config=None, - token2wav_config=None, - enable_audio_output: bool = True, - **kwargs, - ): - if thinker_config is None: - thinker_config = {} + thinker_config: dict | PreTrainedConfig | None = None + talker_config: dict | PreTrainedConfig | None = None + token2wav_config: dict | PreTrainedConfig | None = None + enable_audio_output: bool = True + + def __post_init__(self, **kwargs): + if self.thinker_config is None: + self.thinker_config = Qwen2_5OmniThinkerConfig() logger.info("thinker_config is None. Initializing thinker model with default values") + elif isinstance(self.thinker_config, dict): + self.thinker_config = Qwen2_5OmniThinkerConfig(**self.thinker_config) - if talker_config is None: - talker_config = {} + if self.talker_config is None: + self.talker_config = Qwen2_5OmniTalkerConfig() logger.info("talker_config is None. Initializing talker model with default values") + elif isinstance(self.talker_config, dict): + self.talker_config = Qwen2_5OmniTalkerConfig(**self.talker_config) - if token2wav_config is None: - token2wav_config = {} + if self.token2wav_config is None: + self.token2wav_config = Qwen2_5OmniToken2WavConfig() logger.info("token2wav_config is None. 
Initializing token2wav model with default values") + elif isinstance(self.token2wav_config, dict): + self.token2wav_config = Qwen2_5OmniToken2WavConfig(**self.token2wav_config) - self.thinker_config = Qwen2_5OmniThinkerConfig(**thinker_config) - self.talker_config = Qwen2_5OmniTalkerConfig(**talker_config) - self.token2wav_config = Qwen2_5OmniToken2WavConfig(**token2wav_config) - self.enable_audio_output = enable_audio_output - - super().__init__(**kwargs) + super().__post_init__(**kwargs) def get_text_config(self, *args, **kwargs): """ diff --git a/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py index de8e0050be89..e2af2d1e5308 100644 --- a/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py @@ -1076,8 +1076,8 @@ def forward(self, seqlen: int) -> torch.Tensor: class Qwen2_5_VisionPatchEmbed(nn.Module): def __init__( self, - patch_size: int = 14, - temporal_patch_size: int = 2, + patch_size: int | list[int] | tuple[int, int] = 14, + temporal_patch_size: int | list[int] | tuple[int, int] = 2, in_channels: int = 3, embed_dim: int = 1152, ) -> None: diff --git a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py index 43c83784de05..c27da0c1a478 100644 --- a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py @@ -22,12 +22,13 @@ import numpy as np import torch import torch.nn.functional as F +from huggingface_hub.dataclasses import strict from torch import nn from torch.nn import Parameter from ... import initialization as init from ...cache_utils import Cache -from ...configuration_utils import PreTrainedConfig, layer_type_validation +from ...configuration_utils import PreTrainedConfig from ...generation import GenerationMixin from ...modeling_outputs import BaseModelOutputWithPooling, ModelOutput from ...modeling_rope_utils import RopeParameters @@ -66,6 +67,7 @@ @auto_docstring(checkpoint="Qwen/Qwen2.5-Omni-7B") +@strict(accept_kwargs=True) class Qwen2_5OmniVisionEncoderConfig(Qwen2_5_VLVisionConfig): r""" fullatt_block_indexes (`int`, *optional*, defaults to `[7, 15, 23, 31]`): @@ -92,43 +94,11 @@ class Qwen2_5OmniVisionEncoderConfig(Qwen2_5_VLVisionConfig): model_type = "qwen2_5_omni_vision_encoder" - def __init__( - self, - depth=32, - hidden_size=3584, - hidden_act="silu", - intermediate_size=3420, - num_heads=16, - in_channels=3, - patch_size=14, - spatial_merge_size=2, - temporal_patch_size=2, - window_size=112, - out_hidden_size=3584, - fullatt_block_indexes=[7, 15, 23, 31], - initializer_range=0.02, - **kwargs, - ): - super().__init__( - depth, - hidden_size, - hidden_act, - intermediate_size, - num_heads, - in_channels, - patch_size, - spatial_merge_size, - temporal_patch_size, - window_size, - out_hidden_size, - fullatt_block_indexes, - initializer_range=initializer_range, - **kwargs, - ) - del self.tokens_per_second + tokens_per_second = AttributeError() @auto_docstring(checkpoint="Qwen/Qwen2.5-Omni-7B") +@strict(accept_kwargs=True) class Qwen2_5OmniAudioEncoderConfig(Qwen2AudioEncoderConfig): r""" max_source_positions (`int`, *optional*, defaults to 1500): @@ -155,45 +125,13 @@ class Qwen2_5OmniAudioEncoderConfig(Qwen2AudioEncoderConfig): model_type = "qwen2_5_omni_audio_encoder" - def __init__( - self, - num_mel_bins=128, - encoder_layers=32, - encoder_attention_heads=20, - 
encoder_ffn_dim=5120, - d_model=1280, - dropout=0, - attention_dropout=0, - activation_function="gelu", - activation_dropout=0, - scale_embedding=False, - initializer_range=0.02, - max_source_positions=1500, - n_window=100, - output_dim=3584, - **kwargs, - ): - super().__init__( - num_mel_bins, - encoder_layers, - encoder_attention_heads, - encoder_ffn_dim, - d_model, - dropout, - attention_dropout, - activation_function, - activation_dropout, - scale_embedding, - initializer_range, - max_source_positions, - **kwargs, - ) - self.n_window = n_window - self.output_dim = output_dim - del self.encoder_layerdrop + n_window: int = 100 + output_dim: int = 3584 + encoder_layerdrop = AttributeError() @auto_docstring(checkpoint="Qwen/Qwen2.5-Omni-7B") +@strict(accept_kwargs=True) class Qwen2_5OmniTextConfig(PreTrainedConfig): r""" max_window_layers (`int`, *optional*, defaults to 28): @@ -240,58 +178,35 @@ class Qwen2_5OmniTextConfig(PreTrainedConfig): "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), "norm": (["hidden_states"], ["hidden_states"]), } + ignore_keys_at_rope_validation = {"mrope_section"} + + vocab_size: int = 152064 + hidden_size: int = 3584 + intermediate_size: int = 18944 + num_hidden_layers: int = 28 + num_attention_heads: int = 28 + num_key_value_heads: int | None = 4 + hidden_act: str = "silu" + max_position_embeddings: int = 32768 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-6 + use_cache: bool = True + rope_parameters: RopeParameters | dict | None = None + use_sliding_window: bool = False + sliding_window: int | None = 32768 + max_window_layers: int = 28 + layer_types: list[str] | None = None + attention_dropout: float | int = 0.0 + pad_token_id: int | None = None + bos_token_id: int | None = None + eos_token_id: int | list[int] | None = None + tie_word_embeddings: bool = True + + def __post_init__(self, **kwargs): + self.sliding_window = self.sliding_window if self.use_sliding_window else None + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads - def __init__( - self, - vocab_size: int | None = 152064, - hidden_size: int | None = 3584, - intermediate_size: int | None = 18944, - num_hidden_layers: int | None = 28, - num_attention_heads: int | None = 28, - num_key_value_heads: int | None = 4, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 32768, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-6, - use_cache: bool | None = True, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - use_sliding_window: bool | None = False, - sliding_window: int | None = 32768, - max_window_layers: int | None = 28, - layer_types: list[str] | None = None, - attention_dropout: float | None = 0.0, - pad_token_id: int | None = None, - bos_token_id: int | None = None, - eos_token_id: int | None = None, - tie_word_embeddings: bool | None = True, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.use_sliding_window = use_sliding_window - self.sliding_window = sliding_window if self.use_sliding_window else None - self.max_window_layers = max_window_layers - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - - # for backward 
compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_dropout = attention_dropout - - self.layer_types = layer_types if self.layer_types is None: self.layer_types = [ "sliding_attention" @@ -299,16 +214,12 @@ def __init__( else "full_attention" for i in range(self.num_hidden_layers) ] - layer_type_validation(self.layer_types, self.num_hidden_layers) - self.rope_parameters = rope_parameters - super().__init__( - ignore_keys_at_rope_validation={"mrope_section"}, - **kwargs, - ) + super().__post_init__(**kwargs) @auto_docstring(checkpoint="Qwen/Qwen2.5-Omni-7B") +@strict(accept_kwargs=True) class Qwen2_5OmniThinkerConfig(PreTrainedConfig): r""" position_id_per_seconds (`int`, *optional*, defaults to 25): @@ -358,56 +269,41 @@ class Qwen2_5OmniThinkerConfig(PreTrainedConfig): "text_config": Qwen2_5OmniTextConfig, } - def __init__( - self, - audio_config=None, - vision_config=None, - text_config=None, - audio_token_index=151646, - image_token_index=151655, - video_token_index=151656, - position_id_per_seconds=25, - seconds_per_chunk=2, - audio_start_token_id=151647, - audio_end_token_id=151648, - user_token_id=872, - initializer_range=0.02, - tie_word_embeddings=False, - **kwargs, - ): - self.audio_token_index = audio_token_index - self.image_token_index = image_token_index - self.video_token_index = video_token_index - self.user_token_id = user_token_id - self.position_id_per_seconds = position_id_per_seconds - self.seconds_per_chunk = seconds_per_chunk - self.audio_start_token_id = audio_start_token_id - self.audio_end_token_id = audio_end_token_id - self.initializer_range = initializer_range - self.tie_word_embeddings = tie_word_embeddings - - if isinstance(vision_config, dict): - vision_config = Qwen2_5OmniVisionEncoderConfig(**vision_config) - elif vision_config is None: - vision_config = Qwen2_5OmniVisionEncoderConfig() - self.vision_config = vision_config - - if isinstance(audio_config, dict): - audio_config = Qwen2_5OmniAudioEncoderConfig(**audio_config) - elif audio_config is None: - audio_config = Qwen2_5OmniAudioEncoderConfig() - self.audio_config = audio_config - - if isinstance(text_config, dict): - text_config = Qwen2_5OmniTextConfig(**text_config) - elif text_config is None: - text_config = Qwen2_5OmniTextConfig() - self.text_config = text_config - - super().__init__(**kwargs) + audio_config: dict | PreTrainedConfig | None = None + vision_config: dict | PreTrainedConfig | None = None + text_config: dict | PreTrainedConfig | None = None + audio_token_index: int = 151646 + image_token_index: int = 151655 + video_token_index: int = 151656 + position_id_per_seconds: int = 25 + seconds_per_chunk: int = 2 + audio_start_token_id: int = 151647 + audio_end_token_id: int = 151648 + user_token_id: int = 872 + initializer_range: float = 0.02 + tie_word_embeddings: bool = False + + def __post_init__(self, **kwargs): + if isinstance(self.vision_config, dict): + self.vision_config = Qwen2_5OmniVisionEncoderConfig(**self.vision_config) + elif self.vision_config is None: + self.vision_config = Qwen2_5OmniVisionEncoderConfig() + + if isinstance(self.audio_config, dict): + self.audio_config = Qwen2_5OmniAudioEncoderConfig(**self.audio_config) + elif self.audio_config is None: + self.audio_config = Qwen2_5OmniAudioEncoderConfig() + + if 
isinstance(self.text_config, dict): + self.text_config = Qwen2_5OmniTextConfig(**self.text_config) + elif self.text_config is None: + self.text_config = Qwen2_5OmniTextConfig() + + super().__post_init__(**kwargs) @auto_docstring(checkpoint="Qwen/Qwen2.5-Omni-7B") +@strict(accept_kwargs=True) class Qwen2_5OmniTalkerConfig(PreTrainedConfig): r""" tts_text_start_token_id (`int`, *optional*, defaults to 151860): @@ -464,97 +360,53 @@ class Qwen2_5OmniTalkerConfig(PreTrainedConfig): "video_token_id": "video_token_index", "audio_token_id": "audio_token_index", } + ignore_keys_at_rope_validation = {"mrope_section"} + + audio_token_index: int = 151646 + image_token_index: int = 151655 + video_token_index: int = 151656 + vocab_size: int = 8448 + tts_text_start_token_id: int = 151860 + tts_text_end_token_id: int = 151861 + tts_text_pad_token_id: int = 151859 + tts_codec_start_token_id: int = 8293 + tts_codec_end_token_id: int = 8294 + tts_codec_pad_token_id: int = 8292 + tts_codec_mask_token_id: int = 8296 + vision_start_token_id: int = 151652 + vision_end_token_id: int = 151653 + embedding_size: int = 3584 + hidden_size: int = 3584 + intermediate_size: int = 18944 + num_hidden_layers: int = 28 + num_attention_heads: int = 28 + num_key_value_heads: int = 4 + hidden_act: str = "silu" + max_position_embeddings: int = 32768 + rms_norm_eps: float = 1e-06 + head_dim: int = 128 + use_cache: bool = True + tie_word_embeddings: bool = False + use_sliding_window: bool = False + sliding_window: int | None = 32768 + max_window_layers: int = 28 + attention_dropout: float | int = 0.0 + rope_parameters: RopeParameters | dict | None = None + position_id_per_seconds: int = 25 + seconds_per_chunk: int = 2 + audio_start_token_id: int = 151647 + audio_end_token_id: int = 151648 + initializer_range: float = 0.02 + spatial_merge_size: int = 2 + layer_types: list[str] | None = None + pad_token_id: int | None = None + + def __post_init__(self, **kwargs): + self.sliding_window = self.sliding_window if self.use_sliding_window else None + + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads - def __init__( - self, - audio_token_index=151646, - image_token_index=151655, - video_token_index=151656, - vocab_size=8448, - tts_text_start_token_id=151860, - tts_text_end_token_id=151861, - tts_text_pad_token_id=151859, - tts_codec_start_token_id=8293, - tts_codec_end_token_id=8294, - tts_codec_pad_token_id=8292, - tts_codec_mask_token_id=8296, - vision_start_token_id=151652, - vision_end_token_id=151653, - embedding_size=3584, - hidden_size=3584, - intermediate_size=18944, - num_hidden_layers=28, - num_attention_heads=28, - num_key_value_heads=4, - hidden_act="silu", - max_position_embeddings=32768, - rms_norm_eps=1e-06, - head_dim=128, - use_cache=True, - tie_word_embeddings=False, - use_sliding_window=False, - sliding_window=32768, - max_window_layers=28, - attention_dropout=0.0, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - position_id_per_seconds=25, - seconds_per_chunk=2, - audio_start_token_id=151647, - audio_end_token_id=151648, - initializer_range=0.02, - spatial_merge_size=2, - layer_types=None, - pad_token_id: int | None = None, - **kwargs, - ): - self.audio_token_index = audio_token_index - self.image_token_index = image_token_index - self.video_token_index = video_token_index - - self.tts_text_start_token_id = tts_text_start_token_id - self.tts_text_end_token_id = tts_text_end_token_id - self.tts_text_pad_token_id = tts_text_pad_token_id - 
self.tts_codec_start_token_id = tts_codec_start_token_id - self.tts_codec_end_token_id = tts_codec_end_token_id - self.tts_codec_pad_token_id = tts_codec_pad_token_id - - self.tts_codec_mask_token_id = tts_codec_mask_token_id - - self.vision_start_token_id = vision_start_token_id - self.vision_end_token_id = vision_end_token_id - - self.vocab_size = vocab_size - self.head_dim = head_dim - self.embedding_size = embedding_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.use_sliding_window = use_sliding_window - self.sliding_window = sliding_window if self.use_sliding_window else None - self.max_window_layers = max_window_layers - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_dropout = attention_dropout - self.position_id_per_seconds = position_id_per_seconds # zf - self.seconds_per_chunk = seconds_per_chunk # zf - self.audio_start_token_id = audio_start_token_id # zf - self.audio_end_token_id = audio_end_token_id # zf - self.pad_token_id = pad_token_id - - self.initializer_range = initializer_range - self.spatial_merge_size = spatial_merge_size - self.tie_word_embeddings = tie_word_embeddings - - self.layer_types = layer_types if self.layer_types is None: self.layer_types = [ "sliding_attention" @@ -562,13 +414,12 @@ def __init__( else "full_attention" for i in range(self.num_hidden_layers) ] - layer_type_validation(self.layer_types, self.num_hidden_layers) - self.rope_parameters = rope_parameters - super().__init__(ignore_keys_at_rope_validation={"mrope_section"}, **kwargs) + super().__post_init__(**kwargs) @auto_docstring(checkpoint="Qwen/Qwen2.5-Omni-7B") +@strict(accept_kwargs=True) class Qwen2_5OmniDiTConfig(PreTrainedConfig): r""" ff_mult (`int`, *optional*, defaults to 2): @@ -607,61 +458,33 @@ class Qwen2_5OmniDiTConfig(PreTrainedConfig): model_type = "qwen2_5_omni_dit" - def __init__( - self, - hidden_size=1024, - num_hidden_layers=22, - num_attention_heads=16, - ff_mult=2, - emb_dim=512, - head_dim=64, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - max_position_embeddings=32768, - block_size=24, - look_ahead_layers=[10], - look_backward_layers=[0, 20], - repeats=2, - num_embeds=8193, - mel_dim=80, - dropout=0.1, - enc_emb_dim=192, - enc_dim=128, - enc_channels=[256, 256, 256, 256, 768], - enc_kernel_sizes=[5, 3, 3, 3, 1], - enc_dilations=[1, 2, 3, 4, 1], - enc_attention_channels=64, - enc_res2net_scale=2, - enc_se_channels=64, - **kwargs, - ): - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.ff_mult = ff_mult - self.emb_dim = emb_dim - self.head_dim = head_dim - self.max_position_embeddings = max_position_embeddings - self.block_size = block_size - self.look_ahead_layers = look_ahead_layers - self.look_backward_layers = look_backward_layers - self.repeats = repeats - self.num_embeds = num_embeds - self.mel_dim = mel_dim - self.dropout = dropout - self.enc_emb_dim = enc_emb_dim - self.enc_dim = enc_dim - self.enc_channels = enc_channels - self.enc_kernel_sizes = enc_kernel_sizes - self.enc_dilations = enc_dilations - self.enc_attention_channels = 
enc_attention_channels - self.enc_res2net_scale = enc_res2net_scale - self.enc_se_channels = enc_se_channels - self.rope_parameters = rope_parameters - - super().__init__(**kwargs) + hidden_size: int = 1024 + num_hidden_layers: int = 22 + num_attention_heads: int = 16 + ff_mult: int = 2 + emb_dim: int = 512 + head_dim: int = 64 + rope_parameters: RopeParameters | dict | None = None + max_position_embeddings: int = 32768 + block_size: int = 24 + look_ahead_layers: list[int] | tuple[int, ...] = (10,) + look_backward_layers: list[int] | tuple[int, ...] = (0, 20) + repeats: int = 2 + num_embeds: int = 8193 + mel_dim: int = 80 + dropout: float | int = 0.1 + enc_emb_dim: int = 192 + enc_dim: int = 128 + enc_channels: list[int] | tuple[int, ...] = (256, 256, 256, 256, 768) + enc_kernel_sizes: list[int] | tuple[int, ...] = (5, 3, 3, 3, 1) + enc_dilations: list[int] | tuple[int, ...] = (1, 2, 3, 4, 1) + enc_attention_channels: int = 64 + enc_res2net_scale: int = 2 + enc_se_channels: int = 64 @auto_docstring(checkpoint="Qwen/Qwen2.5-Omni-7B") +@strict(accept_kwargs=True) class Qwen2_5OmniBigVGANConfig(PreTrainedConfig): r""" mel_dim (`int`, *optional*, defaults to 80): @@ -680,26 +503,16 @@ class Qwen2_5OmniBigVGANConfig(PreTrainedConfig): model_type = "qwen2_5_omni_bigvgan" - def __init__( - self, - mel_dim=80, - upsample_initial_channel=1536, - resblock_kernel_sizes=[3, 7, 11], - resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]], - upsample_rates=[5, 3, 2, 2, 2, 2], - upsample_kernel_sizes=[11, 7, 4, 4, 4, 4], - **kwargs, - ): - self.mel_dim = mel_dim - self.upsample_initial_channel = upsample_initial_channel - self.resblock_kernel_sizes = resblock_kernel_sizes - self.resblock_dilation_sizes = resblock_dilation_sizes - self.upsample_rates = upsample_rates - self.upsample_kernel_sizes = upsample_kernel_sizes - super().__init__(**kwargs) + mel_dim: int = 80 + upsample_initial_channel: int = 1536 + resblock_kernel_sizes: list[int] | tuple[int, ...] = (3, 7, 11) + resblock_dilation_sizes: list | tuple = ((1, 3, 5), (1, 3, 5), (1, 3, 5)) + upsample_rates: list[int] | tuple[int, ...] = (5, 3, 2, 2, 2, 2) + upsample_kernel_sizes: list[int] | tuple[int, ...] 
= (11, 7, 4, 4, 4, 4) @auto_docstring(checkpoint="Qwen/Qwen2.5-Omni-7B") +@strict(accept_kwargs=True) class Qwen2_5OmniToken2WavConfig(PreTrainedConfig): r""" dit_config ([`DiT_Args`], *optional*): @@ -743,17 +556,25 @@ class Qwen2_5OmniToken2WavConfig(PreTrainedConfig): "bigvgan_config": Qwen2_5OmniBigVGANConfig, } - def __init__(self, dit_config=None, bigvgan_config=None, **kwargs): - if dit_config is None: - dit_config = {} - if bigvgan_config is None: - bigvgan_config = {} - self.dit_config = Qwen2_5OmniDiTConfig(**dit_config) - self.bigvgan_config = Qwen2_5OmniBigVGANConfig(**bigvgan_config) - super().__init__(**kwargs) + dit_config: dict | PreTrainedConfig | None = None + bigvgan_config: dict | PreTrainedConfig | None = None + + def __post_init__(self, **kwargs): + if self.dit_config is None: + self.dit_config = Qwen2_5OmniDiTConfig() + elif isinstance(self.dit_config, dict): + self.dit_config = Qwen2_5OmniDiTConfig(**self.dit_config) + + if self.bigvgan_config is None: + self.bigvgan_config = Qwen2_5OmniBigVGANConfig() + elif isinstance(self.bigvgan_config, dict): + self.bigvgan_config = Qwen2_5OmniBigVGANConfig(**self.bigvgan_config) + + super().__post_init__(**kwargs) @auto_docstring(checkpoint="Qwen/Qwen2.5-Omni-7B") +@strict(accept_kwargs=True) class Qwen2_5OmniConfig(PreTrainedConfig): """ thinker_config (`dict`, *optional*): Configuration of the underlying thinker sub-model. @@ -798,32 +619,31 @@ class Qwen2_5OmniConfig(PreTrainedConfig): "token2wav_config": Qwen2_5OmniToken2WavConfig, } - def __init__( - self, - thinker_config=None, - talker_config=None, - token2wav_config=None, - enable_audio_output: bool = True, - **kwargs, - ): - if thinker_config is None: - thinker_config = {} + thinker_config: dict | PreTrainedConfig | None = None + talker_config: dict | PreTrainedConfig | None = None + token2wav_config: dict | PreTrainedConfig | None = None + enable_audio_output: bool = True + + def __post_init__(self, **kwargs): + if self.thinker_config is None: + self.thinker_config = Qwen2_5OmniThinkerConfig() logger.info("thinker_config is None. Initializing thinker model with default values") + elif isinstance(self.thinker_config, dict): + self.thinker_config = Qwen2_5OmniThinkerConfig(**self.thinker_config) - if talker_config is None: - talker_config = {} + if self.talker_config is None: + self.talker_config = Qwen2_5OmniTalkerConfig() logger.info("talker_config is None. Initializing talker model with default values") + elif isinstance(self.talker_config, dict): + self.talker_config = Qwen2_5OmniTalkerConfig(**self.talker_config) - if token2wav_config is None: - token2wav_config = {} + if self.token2wav_config is None: + self.token2wav_config = Qwen2_5OmniToken2WavConfig() logger.info("token2wav_config is None. 
Initializing token2wav model with default values") + elif isinstance(self.token2wav_config, dict): + self.token2wav_config = Qwen2_5OmniToken2WavConfig(**self.token2wav_config) - self.thinker_config = Qwen2_5OmniThinkerConfig(**thinker_config) - self.talker_config = Qwen2_5OmniTalkerConfig(**talker_config) - self.token2wav_config = Qwen2_5OmniToken2WavConfig(**token2wav_config) - self.enable_audio_output = enable_audio_output - - super().__init__(**kwargs) + super().__post_init__(**kwargs) def get_text_config(self, *args, **kwargs): """ diff --git a/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py index e05bb859064e..18e7598f9eda 100644 --- a/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py @@ -24,12 +24,15 @@ # limitations under the License. import inspect -from ...configuration_utils import PreTrainedConfig, layer_type_validation +from huggingface_hub.dataclasses import strict + +from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring @auto_docstring(checkpoint="Qwen/Qwen2-VL-7B-Instruct") +@strict(accept_kwargs=True) class Qwen2_5_VLVisionConfig(PreTrainedConfig): r""" tokens_per_second (`int`, *optional*, defaults to 41): @@ -45,43 +48,24 @@ class Qwen2_5_VLVisionConfig(PreTrainedConfig): model_type = "qwen2_5_vl" base_config_key = "vision_config" - def __init__( - self, - depth=32, - hidden_size=3584, - hidden_act="silu", - intermediate_size=3420, - num_heads=16, - in_channels=3, - patch_size=14, - spatial_merge_size=2, - temporal_patch_size=2, - tokens_per_second=4, - window_size=112, - out_hidden_size=3584, - fullatt_block_indexes=[7, 15, 23, 31], - initializer_range=0.02, - **kwargs, - ): - super().__init__(**kwargs) - - self.depth = depth - self.hidden_size = hidden_size - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.num_heads = num_heads - self.in_channels = in_channels - self.patch_size = patch_size - self.spatial_merge_size = spatial_merge_size - self.temporal_patch_size = temporal_patch_size - self.tokens_per_second = tokens_per_second - self.window_size = window_size - self.fullatt_block_indexes = fullatt_block_indexes - self.out_hidden_size = out_hidden_size - self.initializer_range = initializer_range + depth: int = 32 + hidden_size: int = 3584 + hidden_act: str = "silu" + intermediate_size: int = 3420 + num_heads: int = 16 + in_channels: int = 3 + patch_size: int | list[int] | tuple[int, int] = 14 + spatial_merge_size: int = 2 + temporal_patch_size: int | list[int] | tuple[int, int] = 2 + tokens_per_second: int = 4 + window_size: int = 112 + out_hidden_size: int = 3584 + fullatt_block_indexes: list[int] | tuple[int, ...] 
= (7, 15, 23, 31) + initializer_range: float = 0.02 @auto_docstring(checkpoint="Qwen/Qwen2-VL-7B-Instruct") +@strict(accept_kwargs=True) class Qwen2_5_VLTextConfig(PreTrainedConfig): r""" max_window_layers (`int`, *optional*, defaults to 80): @@ -120,53 +104,36 @@ class Qwen2_5_VLTextConfig(PreTrainedConfig): "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), "norm": (["hidden_states"], ["hidden_states"]), } - - def __init__( - self, - vocab_size: int | None = 152064, - hidden_size: int | None = 8192, - intermediate_size: int | None = 29568, - num_hidden_layers: int | None = 80, - num_attention_heads: int | None = 64, - num_key_value_heads: int | None = 8, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 32768, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-05, - use_cache: bool | None = True, - use_sliding_window: bool | None = False, - sliding_window: int | None = 4096, - max_window_layers: int | None = 80, - layer_types: list[str] | None = None, - attention_dropout: float | None = 0.0, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - bos_token_id: int | None = 151643, - eos_token_id: int | None = 151645, - pad_token_id: int | None = None, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.use_sliding_window = use_sliding_window - self.sliding_window = sliding_window if self.use_sliding_window else None - self.max_window_layers = max_window_layers + ignore_keys_at_rope_validation = {"mrope_section"} + + vocab_size: int = 152064 + hidden_size: int = 8192 + intermediate_size: int = 29568 + num_hidden_layers: int = 80 + num_attention_heads: int = 64 + num_key_value_heads: int | None = 8 + hidden_act: str = "silu" + max_position_embeddings: int = 32768 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-05 + use_cache: bool = True + use_sliding_window: bool | None = False + sliding_window: int | None = 4096 + max_window_layers: int | None = 80 + layer_types: list[str] | None = None + attention_dropout: float | int | None = 0.0 + rope_parameters: RopeParameters | dict | None = None + bos_token_id: int | None = 151643 + eos_token_id: int | list[int] | None = 151645 + pad_token_id: int | None = None + + def __post_init__(self, **kwargs): + self.sliding_window = self.sliding_window if self.use_sliding_window else None # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_dropout = attention_dropout - - self.layer_types = layer_types if self.layer_types is None: self.layer_types = [ "sliding_attention" @@ -174,18 +141,10 @@ def __init__( else "full_attention" for i in range(self.num_hidden_layers) ] - layer_type_validation(self.layer_types, self.num_hidden_layers) - - self.rope_parameters = rope_parameters - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - super().__init__( - ignore_keys_at_rope_validation={"mrope_section"}, - **kwargs, - ) - - def 
convert_rope_params_to_dict(self, ignore_keys_at_rope_validation: set | None = None, **kwargs): + + super().__post_init__(**kwargs) + + def convert_rope_params_to_dict(self, **kwargs): rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or self.rope_parameters self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else {} @@ -195,11 +154,11 @@ def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation: set | None if self.rope_parameters.get("rope_type", self.rope_parameters.get("type")) == "mrope": self.rope_parameters["rope_type"] = "default" self.standardize_rope_params() - self.validate_rope(ignore_keys=ignore_keys_at_rope_validation) return kwargs @auto_docstring(checkpoint="Qwen/Qwen2-VL-7B-Instruct") +@strict(accept_kwargs=True) class Qwen2_5_VLConfig(PreTrainedConfig): r""" Example: @@ -221,38 +180,33 @@ class Qwen2_5_VLConfig(PreTrainedConfig): sub_configs = {"vision_config": Qwen2_5_VLVisionConfig, "text_config": Qwen2_5_VLTextConfig} keys_to_ignore_at_inference = ["past_key_values"] - def __init__( - self, - text_config=None, - vision_config=None, - image_token_id=151655, - video_token_id=151656, - vision_start_token_id=151652, - vision_end_token_id=151653, - tie_word_embeddings=False, - **kwargs, - ): - if isinstance(vision_config, dict): - self.vision_config = self.sub_configs["vision_config"](**vision_config) - elif vision_config is None: + text_config: dict | PreTrainedConfig | None = None + vision_config: dict | PreTrainedConfig | None = None + image_token_id: int = 151655 + video_token_id: int = 151656 + vision_start_token_id: int = 151652 + vision_end_token_id: int = 151653 + tie_word_embeddings: bool = False + + def __post_init__(self, **kwargs): + if isinstance(self.vision_config, dict): + self.vision_config = self.sub_configs["vision_config"](**self.vision_config) + elif self.vision_config is None: self.vision_config = self.sub_configs["vision_config"]() - if isinstance(text_config, dict): - self.text_config = self.sub_configs["text_config"](**text_config) - elif text_config is None: + # Hub configs are saved as flat dicts so we pop some of kwargs to init `TextConfig` + text_params = inspect.signature(self.sub_configs["text_config"].__init__).parameters.keys() + text_params = list(text_params) + ["rope_parameters", "rope_scaling", "rope_theta"] + text_kwargs = {key: kwargs.pop(key) for key in text_params if key in kwargs} + + if isinstance(self.text_config, dict): + self.text_config = self.sub_configs["text_config"](**self.text_config) + elif self.text_config is None: # Hub configs are saved as flat dicts so we pop some of kwargs to init `TextConfig` - text_params = inspect.signature(self.sub_configs["text_config"].__init__).parameters.keys() - text_params = list(text_params) + ["rope_scaling", "rope_theta"] - text_config = {key: kwargs.pop(key) for key in text_params if key in kwargs} - text_config["dtype"] = kwargs.get("torch_dtype", kwargs.get("dtype")) # don't pop the dtype - self.text_config = self.sub_configs["text_config"](**text_config) - - self.image_token_id = image_token_id - self.video_token_id = video_token_id - self.vision_start_token_id = vision_start_token_id - self.vision_end_token_id = vision_end_token_id - self.tie_word_embeddings = tie_word_embeddings - super().__init__(**kwargs) + text_kwargs["dtype"] = kwargs.get("torch_dtype", kwargs.get("dtype")) # don't pop the dtype + self.text_config = self.sub_configs["text_config"](**text_kwargs) + + super().__post_init__(**kwargs) __all__ = 
["Qwen2_5_VLConfig", "Qwen2_5_VLTextConfig"] diff --git a/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py index 6e3acc5466e7..36e7fc7161d0 100644 --- a/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py @@ -91,8 +91,8 @@ def forward(self, hidden_state): class Qwen2_5_VisionPatchEmbed(nn.Module): def __init__( self, - patch_size: int = 14, - temporal_patch_size: int = 2, + patch_size: int | list[int] | tuple[int, int] = 14, + temporal_patch_size: int | list[int] | tuple[int, int] = 2, in_channels: int = 3, embed_dim: int = 1152, ) -> None: diff --git a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py index 2638476cf979..b84e1b14ce86 100644 --- a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py @@ -24,6 +24,7 @@ import torch import torch.nn as nn import torch.nn.functional as F +from huggingface_hub.dataclasses import strict from ... import initialization as init from ...activations import ACT2FN @@ -61,6 +62,7 @@ @auto_docstring(checkpoint="Qwen/Qwen2-VL-7B-Instruct") +@strict(accept_kwargs=True) class Qwen2_5_VLVisionConfig(PreTrainedConfig): r""" tokens_per_second (`int`, *optional*, defaults to 41): @@ -76,49 +78,28 @@ class Qwen2_5_VLVisionConfig(PreTrainedConfig): model_type = "qwen2_5_vl" base_config_key = "vision_config" - def __init__( - self, - depth=32, - hidden_size=3584, - hidden_act="silu", - intermediate_size=3420, - num_heads=16, - in_channels=3, - patch_size=14, - spatial_merge_size=2, - temporal_patch_size=2, - tokens_per_second=4, - window_size=112, - out_hidden_size=3584, - fullatt_block_indexes=[7, 15, 23, 31], - initializer_range=0.02, - **kwargs, - ): - super().__init__(**kwargs) - - self.depth = depth - self.hidden_size = hidden_size - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.num_heads = num_heads - self.in_channels = in_channels - self.patch_size = patch_size - self.spatial_merge_size = spatial_merge_size - self.temporal_patch_size = temporal_patch_size - self.tokens_per_second = tokens_per_second - self.window_size = window_size - self.fullatt_block_indexes = fullatt_block_indexes - self.out_hidden_size = out_hidden_size - self.initializer_range = initializer_range + depth: int = 32 + hidden_size: int = 3584 + hidden_act: str = "silu" + intermediate_size: int = 3420 + num_heads: int = 16 + in_channels: int = 3 + patch_size: int | list[int] | tuple[int, int] = 14 + spatial_merge_size: int = 2 + temporal_patch_size: int | list[int] | tuple[int, int] = 2 + tokens_per_second: int = 4 + window_size: int = 112 + out_hidden_size: int = 3584 + fullatt_block_indexes: list[int] | tuple[int, ...] 
= (7, 15, 23, 31) + initializer_range: float = 0.02 class Qwen2_5_VLTextConfig(Qwen2VLTextConfig): - model_type = "qwen2_5_vl_text" + pass class Qwen2_5_VLConfig(Qwen2VLConfig): - model_type = "qwen2_5_vl" - sub_configs = {"vision_config": Qwen2_5_VLVisionConfig, "text_config": Qwen2_5_VLTextConfig} + pass class Qwen2_5_VLRMSNorm(LlamaRMSNorm): diff --git a/src/transformers/models/qwen2_audio/configuration_qwen2_audio.py b/src/transformers/models/qwen2_audio/configuration_qwen2_audio.py index 91ba383b5e96..657b45bd81de 100644 --- a/src/transformers/models/qwen2_audio/configuration_qwen2_audio.py +++ b/src/transformers/models/qwen2_audio/configuration_qwen2_audio.py @@ -12,15 +12,15 @@ # limitations under the License. """Qwen2Audio model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging +from ...utils import auto_docstring from ..auto import CONFIG_MAPPING, AutoConfig -logger = logging.get_logger(__name__) - - @auto_docstring(checkpoint="Qwen/Qwen2-Audio-7B") +@strict(accept_kwargs=True) class Qwen2AudioEncoderConfig(PreTrainedConfig): r""" max_source_positions (`int`, *optional*, defaults to 1500): @@ -42,43 +42,25 @@ class Qwen2AudioEncoderConfig(PreTrainedConfig): ```""" model_type = "qwen2_audio_encoder" - - def __init__( - self, - num_mel_bins=128, - encoder_layers=32, - encoder_attention_heads=20, - encoder_ffn_dim=5120, - encoder_layerdrop=0.0, - d_model=1280, - dropout=0.0, - attention_dropout=0.0, - activation_function="gelu", - activation_dropout=0.0, - scale_embedding=False, - initializer_range=0.02, - max_source_positions=1500, - **kwargs, - ): - super().__init__(**kwargs) - - self.num_mel_bins = num_mel_bins - self.d_model = d_model - self.encoder_layers = encoder_layers - self.encoder_attention_heads = encoder_attention_heads - self.encoder_ffn_dim = encoder_ffn_dim - self.dropout = dropout - self.attention_dropout = attention_dropout - self.activation_function = activation_function - self.activation_dropout = activation_dropout - self.encoder_layerdrop = encoder_layerdrop - self.num_hidden_layers = encoder_layers - self.initializer_range = initializer_range - self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True - self.max_source_positions = max_source_positions + attribute_map = {"num_hidden_layers": "encoder_layers"} + + num_mel_bins: int = 128 + encoder_layers: int = 32 + encoder_attention_heads: int = 20 + encoder_ffn_dim: int = 5120 + encoder_layerdrop: float | int = 0.0 + d_model: int = 1280 + dropout: float | int = 0.0 + attention_dropout: float | int = 0.0 + activation_function: str = "gelu" + activation_dropout: float | int = 0.0 + scale_embedding: bool = False + initializer_range: float = 0.02 + max_source_positions: int = 1500 @auto_docstring(checkpoint="Qwen/Qwen2-Audio-7B") +@strict(accept_kwargs=True) class Qwen2AudioConfig(PreTrainedConfig): r""" Example: @@ -108,20 +90,16 @@ class Qwen2AudioConfig(PreTrainedConfig): } sub_configs = {"text_config": AutoConfig, "audio_config": AutoConfig} - def __init__( - self, - audio_config=None, - text_config=None, - audio_token_index=151646, - **kwargs, - ): - self.audio_token_index = audio_token_index - - if isinstance(audio_config, dict): - audio_config["model_type"] = audio_config.get("model_type", "qwen2_audio_encoder") - audio_config = CONFIG_MAPPING[audio_config["model_type"]](**audio_config) - elif audio_config is None: - audio_config = CONFIG_MAPPING["qwen2_audio_encoder"]( 
+ audio_config: dict | PreTrainedConfig | None = None + text_config: dict | PreTrainedConfig | None = None + audio_token_index: int = 151646 + + def __post_init__(self, **kwargs): + if isinstance(self.audio_config, dict): + self.audio_config["model_type"] = self.audio_config.get("model_type", "qwen2_audio_encoder") + self.audio_config = CONFIG_MAPPING[self.audio_config["model_type"]](**self.audio_config) + elif self.audio_config is None: + self.audio_config = CONFIG_MAPPING["qwen2_audio_encoder"]( d_model=1280, encoder_attention_heads=20, encoder_ffn_dim=5120, @@ -133,17 +111,13 @@ def __init__( activation_function="gelu", ) - self.audio_config = audio_config - - if isinstance(text_config, dict): - text_config["model_type"] = text_config.get("model_type", "qwen2") - text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) - elif text_config is None: - text_config = CONFIG_MAPPING["qwen2"]() - - self.text_config = text_config + if isinstance(self.text_config, dict): + self.text_config["model_type"] = self.text_config.get("model_type", "qwen2") + self.text_config = CONFIG_MAPPING[self.text_config["model_type"]](**self.text_config) + elif self.text_config is None: + self.text_config = CONFIG_MAPPING["qwen2"]() - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["Qwen2AudioConfig", "Qwen2AudioEncoderConfig"] diff --git a/src/transformers/models/qwen2_moe/configuration_qwen2_moe.py b/src/transformers/models/qwen2_moe/configuration_qwen2_moe.py index e0b2518a3833..a96bbd40b617 100644 --- a/src/transformers/models/qwen2_moe/configuration_qwen2_moe.py +++ b/src/transformers/models/qwen2_moe/configuration_qwen2_moe.py @@ -13,15 +13,15 @@ # limitations under the License. """Qwen2MoE model configuration""" -from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...modeling_rope_utils import RopeParameters +from ...utils import auto_docstring @auto_docstring(checkpoint="Qwen/Qwen1.5-MoE-A2.7B") +@strict(accept_kwargs=True) class Qwen2MoeConfig(PreTrainedConfig): r""" max_window_layers (`int`, *optional*, defaults to 28): @@ -70,86 +70,50 @@ class Qwen2MoeConfig(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size: int | None = 151936, - hidden_size: int | None = 2048, - intermediate_size: int | None = 5632, - num_hidden_layers: int | None = 24, - num_attention_heads: int | None = 16, - num_key_value_heads: int | None = 16, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 32768, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-6, - use_cache: bool | None = True, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - use_sliding_window: bool | None = False, - sliding_window: int | None = 4096, - max_window_layers: int | None = 28, - attention_dropout: float | None = 0.0, - decoder_sparse_step: int | None = 1, - moe_intermediate_size: int | None = 1408, - shared_expert_intermediate_size: int | None = 5632, - num_experts_per_tok: int | None = 4, - num_experts: int | None = 60, - norm_topk_prob: bool | None = False, - output_router_logits: bool | None = False, - router_aux_loss_coef: float | None = 0.001, - mlp_only_layers: bool | None = 
None, - qkv_bias: bool | None = True, - layer_types: list[str] | None = None, - pad_token_id: int | None = None, - bos_token_id: int | None = None, - eos_token_id: int | None = None, - **kwargs, - ): - self.layer_types = layer_types - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.use_sliding_window = use_sliding_window - self.sliding_window = sliding_window if use_sliding_window else 0 - self.max_window_layers = max_window_layers - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_dropout = attention_dropout - - # MoE arguments - self.decoder_sparse_step = decoder_sparse_step - self.moe_intermediate_size = moe_intermediate_size - self.shared_expert_intermediate_size = shared_expert_intermediate_size - self.num_experts_per_tok = num_experts_per_tok - self.num_experts = num_experts - self.norm_topk_prob = norm_topk_prob - self.output_router_logits = output_router_logits - self.router_aux_loss_coef = router_aux_loss_coef - self.mlp_only_layers = [] if mlp_only_layers is None else mlp_only_layers - self.qkv_bias = qkv_bias + vocab_size: int = 151936 + hidden_size: int = 2048 + intermediate_size: int = 5632 + num_hidden_layers: int = 24 + num_attention_heads: int = 16 + num_key_value_heads: int | None = 16 + hidden_act: str = "silu" + max_position_embeddings: int = 32768 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-6 + use_cache: bool = True + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + use_sliding_window: bool = False + sliding_window: int | None = 4096 + max_window_layers: int = 28 + attention_dropout: float | int = 0.0 + decoder_sparse_step: int = 1 + moe_intermediate_size: int = 1408 + shared_expert_intermediate_size: int = 5632 + num_experts_per_tok: int = 4 + num_experts: int = 60 + norm_topk_prob: bool = False + output_router_logits: bool = False + router_aux_loss_coef: float = 0.001 + mlp_only_layers: list[int] | None = None + qkv_bias: bool = True + layer_types: list[str] | None = None + pad_token_id: int | None = None + bos_token_id: int | None = None + eos_token_id: int | list[int] | None = None + + def __post_init__(self, **kwargs): + self.mlp_only_layers = [] if self.mlp_only_layers is None else self.mlp_only_layers + self.sliding_window = self.sliding_window if self.use_sliding_window else 0 if self.layer_types is None: self.layer_types = [ "sliding_attention" - if bool((i + 1) % 2) and i < self.max_window_layers and use_sliding_window + if bool((i + 1) % 2) and i < self.max_window_layers and self.use_sliding_window else "full_attention" for i in range(self.num_hidden_layers) ] - layer_type_validation(self.layer_types) - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - - self.rope_parameters = rope_parameters - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["Qwen2MoeConfig"] diff --git a/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py b/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py index afc8b1102edd..c1390f2d5803 100644 --- a/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py +++ 
b/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py @@ -15,50 +15,34 @@ import inspect -from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...modeling_rope_utils import RopeParameters +from ...utils import auto_docstring @auto_docstring(checkpoint="Qwen/Qwen2-VL-7B-Instruct") +@strict(accept_kwargs=True) class Qwen2VLVisionConfig(PreTrainedConfig): model_type = "qwen2_vl" base_config_key = "vision_config" - def __init__( - self, - depth=32, - embed_dim=1280, - hidden_size=3584, - hidden_act="quick_gelu", - mlp_ratio=4, - num_heads=16, - in_channels=3, - patch_size=14, - spatial_merge_size=2, - temporal_patch_size=2, - initializer_range=0.02, - **kwargs, - ): - super().__init__(**kwargs) - - self.depth = depth - self.embed_dim = embed_dim - self.hidden_size = hidden_size - self.hidden_act = hidden_act - self.mlp_ratio = mlp_ratio - self.num_heads = num_heads - self.in_channels = in_channels - self.patch_size = patch_size - self.spatial_merge_size = spatial_merge_size - self.temporal_patch_size = temporal_patch_size - self.initializer_range = initializer_range + depth: int = 32 + embed_dim: int = 1280 + hidden_size: int = 3584 + hidden_act: str = "quick_gelu" + mlp_ratio: int = 4 + num_heads: int = 16 + in_channels: int = 3 + patch_size: int | list[int] | tuple[int, int] = 14 + spatial_merge_size: int = 2 + temporal_patch_size: int | list[int] | tuple[int, int] = 2 + initializer_range: float = 0.02 @auto_docstring(checkpoint="Qwen/Qwen2-VL-7B-Instruct") +@strict(accept_kwargs=True) class Qwen2VLTextConfig(PreTrainedConfig): r""" max_window_layers (`int`, *optional*, defaults to 80): @@ -97,53 +81,36 @@ class Qwen2VLTextConfig(PreTrainedConfig): "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), "norm": (["hidden_states"], ["hidden_states"]), } - - def __init__( - self, - vocab_size: int | None = 152064, - hidden_size: int | None = 8192, - intermediate_size: int | None = 29568, - num_hidden_layers: int | None = 80, - num_attention_heads: int | None = 64, - num_key_value_heads: int | None = 8, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 32768, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-05, - use_cache: bool | None = True, - use_sliding_window: bool | None = False, - sliding_window: int | None = 4096, - max_window_layers: int | None = 80, - layer_types: list[str] | None = None, - attention_dropout: float | None = 0.0, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - bos_token_id: int | None = 151643, - eos_token_id: int | None = 151645, - pad_token_id: int | None = None, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.use_sliding_window = use_sliding_window - self.sliding_window = sliding_window if self.use_sliding_window else None - self.max_window_layers = max_window_layers + ignore_keys_at_rope_validation = {"mrope_section"} + + vocab_size: int = 152064 + hidden_size: int = 8192 + intermediate_size: int = 29568 + num_hidden_layers: int = 80 + num_attention_heads: int = 64 + 
num_key_value_heads: int | None = 8 + hidden_act: str = "silu" + max_position_embeddings: int = 32768 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-05 + use_cache: bool = True + use_sliding_window: bool | None = False + sliding_window: int | None = 4096 + max_window_layers: int | None = 80 + layer_types: list[str] | None = None + attention_dropout: float | int | None = 0.0 + rope_parameters: RopeParameters | dict | None = None + bos_token_id: int | None = 151643 + eos_token_id: int | list[int] | None = 151645 + pad_token_id: int | None = None + + def __post_init__(self, **kwargs): + self.sliding_window = self.sliding_window if self.use_sliding_window else None # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_dropout = attention_dropout + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads - self.layer_types = layer_types if self.layer_types is None: self.layer_types = [ "sliding_attention" @@ -151,18 +118,10 @@ def __init__( else "full_attention" for i in range(self.num_hidden_layers) ] - layer_type_validation(self.layer_types, self.num_hidden_layers) - - self.rope_parameters = rope_parameters - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - super().__init__( - ignore_keys_at_rope_validation={"mrope_section"}, - **kwargs, - ) - - def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation: set | None = None, **kwargs): + + super().__post_init__(**kwargs) + + def convert_rope_params_to_dict(self, **kwargs): rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or self.rope_parameters self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else {} @@ -172,11 +131,11 @@ def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation: set | None if self.rope_parameters.get("rope_type", self.rope_parameters.get("type")) == "mrope": self.rope_parameters["rope_type"] = "default" self.standardize_rope_params() - self.validate_rope(ignore_keys=ignore_keys_at_rope_validation) return kwargs @auto_docstring(checkpoint="Qwen/Qwen2-VL-7B-Instruct") +@strict(accept_kwargs=True) class Qwen2VLConfig(PreTrainedConfig): r""" Example: @@ -198,38 +157,33 @@ class Qwen2VLConfig(PreTrainedConfig): sub_configs = {"vision_config": Qwen2VLVisionConfig, "text_config": Qwen2VLTextConfig} keys_to_ignore_at_inference = ["past_key_values"] - def __init__( - self, - text_config=None, - vision_config=None, - image_token_id=151655, - video_token_id=151656, - vision_start_token_id=151652, - vision_end_token_id=151653, - tie_word_embeddings=False, - **kwargs, - ): - if isinstance(vision_config, dict): - self.vision_config = self.sub_configs["vision_config"](**vision_config) - elif vision_config is None: + text_config: dict | PreTrainedConfig | None = None + vision_config: dict | PreTrainedConfig | None = None + image_token_id: int = 151655 + video_token_id: int = 151656 + vision_start_token_id: int = 151652 + vision_end_token_id: int = 151653 + tie_word_embeddings: bool = False + + def __post_init__(self, **kwargs): + if isinstance(self.vision_config, dict): + self.vision_config = self.sub_configs["vision_config"](**self.vision_config) + elif self.vision_config is None: 
self.vision_config = self.sub_configs["vision_config"]() - if isinstance(text_config, dict): - self.text_config = self.sub_configs["text_config"](**text_config) - elif text_config is None: + # Hub configs are saved as flat dicts so we pop some of kwargs to init `TextConfig` + text_params = inspect.signature(self.sub_configs["text_config"].__init__).parameters.keys() + text_params = list(text_params) + ["rope_parameters", "rope_scaling", "rope_theta"] + text_kwargs = {key: kwargs.pop(key) for key in text_params if key in kwargs} + + if isinstance(self.text_config, dict): + self.text_config = self.sub_configs["text_config"](**self.text_config) + elif self.text_config is None: # Hub configs are saved as flat dicts so we pop some of kwargs to init `TextConfig` - text_params = inspect.signature(self.sub_configs["text_config"].__init__).parameters.keys() - text_params = list(text_params) + ["rope_scaling", "rope_theta"] - text_config = {key: kwargs.pop(key) for key in text_params if key in kwargs} - text_config["dtype"] = kwargs.get("torch_dtype", kwargs.get("dtype")) # don't pop the dtype - self.text_config = self.sub_configs["text_config"](**text_config) - - self.image_token_id = image_token_id - self.video_token_id = video_token_id - self.vision_start_token_id = vision_start_token_id - self.vision_end_token_id = vision_end_token_id - self.tie_word_embeddings = tie_word_embeddings - super().__init__(**kwargs) + text_kwargs["dtype"] = kwargs.get("torch_dtype", kwargs.get("dtype")) # don't pop the dtype + self.text_config = self.sub_configs["text_config"](**text_kwargs) + + super().__post_init__(**kwargs) __all__ = ["Qwen2VLConfig", "Qwen2VLTextConfig"] diff --git a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py index 2ecb7773581b..1ff54b5cb3d5 100644 --- a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py @@ -287,8 +287,8 @@ def forward(self, seqlen: int) -> torch.Tensor: class PatchEmbed(nn.Module): def __init__( self, - patch_size: int = 14, - temporal_patch_size: int = 2, + patch_size: int | list[int] | tuple[int, int] = 14, + temporal_patch_size: int | list[int] | tuple[int, int] = 2, in_channels: int = 3, embed_dim: int = 1152, ) -> None: diff --git a/src/transformers/models/qwen3/configuration_qwen3.py b/src/transformers/models/qwen3/configuration_qwen3.py index 709df65a5464..e22d457f36c0 100644 --- a/src/transformers/models/qwen3/configuration_qwen3.py +++ b/src/transformers/models/qwen3/configuration_qwen3.py @@ -13,15 +13,15 @@ # limitations under the License. 
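For orientation, here is a minimal sketch of the flat-kwarg splitting that `Qwen2VLConfig.__post_init__` and `Qwen2_5_VLConfig.__post_init__` now perform so that legacy flat Hub configs can still populate the nested text config. The helper name below is made up for illustration; only the logic mirrors the hunks above, it is not part of the patch or a transformers API.

# Illustrative sketch, not part of the patch.
import inspect


def split_text_kwargs(text_config_cls, flat_kwargs: dict) -> dict:
    """Pop the keys a text sub-config accepts out of a flat kwargs dict."""
    # Keys accepted by the text config, plus legacy rope keys kept for backward compatibility.
    text_params = set(inspect.signature(text_config_cls.__init__).parameters)
    text_params |= {"rope_parameters", "rope_scaling", "rope_theta"}
    # Popping removes the keys from the flat dict so they are not re-applied to the outer config.
    return {key: flat_kwargs.pop(key) for key in list(flat_kwargs) if key in text_params}

The composite config then builds `text_config` from the popped keys only when no explicit `text_config` was passed, which is why the dict and None branches above are handled differently.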
"""Qwen3 model configuration""" -from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...modeling_rope_utils import RopeParameters +from ...utils import auto_docstring @auto_docstring(checkpoint="Qwen/Qwen3-8B") +@strict(accept_kwargs=True) class Qwen3Config(PreTrainedConfig): r""" max_window_layers (`int`, *optional*, defaults to 28): @@ -62,57 +62,35 @@ class Qwen3Config(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size: int | None = 151936, - hidden_size: int | None = 4096, - intermediate_size: int | None = 22016, - num_hidden_layers: int | None = 32, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = 32, - head_dim: int | None = 128, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 32768, - initializer_range: float | None = 0.02, - rms_norm_eps: float | None = 1e-6, - use_cache: bool | None = True, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = False, - use_sliding_window: bool | None = False, - sliding_window: int | None = 4096, - max_window_layers: int | None = 28, - layer_types: list[str] | None = None, - attention_dropout: float | None = 0.0, - pad_token_id: int | None = None, - bos_token_id: int | None = None, - eos_token_id: int | None = None, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.use_sliding_window = use_sliding_window - self.sliding_window = sliding_window if self.use_sliding_window else None - self.max_window_layers = max_window_layers - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads + vocab_size: int = 151936 + hidden_size: int = 4096 + intermediate_size: int = 22016 + num_hidden_layers: int = 32 + num_attention_heads: int = 32 + num_key_value_heads: int | None = 32 + head_dim: int = 128 + hidden_act: str = "silu" + max_position_embeddings: int = 32768 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-6 + use_cache: bool = True + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + use_sliding_window: bool = False + sliding_window: int | None = 4096 + max_window_layers: int = 28 + layer_types: list[str] | None = None + attention_dropout: float | int = 0.0 + pad_token_id: int | None = None + bos_token_id: int | None = None + eos_token_id: int | list[int] | None = None + + def __post_init__(self, **kwargs): + self.sliding_window = self.sliding_window if self.use_sliding_window else None + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads - self.num_key_value_heads = num_key_value_heads - self.head_dim = head_dim - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - - self.layer_types = layer_types if self.layer_types 
is None: self.layer_types = [ "sliding_attention" @@ -120,15 +98,7 @@ def __init__( else "full_attention" for i in range(self.num_hidden_layers) ] - layer_type_validation(self.layer_types, self.num_hidden_layers) - - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - self.rope_parameters = rope_parameters - - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["Qwen3Config"] diff --git a/src/transformers/models/qwen3_5/configuration_qwen3_5.py b/src/transformers/models/qwen3_5/configuration_qwen3_5.py index a2b739c6629d..609ebb0a2924 100644 --- a/src/transformers/models/qwen3_5/configuration_qwen3_5.py +++ b/src/transformers/models/qwen3_5/configuration_qwen3_5.py @@ -17,12 +17,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from ...configuration_utils import PreTrainedConfig, layer_type_validation +from huggingface_hub.dataclasses import strict + +from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring @auto_docstring(checkpoint="Qwen/Qwen3.5-27B") +@strict(accept_kwargs=True) class Qwen3_5TextConfig(PreTrainedConfig): r""" linear_conv_kernel_dim (`int`, *optional*, defaults to 4): @@ -69,78 +72,49 @@ class Qwen3_5TextConfig(PreTrainedConfig): "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), "norm": (["hidden_states"], ["hidden_states"]), } + + vocab_size: int = 248320 + hidden_size: int = 4096 + intermediate_size: int = 12288 + num_hidden_layers: int = 32 + num_attention_heads: int = 16 + num_key_value_heads: int = 4 + hidden_act: str = "silu" + max_position_embeddings: int = 32768 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-6 + use_cache: bool = True + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + attention_dropout: float | int = 0.0 + head_dim: int = 256 + linear_conv_kernel_dim: int = 4 + linear_key_head_dim: int = 128 + linear_value_head_dim: int = 128 + linear_num_key_heads: int = 16 + linear_num_value_heads: int = 32 + layer_types: list[str] | None = None + pad_token_id: int | None = None + bos_token_id: int | None = None + eos_token_id: int | list[int] | None = None base_config_key = "text_config" + ignore_keys_at_rope_validation = {"mrope_section", "mrope_interleaved"} - def __init__( - self, - vocab_size=248320, - hidden_size=4096, - intermediate_size=12288, - num_hidden_layers=32, - num_attention_heads=16, - num_key_value_heads=4, - hidden_act="silu", - max_position_embeddings=32768, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - tie_word_embeddings=False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias=False, - attention_dropout=0.0, - head_dim=256, - linear_conv_kernel_dim=4, - linear_key_head_dim=128, - linear_value_head_dim=128, - linear_num_key_heads=16, - linear_num_value_heads=32, - layer_types=None, - pad_token_id: int | None = None, - bos_token_id: int | None = None, - eos_token_id: int | None = None, - **kwargs, - ): - kwargs["ignore_keys_at_rope_validation"] = {"mrope_section", "mrope_interleaved"} - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - 
self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.head_dim = head_dim - self.rope_parameters = rope_parameters + def __post_init__(self, **kwargs): kwargs.setdefault("partial_rotary_factor", 0.25) # assign default for BC - - self.layer_types = layer_types if self.layer_types is None: - interval_pattern = kwargs.get("full_attention_interval", 4) + interval_pattern = kwargs.pop("full_attention_interval", 4) self.layer_types = [ "linear_attention" if bool((i + 1) % interval_pattern) else "full_attention" for i in range(self.num_hidden_layers) ] - layer_type_validation(self.layer_types, self.num_hidden_layers) - # linear attention part - self.linear_conv_kernel_dim = linear_conv_kernel_dim - self.linear_key_head_dim = linear_key_head_dim - self.linear_value_head_dim = linear_value_head_dim - self.linear_num_key_heads = linear_num_key_heads - self.linear_num_value_heads = linear_num_value_heads - super().__init__(**kwargs) + super().__post_init__(**kwargs) @auto_docstring(checkpoint="Qwen/Qwen3.5-27B") +@strict(accept_kwargs=True) class Qwen3_5VisionConfig(PreTrainedConfig): r""" num_position_embeddings (`int`, *optional*, defaults to 2304): @@ -154,39 +128,22 @@ class Qwen3_5VisionConfig(PreTrainedConfig): model_type = "qwen3_5" base_config_key = "vision_config" - def __init__( - self, - depth=27, - hidden_size=1152, - hidden_act="gelu_pytorch_tanh", - intermediate_size=4304, - num_heads=16, - in_channels=3, - patch_size=16, - spatial_merge_size=2, - temporal_patch_size=2, - out_hidden_size=3584, - num_position_embeddings=2304, - initializer_range=0.02, - **kwargs, - ): - super().__init__(**kwargs) - - self.depth = depth - self.hidden_size = hidden_size - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.num_heads = num_heads - self.in_channels = in_channels - self.patch_size = patch_size - self.spatial_merge_size = spatial_merge_size - self.temporal_patch_size = temporal_patch_size - self.out_hidden_size = out_hidden_size - self.num_position_embeddings = num_position_embeddings - self.initializer_range = initializer_range + depth: int = 27 + hidden_size: int = 1152 + hidden_act: str = "gelu_pytorch_tanh" + intermediate_size: int = 4304 + num_heads: int = 16 + in_channels: int = 3 + patch_size: int | list[int] | tuple[int, int] = 16 + spatial_merge_size: int = 2 + temporal_patch_size: int | list[int] | tuple[int, int] = 2 + out_hidden_size: int = 3584 + num_position_embeddings: int = 2304 + initializer_range: float = 0.02 @auto_docstring(checkpoint="Qwen/Qwen3.5-27B") +@strict(accept_kwargs=True) class Qwen3_5Config(PreTrainedConfig): r""" Example: @@ -208,33 +165,27 @@ class Qwen3_5Config(PreTrainedConfig): sub_configs = {"vision_config": Qwen3_5VisionConfig, "text_config": Qwen3_5TextConfig} keys_to_ignore_at_inference = ["past_key_values"] - def __init__( - self, - text_config=None, - vision_config=None, - image_token_id=248056, - video_token_id=248057, - vision_start_token_id=248053, - vision_end_token_id=248054, - tie_word_embeddings=False, - **kwargs, - ): - if 
isinstance(vision_config, dict): - self.vision_config = self.sub_configs["vision_config"](**vision_config) - elif vision_config is None: + text_config: dict | PreTrainedConfig | None = None + vision_config: dict | PreTrainedConfig | None = None + + image_token_id: int = 248056 + video_token_id: int = 248057 + vision_start_token_id: int = 248053 + vision_end_token_id: int = 248054 + tie_word_embeddings: bool = False + + def __post_init__(self, **kwargs): + if isinstance(self.vision_config, dict): + self.vision_config = self.sub_configs["vision_config"](**self.vision_config) + elif self.vision_config is None: self.vision_config = self.sub_configs["vision_config"]() - if isinstance(text_config, dict): - self.text_config = self.sub_configs["text_config"](**text_config) - elif text_config is None: + if isinstance(self.text_config, dict): + self.text_config = self.sub_configs["text_config"](**self.text_config) + elif self.text_config is None: self.text_config = self.sub_configs["text_config"]() - self.image_token_id = image_token_id - self.video_token_id = video_token_id - self.vision_start_token_id = vision_start_token_id - self.vision_end_token_id = vision_end_token_id - self.tie_word_embeddings = tie_word_embeddings - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["Qwen3_5Config", "Qwen3_5TextConfig"] diff --git a/src/transformers/models/qwen3_5/modular_qwen3_5.py b/src/transformers/models/qwen3_5/modular_qwen3_5.py index eabf54703b78..bdd7bb42f0a9 100644 --- a/src/transformers/models/qwen3_5/modular_qwen3_5.py +++ b/src/transformers/models/qwen3_5/modular_qwen3_5.py @@ -17,6 +17,7 @@ import torch import torch.nn.functional as F +from huggingface_hub.dataclasses import strict from torch import nn from ... import initialization as init @@ -24,7 +25,6 @@ from ...masking_utils import create_causal_mask from ...modeling_layers import GenericForSequenceClassification, GradientCheckpointingLayer from ...modeling_outputs import BaseModelOutputWithPast, BaseModelOutputWithPooling -from ...modeling_rope_utils import RopeParameters from ...modeling_utils import PreTrainedModel from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging @@ -57,6 +57,7 @@ @auto_docstring(checkpoint="Qwen/Qwen3.5-27B") +@strict(accept_kwargs=True) class Qwen3_5TextConfig(Qwen3NextConfig): r""" linear_conv_kernel_dim (`int`, *optional*, defaults to 4): @@ -98,77 +99,37 @@ class Qwen3_5TextConfig(Qwen3NextConfig): "layers.*.mlp.up_proj": "colwise", "layers.*.mlp.down_proj": "rowwise", } - - def __init__( - self, - vocab_size=248320, - hidden_size=4096, - intermediate_size=12288, - num_hidden_layers=32, - num_attention_heads=16, - num_key_value_heads=4, - hidden_act="silu", - max_position_embeddings=32768, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - tie_word_embeddings=False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias=False, - attention_dropout=0.0, - head_dim=256, - linear_conv_kernel_dim=4, - linear_key_head_dim=128, - linear_value_head_dim=128, - linear_num_key_heads=16, - linear_num_value_heads=32, - layer_types=None, - pad_token_id: int | None = None, - bos_token_id: int | None = None, - eos_token_id: int | None = None, - **kwargs, - ): - kwargs["ignore_keys_at_rope_validation"] = {"mrope_section", "mrope_interleaved"} - super().__init__( - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) - del self.decoder_sparse_step - del self.norm_topk_prob + 
ignore_keys_at_rope_validation = {"mrope_section", "mrope_interleaved"} + + vocab_size: int = 248320 + hidden_size: int = 4096 + intermediate_size: int = 12288 + num_hidden_layers: int = 32 + num_key_value_heads: int = 4 + + decoder_sparse_step = AttributeError() + norm_topk_prob = AttributeError() + mlp_only_layers = AttributeError() + moe_intermediate_size = AttributeError() + shared_expert_intermediate_size = AttributeError() + num_experts_per_tok = AttributeError() + num_experts = AttributeError() + output_router_logits = AttributeError() + router_aux_loss_coef = AttributeError() + + def __post_init__(self, **kwargs): + super().__post_init__(**kwargs) del self.mlp_only_layers - del self.moe_intermediate_size - del self.shared_expert_intermediate_size - del self.num_experts_per_tok - del self.num_experts - del self.output_router_logits - del self.router_aux_loss_coef @auto_docstring(checkpoint="Qwen/Qwen3.5-27B") +@strict(accept_kwargs=True) class Qwen3_5VisionConfig(Qwen3VLVisionConfig): - model_type = "qwen3_5" - - def __init__( - self, - depth=27, - hidden_size=1152, - hidden_act="gelu_pytorch_tanh", - intermediate_size=4304, - num_heads=16, - in_channels=3, - patch_size=16, - spatial_merge_size=2, - temporal_patch_size=2, - out_hidden_size=3584, - num_position_embeddings=2304, - initializer_range=0.02, - **kwargs, - ): - super().__init__(**kwargs) - del self.deepstack_visual_indexes + deepstack_visual_indexes = AttributeError() @auto_docstring(checkpoint="Qwen/Qwen3.5-27B") +@strict(accept_kwargs=True) class Qwen3_5Config(Qwen3VLConfig): r""" Example: @@ -186,30 +147,10 @@ class Qwen3_5Config(Qwen3VLConfig): >>> configuration = model.config ```""" - model_type = "qwen3_5" - sub_configs = {"vision_config": Qwen3_5VisionConfig, "text_config": Qwen3_5TextConfig} - - def __init__( - self, - text_config=None, - vision_config=None, - image_token_id=248056, - video_token_id=248057, - vision_start_token_id=248053, - vision_end_token_id=248054, - tie_word_embeddings=False, - **kwargs, - ): - super().__init__( - text_config=text_config, - vision_config=vision_config, - image_token_id=image_token_id, - video_token_id=video_token_id, - vision_start_token_id=vision_start_token_id, - vision_end_token_id=vision_end_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) + image_token_id: int = 248056 + video_token_id: int = 248057 + vision_start_token_id: int = 248053 + vision_end_token_id: int = 248054 class Qwen3_5DynamicCache(Qwen3NextDynamicCache): diff --git a/src/transformers/models/qwen3_5_moe/configuration_qwen3_5_moe.py b/src/transformers/models/qwen3_5_moe/configuration_qwen3_5_moe.py index 049673cc1ca2..78c6bb4ee628 100644 --- a/src/transformers/models/qwen3_5_moe/configuration_qwen3_5_moe.py +++ b/src/transformers/models/qwen3_5_moe/configuration_qwen3_5_moe.py @@ -17,12 +17,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
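As a side note, the hybrid layer layout that `Qwen3_5TextConfig` (and, below, `Qwen3_5MoeTextConfig`) derives in `__post_init__` when `layer_types` is not supplied follows the interval rule visible in the hunks above. A standalone paraphrase for reference, not an exported helper:

# Illustrative sketch of the layer_types default; full_attention_interval defaults to 4 as in the patch.
def default_layer_types(num_hidden_layers: int, full_attention_interval: int = 4) -> list[str]:
    # Every full_attention_interval-th layer uses full attention, the remaining layers use linear attention.
    return [
        "linear_attention" if bool((i + 1) % full_attention_interval) else "full_attention"
        for i in range(num_hidden_layers)
    ]


# For example, default_layer_types(8) yields three "linear_attention" layers followed by
# "full_attention", repeated: layers 4 and 8 (1-indexed) are full attention.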
-from ...configuration_utils import PreTrainedConfig, layer_type_validation +from huggingface_hub.dataclasses import strict + +from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring @auto_docstring(checkpoint="Qwen/Qwen3.5-35B-A3B") +@strict(accept_kwargs=True) class Qwen3_5MoeTextConfig(PreTrainedConfig): r""" linear_conv_kernel_dim (`int`, *optional*, defaults to 4): @@ -72,88 +75,54 @@ class Qwen3_5MoeTextConfig(PreTrainedConfig): "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), "norm": (["hidden_states"], ["hidden_states"]), } + + vocab_size: int = 248320 + hidden_size: int = 2048 + num_hidden_layers: int = 40 + num_attention_heads: int = 16 + num_key_value_heads: int = 2 + hidden_act: str = "silu" + max_position_embeddings: int = 32768 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-6 + use_cache: bool = True + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + attention_dropout: float | int = 0.0 + head_dim: int = 256 + linear_conv_kernel_dim: int = 4 + linear_key_head_dim: int = 128 + linear_value_head_dim: int = 128 + linear_num_key_heads: int = 16 + linear_num_value_heads: int = 32 + moe_intermediate_size: int = 512 + shared_expert_intermediate_size: int = 512 + num_experts_per_tok: int = 8 + num_experts: int = 256 + output_router_logits: bool = False + router_aux_loss_coef: float = 0.001 + layer_types: list[str] | None = None + pad_token_id: int | None = None + bos_token_id: int | None = None + eos_token_id: int | list[int] | None = None base_config_key = "text_config" + ignore_keys_at_rope_validation = {"mrope_section", "mrope_interleaved"} - def __init__( - self, - vocab_size=248320, - hidden_size=2048, - num_hidden_layers=40, - num_attention_heads=16, - num_key_value_heads=2, - hidden_act="silu", - max_position_embeddings=32768, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - tie_word_embeddings=False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias=False, - attention_dropout=0.0, - head_dim=256, - linear_conv_kernel_dim=4, - linear_key_head_dim=128, - linear_value_head_dim=128, - linear_num_key_heads=16, - linear_num_value_heads=32, - moe_intermediate_size=512, - shared_expert_intermediate_size=512, - num_experts_per_tok=8, - num_experts=256, - output_router_logits=False, - router_aux_loss_coef=0.001, - layer_types=None, - pad_token_id: int | None = None, - bos_token_id: int | None = None, - eos_token_id: int | None = None, - **kwargs, - ): - kwargs["ignore_keys_at_rope_validation"] = {"mrope_section", "mrope_interleaved"} - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.head_dim = head_dim - self.rope_parameters = rope_parameters + def __post_init__(self, **kwargs): kwargs.setdefault("partial_rotary_factor", 0.25) # assign default for BC - 
- self.layer_types = layer_types if self.layer_types is None: - interval_pattern = kwargs.get("full_attention_interval", 4) + interval_pattern = kwargs.pop("full_attention_interval", 4) self.layer_types = [ "linear_attention" if bool((i + 1) % interval_pattern) else "full_attention" for i in range(self.num_hidden_layers) ] - layer_type_validation(self.layer_types, self.num_hidden_layers) - - # linear attention part - self.linear_conv_kernel_dim = linear_conv_kernel_dim - self.linear_key_head_dim = linear_key_head_dim - self.linear_value_head_dim = linear_value_head_dim - self.linear_num_key_heads = linear_num_key_heads - self.linear_num_value_heads = linear_num_value_heads - self.moe_intermediate_size = moe_intermediate_size - self.shared_expert_intermediate_size = shared_expert_intermediate_size - self.num_experts_per_tok = num_experts_per_tok - self.num_experts = num_experts - self.output_router_logits = output_router_logits - self.router_aux_loss_coef = router_aux_loss_coef - super().__init__(**kwargs) + + super().__post_init__(**kwargs) @auto_docstring(checkpoint="Qwen/Qwen3.5-35B-A3B") +@strict(accept_kwargs=True) class Qwen3_5MoeVisionConfig(PreTrainedConfig): r""" num_position_embeddings (`int`, *optional*, defaults to 2304): @@ -167,39 +136,22 @@ class Qwen3_5MoeVisionConfig(PreTrainedConfig): model_type = "qwen3_5_moe" base_config_key = "vision_config" - def __init__( - self, - depth=27, - hidden_size=1152, - hidden_act="gelu_pytorch_tanh", - intermediate_size=4304, - num_heads=16, - in_channels=3, - patch_size=16, - spatial_merge_size=2, - temporal_patch_size=2, - out_hidden_size=3584, - num_position_embeddings=2304, - initializer_range=0.02, - **kwargs, - ): - super().__init__(**kwargs) - - self.depth = depth - self.hidden_size = hidden_size - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.num_heads = num_heads - self.in_channels = in_channels - self.patch_size = patch_size - self.spatial_merge_size = spatial_merge_size - self.temporal_patch_size = temporal_patch_size - self.out_hidden_size = out_hidden_size - self.num_position_embeddings = num_position_embeddings - self.initializer_range = initializer_range + depth: int = 27 + hidden_size: int = 1152 + hidden_act: str = "gelu_pytorch_tanh" + intermediate_size: int = 4304 + num_heads: int = 16 + in_channels: int = 3 + patch_size: int | list[int] | tuple[int, int] = 16 + spatial_merge_size: int = 2 + temporal_patch_size: int | list[int] | tuple[int, int] = 2 + out_hidden_size: int = 3584 + num_position_embeddings: int = 2304 + initializer_range: float = 0.02 @auto_docstring(checkpoint="Qwen/Qwen3.5-35B-A3B") +@strict(accept_kwargs=True) class Qwen3_5MoeConfig(PreTrainedConfig): r""" Example: @@ -221,33 +173,27 @@ class Qwen3_5MoeConfig(PreTrainedConfig): sub_configs = {"vision_config": Qwen3_5MoeVisionConfig, "text_config": Qwen3_5MoeTextConfig} keys_to_ignore_at_inference = ["past_key_values"] - def __init__( - self, - text_config=None, - vision_config=None, - image_token_id=248056, - video_token_id=248057, - vision_start_token_id=248053, - vision_end_token_id=248054, - tie_word_embeddings=False, - **kwargs, - ): - if isinstance(vision_config, dict): - self.vision_config = self.sub_configs["vision_config"](**vision_config) - elif vision_config is None: + text_config: dict | PreTrainedConfig | None = None + vision_config: dict | PreTrainedConfig | None = None + + image_token_id: int = 248056 + video_token_id: int = 248057 + vision_start_token_id: int = 248053 + vision_end_token_id: int = 248054 + 
tie_word_embeddings: bool = False + + def __post_init__(self, **kwargs): + if isinstance(self.vision_config, dict): + self.vision_config = self.sub_configs["vision_config"](**self.vision_config) + elif self.vision_config is None: self.vision_config = self.sub_configs["vision_config"]() - if isinstance(text_config, dict): - self.text_config = self.sub_configs["text_config"](**text_config) - elif text_config is None: + if isinstance(self.text_config, dict): + self.text_config = self.sub_configs["text_config"](**self.text_config) + elif self.text_config is None: self.text_config = self.sub_configs["text_config"]() - self.image_token_id = image_token_id - self.video_token_id = video_token_id - self.vision_start_token_id = vision_start_token_id - self.vision_end_token_id = vision_end_token_id - self.tie_word_embeddings = tie_word_embeddings - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["Qwen3_5MoeConfig", "Qwen3_5MoeTextConfig"] diff --git a/src/transformers/models/qwen3_5_moe/modular_qwen3_5_moe.py b/src/transformers/models/qwen3_5_moe/modular_qwen3_5_moe.py index cef308d80a07..ae052734e7e1 100644 --- a/src/transformers/models/qwen3_5_moe/modular_qwen3_5_moe.py +++ b/src/transformers/models/qwen3_5_moe/modular_qwen3_5_moe.py @@ -14,11 +14,11 @@ """PyTorch Qwen3.5Moe model.""" import torch +from huggingface_hub.dataclasses import strict from ... import initialization as init from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import BaseModelOutputWithPooling -from ...modeling_rope_utils import RopeParameters from ...modeling_utils import PreTrainedModel from ...utils import auto_docstring, logging from ..qwen3_5.configuration_qwen3_5 import Qwen3_5VisionConfig @@ -55,6 +55,7 @@ @auto_docstring(checkpoint="Qwen/Qwen3.5-35B-A3B") +@strict(accept_kwargs=True) class Qwen3_5MoeTextConfig(Qwen3NextConfig): r""" linear_conv_kernel_dim (`int`, *optional*, defaults to 4): @@ -99,58 +100,31 @@ class Qwen3_5MoeTextConfig(Qwen3NextConfig): "layers.*.mlp.shared_expert.up_proj": "colwise", "layers.*.mlp.shared_expert.down_proj": "rowwise", } - - def __init__( - self, - vocab_size=248320, - hidden_size=2048, - num_hidden_layers=40, - num_attention_heads=16, - num_key_value_heads=2, - hidden_act="silu", - max_position_embeddings=32768, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - tie_word_embeddings=False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias=False, - attention_dropout=0.0, - head_dim=256, - linear_conv_kernel_dim=4, - linear_key_head_dim=128, - linear_value_head_dim=128, - linear_num_key_heads=16, - linear_num_value_heads=32, - moe_intermediate_size=512, - shared_expert_intermediate_size=512, - num_experts_per_tok=8, - num_experts=256, - output_router_logits=False, - router_aux_loss_coef=0.001, - layer_types=None, - pad_token_id: int | None = None, - bos_token_id: int | None = None, - eos_token_id: int | None = None, - **kwargs, - ): - kwargs["ignore_keys_at_rope_validation"] = {"mrope_section", "mrope_interleaved"} - super().__init__( - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) - del self.intermediate_size - del self.decoder_sparse_step - del self.norm_topk_prob + ignore_keys_at_rope_validation = {"mrope_section", "mrope_interleaved"} + + vocab_size: int = 248320 + hidden_size: int = 2048 + num_hidden_layers: int = 40 + num_experts_per_tok: int = 8 + num_experts: int = 256 + intermediate_size = AttributeError() + decoder_sparse_step = AttributeError() + 
norm_topk_prob = AttributeError() + mlp_only_layers = AttributeError() + + def __post_init__(self, **kwargs): + super().__post_init__(**kwargs) del self.mlp_only_layers @auto_docstring(checkpoint="Qwen/Qwen3.5-35B-A3B") +@strict(accept_kwargs=True) class Qwen3_5MoeVisionConfig(Qwen3_5VisionConfig): pass @auto_docstring(checkpoint="Qwen/Qwen3.5-35B-A3B") +@strict(accept_kwargs=True) class Qwen3_5MoeConfig(Qwen3VLConfig): r""" Example: @@ -168,30 +142,10 @@ class Qwen3_5MoeConfig(Qwen3VLConfig): >>> configuration = model.config ```""" - model_type = "qwen3_5_moe" - sub_configs = {"vision_config": Qwen3_5MoeVisionConfig, "text_config": Qwen3_5MoeTextConfig} - - def __init__( - self, - text_config=None, - vision_config=None, - image_token_id=248056, - video_token_id=248057, - vision_start_token_id=248053, - vision_end_token_id=248054, - tie_word_embeddings=False, - **kwargs, - ): - super().__init__( - text_config=text_config, - vision_config=vision_config, - image_token_id=image_token_id, - video_token_id=video_token_id, - vision_start_token_id=vision_start_token_id, - vision_end_token_id=vision_end_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) + image_token_id: int = 248056 + video_token_id: int = 248057 + vision_start_token_id: int = 248053 + vision_end_token_id: int = 248054 class Qwen3_5MoeVisionRotaryEmbedding(Qwen3_5VisionRotaryEmbedding): diff --git a/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py b/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py index c422bfac9183..1abb45000bb5 100644 --- a/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py +++ b/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py @@ -13,15 +13,15 @@ # limitations under the License. """Qwen3MoE model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters -from ...utils import auto_docstring, logging - - -logger = logging.get_logger(__name__) +from ...utils import auto_docstring @auto_docstring(checkpoint="Qwen/Qwen3-30B-A3B-Base") +@strict(accept_kwargs=True) class Qwen3MoeConfig(PreTrainedConfig): r""" decoder_sparse_step (`int`, *optional*, defaults to 1): @@ -72,71 +72,39 @@ class Qwen3MoeConfig(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size: int | None = 151936, - hidden_size: int | None = 2048, - intermediate_size: int | None = 6144, - num_hidden_layers: int | None = 24, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = 4, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 32768, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-6, - use_cache: bool | None = True, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = False, - use_sliding_window: bool | None = False, - sliding_window: int | None = 4096, - attention_dropout: float | None = 0.0, - decoder_sparse_step: int | None = 1, - moe_intermediate_size: int | None = 768, - num_experts_per_tok: int | None = 8, - num_experts: int | None = 128, - norm_topk_prob: bool | None = False, - output_router_logits: bool | None = False, - router_aux_loss_coef: float | None = 0.001, - mlp_only_layers: bool | None = None, - pad_token_id: int | None = None, - bos_token_id: int | None = None, - eos_token_id: int | None = None, - **kwargs, - ): - self.vocab_size = 
vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.use_sliding_window = use_sliding_window - self.sliding_window = sliding_window if use_sliding_window else None - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.rope_parameters = rope_parameters - - # MoE arguments - self.decoder_sparse_step = decoder_sparse_step - self.moe_intermediate_size = moe_intermediate_size - self.num_experts_per_tok = num_experts_per_tok - self.num_experts = num_experts - self.norm_topk_prob = norm_topk_prob - self.output_router_logits = output_router_logits - self.router_aux_loss_coef = router_aux_loss_coef - self.mlp_only_layers = [] if mlp_only_layers is None else mlp_only_layers - - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - super().__init__(**kwargs) + vocab_size: int = 151936 + hidden_size: int = 2048 + intermediate_size: int = 6144 + num_hidden_layers: int = 24 + num_attention_heads: int = 32 + num_key_value_heads: int = 4 + hidden_act: str = "silu" + max_position_embeddings: int = 32768 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-6 + use_cache: bool = True + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + use_sliding_window: bool = False + sliding_window: int | None = 4096 + attention_dropout: float | int = 0.0 + decoder_sparse_step: int = 1 + moe_intermediate_size: int = 768 + num_experts_per_tok: int = 8 + num_experts: int = 128 + norm_topk_prob: bool = False + output_router_logits: bool = False + router_aux_loss_coef: float = 0.001 + mlp_only_layers: list[int] | None = None + pad_token_id: int | None = None + bos_token_id: int | None = None + eos_token_id: int | list[int] | None = None + + def __post_init__(self, **kwargs): + self.sliding_window = self.sliding_window if self.use_sliding_window else None + self.mlp_only_layers = [] if self.mlp_only_layers is None else self.mlp_only_layers + super().__post_init__(**kwargs) __all__ = ["Qwen3MoeConfig"] diff --git a/src/transformers/models/qwen3_next/configuration_qwen3_next.py b/src/transformers/models/qwen3_next/configuration_qwen3_next.py index bb42989828d4..73d0e277bb7d 100644 --- a/src/transformers/models/qwen3_next/configuration_qwen3_next.py +++ b/src/transformers/models/qwen3_next/configuration_qwen3_next.py @@ -13,15 +13,15 @@ # limitations under the License. 
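Note on the list-valued defaults in the Qwen3MoeConfig hunk above: `mlp_only_layers` keeps `None` as its declared default and is normalized to `[]` inside `__post_init__`. Dataclass fields cannot take a mutable default directly, and a single list shared across instances would be a bug in any case, so the `None`-then-normalize idiom is the usual workaround. A small stand-alone illustration with a toy class, not the transformers code:

    from dataclasses import dataclass

    @dataclass
    class ToyMoeConfig:
        use_sliding_window: bool = False
        sliding_window: int | None = 4096
        # A bare `mlp_only_layers: list[int] = []` would raise
        # "mutable default ... is not allowed" when the class is defined.
        mlp_only_layers: list[int] | None = None

        def __post_init__(self):
            # Each instance gets its own list, and the sliding window is dropped
            # unless it is actually enabled (same gating as the hunk above).
            self.sliding_window = self.sliding_window if self.use_sliding_window else None
            self.mlp_only_layers = [] if self.mlp_only_layers is None else self.mlp_only_layers

    a, b = ToyMoeConfig(), ToyMoeConfig()
    assert a.mlp_only_layers is not b.mlp_only_layers  # no shared mutable state
    assert a.sliding_window is None                     # disabled by default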
"""Qwen3-Next model configuration""" -from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...modeling_rope_utils import RopeParameters +from ...utils import auto_docstring @auto_docstring(checkpoint="Qwen/Qwen3-Next-80B-A3B-Instruct") +@strict(accept_kwargs=True) class Qwen3NextConfig(PreTrainedConfig): r""" linear_conv_kernel_dim (`int`, *optional*, defaults to 4): @@ -81,92 +81,52 @@ class Qwen3NextConfig(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size: int | None = 151936, - hidden_size: int | None = 2048, - intermediate_size: int | None = 5632, - num_hidden_layers: int | None = 48, - num_attention_heads: int | None = 16, - num_key_value_heads: int | None = 2, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 32768, - initializer_range: float | None = 0.02, - rms_norm_eps: float | None = 1e-6, - use_cache: bool | None = True, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - head_dim: int | None = 256, - linear_conv_kernel_dim: int | None = 4, - linear_key_head_dim: int | None = 128, - linear_value_head_dim: int | None = 128, - linear_num_key_heads: int | None = 16, - linear_num_value_heads: int | None = 32, - decoder_sparse_step: int | None = 1, - moe_intermediate_size: int | None = 512, - shared_expert_intermediate_size: int | None = 512, - num_experts_per_tok: int | None = 10, - num_experts: int | None = 512, - norm_topk_prob: bool | None = True, - output_router_logits: bool | None = False, - router_aux_loss_coef: float | None = 0.001, - mlp_only_layers: list[int] | None = [], - layer_types: list[str] | None = None, - pad_token_id: int | None = None, - bos_token_id: int | None = None, - eos_token_id: int | None = None, - **kwargs, - ): - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.head_dim = head_dim - self.rope_parameters = rope_parameters + vocab_size: int = 151936 + hidden_size: int = 2048 + intermediate_size: int = 5632 + num_hidden_layers: int = 48 + num_attention_heads: int = 16 + num_key_value_heads: int = 2 + hidden_act: str = "silu" + max_position_embeddings: int = 32768 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-6 + use_cache: bool = True + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + attention_dropout: float | int = 0.0 + head_dim: int = 256 + linear_conv_kernel_dim: int = 4 + linear_key_head_dim: int = 128 + linear_value_head_dim: int = 128 + 
linear_num_key_heads: int = 16 + linear_num_value_heads: int = 32 + decoder_sparse_step: int = 1 + moe_intermediate_size: int = 512 + shared_expert_intermediate_size: int = 512 + num_experts_per_tok: int = 10 + num_experts: int = 512 + norm_topk_prob: bool = True + output_router_logits: bool = False + router_aux_loss_coef: float = 0.001 + mlp_only_layers: list[int] | None = None + layer_types: list[str] | None = None + pad_token_id: int | None = None + bos_token_id: int | None = None + eos_token_id: int | list[int] | None = None + + def __post_init__(self, **kwargs): kwargs.setdefault("partial_rotary_factor", 0.25) # assign default for BC - - self.layer_types = layer_types + self.mlp_only_layers = [] if self.mlp_only_layers is None else self.mlp_only_layers if self.layer_types is None: - interval_pattern = kwargs.get("full_attention_interval", 4) + interval_pattern = kwargs.pop("full_attention_interval", 4) self.layer_types = [ "linear_attention" if bool((i + 1) % interval_pattern) else "full_attention" for i in range(self.num_hidden_layers) ] - layer_type_validation(self.layer_types, self.num_hidden_layers) - - # linear attention part - self.linear_conv_kernel_dim = linear_conv_kernel_dim - self.linear_key_head_dim = linear_key_head_dim - self.linear_value_head_dim = linear_value_head_dim - self.linear_num_key_heads = linear_num_key_heads - self.linear_num_value_heads = linear_num_value_heads - # MoE arguments - self.decoder_sparse_step = decoder_sparse_step - self.moe_intermediate_size = moe_intermediate_size - self.shared_expert_intermediate_size = shared_expert_intermediate_size - self.num_experts_per_tok = num_experts_per_tok - self.num_experts = num_experts - self.norm_topk_prob = norm_topk_prob - self.output_router_logits = output_router_logits - self.router_aux_loss_coef = router_aux_loss_coef - self.mlp_only_layers = mlp_only_layers - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["Qwen3NextConfig"] diff --git a/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py index 3ef96837f664..d307ed48fd52 100644 --- a/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py @@ -18,7 +18,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
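In the audio-encoder hunk just below, the old `self.num_hidden_layers = encoder_layers` assignment disappears in favor of `attribute_map = {"num_hidden_layers": "encoder_layers"}`, so `num_hidden_layers` becomes an alias for `encoder_layers` rather than a second stored copy that can drift out of sync. The redirection itself is handled by the config base class; a property-based equivalent, purely illustrative and not the transformers implementation, would be:

    from dataclasses import dataclass

    @dataclass
    class ToyAudioConfig:
        encoder_layers: int = 32

        @property
        def num_hidden_layers(self) -> int:
            # Alias: reads always reflect encoder_layers.
            return self.encoder_layers

        @num_hidden_layers.setter
        def num_hidden_layers(self, value: int) -> None:
            # Writes are redirected to the canonical attribute.
            self.encoder_layers = value

    cfg = ToyAudioConfig()
    cfg.num_hidden_layers = 24
    assert cfg.encoder_layers == 24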
-from ...configuration_utils import PreTrainedConfig, layer_type_validation +from huggingface_hub.dataclasses import strict + +from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring, logging @@ -27,6 +29,7 @@ @auto_docstring(checkpoint="Qwen/Qwen2.5-Omni-7B") +@strict(accept_kwargs=True) class Qwen3OmniMoeAudioEncoderConfig(PreTrainedConfig): r""" downsample_hidden_size ( `int`, *optional*, defaults to `480`): Hidden size in donwsampling layer @@ -38,51 +41,31 @@ class Qwen3OmniMoeAudioEncoderConfig(PreTrainedConfig): """ model_type = "qwen3_omni_moe_audio_encoder" + attribute_map = {"num_hidden_layers": "encoder_layers"} + + num_mel_bins: int = 128 + encoder_layers: int = 32 + encoder_attention_heads: int = 20 + encoder_ffn_dim: int = 5120 + d_model: int = 1280 + dropout: float | int = 0.0 + attention_dropout: float | int = 0.0 + activation_function: str = "gelu" + activation_dropout: float | int = 0.0 + scale_embedding: bool = False + initializer_range: float = 0.02 + max_source_positions: int = 1500 - def __init__( - self, - num_mel_bins: int | None = 128, - encoder_layers: int | None = 32, - encoder_attention_heads: int | None = 20, - encoder_ffn_dim: int | None = 5120, - d_model: int | None = 1280, - dropout: int | None = 0, - attention_dropout: int | None = 0, - activation_function: int | None = "gelu", - activation_dropout: int | None = 0, - scale_embedding: int | None = False, - initializer_range: int | None = 0.02, - max_source_positions: int | None = 1500, - n_window: int | None = 100, - output_dim: int | None = 3584, - n_window_infer: int | None = 400, - conv_chunksize: int | None = 500, - downsample_hidden_size: int | None = 480, - **kwargs, - ): - super().__init__(**kwargs) - - self.num_mel_bins = num_mel_bins - self.d_model = d_model - self.encoder_layers = encoder_layers - self.encoder_attention_heads = encoder_attention_heads - self.encoder_ffn_dim = encoder_ffn_dim - self.dropout = dropout - self.attention_dropout = attention_dropout - self.activation_function = activation_function - self.activation_dropout = activation_dropout - self.num_hidden_layers = encoder_layers - self.initializer_range = initializer_range - self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True - self.max_source_positions = max_source_positions - self.n_window = n_window - self.output_dim = output_dim - self.n_window_infer = n_window_infer - self.conv_chunksize = conv_chunksize - self.downsample_hidden_size = downsample_hidden_size + n_window: int = 100 + output_dim: int = 3584 + + n_window_infer: int = 400 + conv_chunksize: int = 500 + downsample_hidden_size: int = 480 @auto_docstring(checkpoint="Qwen/Qwen3-30B-A3B-Base") +@strict(accept_kwargs=True) class Qwen3OmniMoeVisionEncoderConfig(PreTrainedConfig): r""" num_position_embeddings (`int`, *optional*, defaults to 2304): @@ -96,41 +79,23 @@ class Qwen3OmniMoeVisionEncoderConfig(PreTrainedConfig): model_type = "qwen3_omni_moe_vision_encoder" base_config_key = "vision_config" - def __init__( - self, - depth=27, - hidden_size=1152, - hidden_act="gelu_pytorch_tanh", - intermediate_size=4304, - num_heads=16, - in_channels=3, - patch_size=16, - spatial_merge_size=2, - temporal_patch_size=2, - out_hidden_size=3584, - num_position_embeddings=2304, - deepstack_visual_indexes=[8, 16, 24], - initializer_range=0.02, - **kwargs, - ): - super().__init__(**kwargs) - - self.depth = depth - self.hidden_size = hidden_size - self.hidden_act = hidden_act 
- self.intermediate_size = intermediate_size - self.num_heads = num_heads - self.in_channels = in_channels - self.patch_size = patch_size - self.spatial_merge_size = spatial_merge_size - self.temporal_patch_size = temporal_patch_size - self.out_hidden_size = out_hidden_size - self.num_position_embeddings = num_position_embeddings - self.initializer_range = initializer_range - self.deepstack_visual_indexes = deepstack_visual_indexes + depth: int = 27 + hidden_size: int = 1152 + hidden_act: str = "gelu_pytorch_tanh" + intermediate_size: int = 4304 + num_heads: int = 16 + in_channels: int = 3 + patch_size: int | list[int] | tuple[int, int] = 16 + spatial_merge_size: int = 2 + temporal_patch_size: int | list[int] | tuple[int, int] = 2 + out_hidden_size: int = 3584 + num_position_embeddings: int = 2304 + deepstack_visual_indexes: list[int] | tuple[int, ...] = (8, 16, 24) + initializer_range: float = 0.02 @auto_docstring(checkpoint="Qwen/Qwen3-30B-A3B-Base") +@strict(accept_kwargs=True) class Qwen3OmniMoeTextConfig(PreTrainedConfig): r""" decoder_sparse_step (`int`, *optional*, defaults to 1): @@ -174,74 +139,43 @@ class Qwen3OmniMoeTextConfig(PreTrainedConfig): "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), "norm": (["hidden_states"], ["hidden_states"]), } - - def __init__( - self, - vocab_size: int | None = 3584, - hidden_size: int | None = 2048, - intermediate_size: int | None = 18944, - num_hidden_layers: int | None = 28, - num_attention_heads: int | None = 28, - num_key_value_heads: int | None = 4, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 32768, - initializer_range: float | None = 0.02, - rms_norm_eps: float | None = 1e-6, - use_cache: bool | None = True, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = False, - sliding_window: int | None = None, - attention_dropout: int | None = 0, - decoder_sparse_step: int | None = 1, - moe_intermediate_size: int | None = 768, - num_experts_per_tok: int | None = 8, - num_experts: int | None = 128, - norm_topk_prob: bool | None = True, - output_router_logits: bool | None = False, - router_aux_loss_coef: float | None = 0.001, - mlp_only_layers: list[int] | None = None, - pad_token_id: int | None = None, - bos_token_id: int | None = None, - eos_token_id: int | None = None, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.sliding_window = sliding_window - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.rope_parameters = rope_parameters - - # MoE arguments - self.decoder_sparse_step = decoder_sparse_step - self.moe_intermediate_size = moe_intermediate_size - self.num_experts_per_tok = num_experts_per_tok - self.num_experts = num_experts - self.norm_topk_prob = norm_topk_prob - self.output_router_logits = output_router_logits - self.router_aux_loss_coef = router_aux_loss_coef - self.mlp_only_layers = [] if mlp_only_layers is None else mlp_only_layers - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - - super().__init__( - 
ignore_keys_at_rope_validation={"mrope_section", "interleaved", "mrope_interleaved"}, - **kwargs, - ) + ignore_keys_at_rope_validation = {"mrope_section", "interleaved", "mrope_interleaved"} + + vocab_size: int = 3584 + hidden_size: int = 2048 + intermediate_size: int = 18944 + num_hidden_layers: int = 28 + num_attention_heads: int = 28 + num_key_value_heads: int = 4 + hidden_act: str = "silu" + max_position_embeddings: int = 32768 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-6 + use_cache: bool = True + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + sliding_window: int | None = None + attention_dropout: float | int = 0.0 + decoder_sparse_step: int = 1 + moe_intermediate_size: int = 768 + num_experts_per_tok: int = 8 + num_experts: int = 128 + norm_topk_prob: bool = True + output_router_logits: bool = False + router_aux_loss_coef: float = 0.001 + mlp_only_layers: list[int] | None = None + pad_token_id: int | None = None + bos_token_id: int | None = None + eos_token_id: int | list[int] | None = None + + def __post_init__(self, **kwargs): + self.mlp_only_layers = [] if self.mlp_only_layers is None else self.mlp_only_layers + + super().__post_init__(**kwargs) @auto_docstring(checkpoint="Qwen/Qwen3-30B-A3B-Base") +@strict(accept_kwargs=True) class Qwen3OmniMoeThinkerConfig(PreTrainedConfig): r""" position_id_per_seconds (`int`, *optional*, defaults to 25): @@ -275,52 +209,40 @@ class Qwen3OmniMoeThinkerConfig(PreTrainedConfig): "text_config": Qwen3OmniMoeTextConfig, } - def __init__( - self, - audio_config=None, - vision_config=None, - text_config=None, - audio_token_id=151646, - image_token_id=151655, - video_token_id=151656, - position_id_per_seconds=25, - audio_start_token_id=151647, - user_token_id=872, - initializer_range=0.02, - tie_word_embeddings=False, - **kwargs, - ): - self.user_token_id = user_token_id - self.position_id_per_seconds = position_id_per_seconds - self.audio_start_token_id = audio_start_token_id - self.initializer_range = initializer_range - self.tie_word_embeddings = tie_word_embeddings - - if isinstance(vision_config, dict): - vision_config = Qwen3OmniMoeVisionEncoderConfig(**vision_config) - elif vision_config is None: - vision_config = Qwen3OmniMoeVisionEncoderConfig() - self.vision_config = vision_config - - if isinstance(audio_config, dict): - audio_config = Qwen3OmniMoeAudioEncoderConfig(**audio_config) - elif audio_config is None: - audio_config = Qwen3OmniMoeAudioEncoderConfig() - self.audio_config = audio_config - - if isinstance(text_config, dict): - text_config = Qwen3OmniMoeTextConfig(**text_config) - elif text_config is None: - text_config = Qwen3OmniMoeTextConfig() - self.text_config = text_config - - super().__init__(**kwargs) - self.audio_token_id = audio_token_id - self.image_token_id = image_token_id - self.video_token_id = video_token_id + audio_config: dict | PreTrainedConfig | None = None + vision_config: dict | PreTrainedConfig | None = None + text_config: dict | PreTrainedConfig | None = None + position_id_per_seconds: int = 25 + audio_start_token_id: int = 151647 + user_token_id: int = 872 + initializer_range: float = 0.02 + tie_word_embeddings: bool = False + + audio_token_id: int = 151646 + image_token_id: int = 151655 + video_token_id: int = 151656 + + def __post_init__(self, **kwargs): + if isinstance(self.vision_config, dict): + self.vision_config = Qwen3OmniMoeVisionEncoderConfig(**self.vision_config) + elif self.vision_config is None: + self.vision_config = 
Qwen3OmniMoeVisionEncoderConfig() + + if isinstance(self.audio_config, dict): + self.audio_config = Qwen3OmniMoeAudioEncoderConfig(**self.audio_config) + elif self.audio_config is None: + self.audio_config = Qwen3OmniMoeAudioEncoderConfig() + + if isinstance(self.text_config, dict): + self.text_config = Qwen3OmniMoeTextConfig(**self.text_config) + elif self.text_config is None: + self.text_config = Qwen3OmniMoeTextConfig() + + super().__post_init__(**kwargs) @auto_docstring(checkpoint="Qwen/Qwen3OmniMoeTalkerCodePredictor-8B") +@strict(accept_kwargs=True) class Qwen3OmniMoeTalkerCodePredictorConfig(PreTrainedConfig): r""" max_window_layers (`int`, *optional*, defaults to 28): @@ -351,57 +273,35 @@ class Qwen3OmniMoeTalkerCodePredictorConfig(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size: int | None = 2048, - hidden_size: int | None = 1024, - intermediate_size: int | None = 3072, - num_hidden_layers: int | None = 5, - num_attention_heads: int | None = 16, - num_key_value_heads: int | None = 8, - head_dim: int | None = 128, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 32768, - initializer_range: float | None = 0.02, - rms_norm_eps: float | None = 0.000001, - use_cache: bool | None = True, - tie_word_embeddings: bool | None = False, - rope_parameters: int | None = None, - attention_bias: bool | None = False, - sliding_window: int | None = None, - max_window_layers: int | None = 28, - layer_types: list[str] | None = None, - attention_dropout: int | None = 0, - num_code_groups: int | None = 32, - pad_token_id: int | None = None, - bos_token_id: int | None = None, - eos_token_id: int | None = None, - **kwargs, - ): - self.num_code_groups = num_code_groups - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.sliding_window = sliding_window - self.max_window_layers = max_window_layers - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.head_dim = head_dim - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - - self.layer_types = layer_types + vocab_size: int = 2048 + hidden_size: int = 1024 + intermediate_size: int = 3072 + num_hidden_layers: int = 5 + num_attention_heads: int = 16 + num_key_value_heads: int = 8 + head_dim: int = 128 + hidden_act: str = "silu" + max_position_embeddings: int = 32768 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-6 + use_cache: bool = True + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + sliding_window: int | None = None + max_window_layers: int = 28 + layer_types: list[str] | None = None + attention_dropout: float | int = 0.0 + pad_token_id: int | None = None + bos_token_id: int | None = None + eos_token_id: int | list[int] | None = None + num_code_groups: int = 32 + + def __post_init__(self, **kwargs): + self.sliding_window = self.sliding_window + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads + if self.layer_types is None: self.layer_types = [ 
"sliding_attention" @@ -409,18 +309,11 @@ def __init__( else "full_attention" for i in range(self.num_hidden_layers) ] - layer_type_validation(self.layer_types, self.num_hidden_layers) - - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - self.rope_parameters = rope_parameters - - super().__init__(**kwargs) + super().__post_init__(**kwargs) @auto_docstring(checkpoint="Qwen/Qwen3-30B-A3B-Base") +@strict(accept_kwargs=True) class Qwen3OmniMoeTalkerTextConfig(PreTrainedConfig): r""" decoder_sparse_step (`int`, *optional*, defaults to 1): @@ -470,73 +363,42 @@ class Qwen3OmniMoeTalkerTextConfig(PreTrainedConfig): "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), "norm": (["hidden_states"], ["hidden_states"]), } - - def __init__( - self, - vocab_size: int | None = 3072, - hidden_size: int | None = 1024, - intermediate_size: int | None = 2048, - num_hidden_layers: int | None = 20, - num_attention_heads: int | None = 16, - num_key_value_heads: int | None = 2, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 32768, - initializer_range: float | None = 0.02, - rms_norm_eps: float | None = 0.000001, - use_cache: int | None = True, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = False, - sliding_window: int | None = None, - attention_dropout: int | None = 0, - decoder_sparse_step: int | None = 1, - moe_intermediate_size: int | None = 384, - num_experts_per_tok: int | None = 8, - num_experts: int | None = 128, - norm_topk_prob: bool | None = False, - output_router_logits: bool | None = False, - router_aux_loss_coef: float | None = 0.001, - mlp_only_layers: list[int] | None = None, - pad_token_id: int | None = None, - bos_token_id: int | None = None, - eos_token_id: int | None = None, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.sliding_window = sliding_window - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.rope_parameters = rope_parameters - - # MoE arguments - self.decoder_sparse_step = decoder_sparse_step - self.moe_intermediate_size = moe_intermediate_size - self.num_experts_per_tok = num_experts_per_tok - self.num_experts = num_experts - self.norm_topk_prob = norm_topk_prob - self.output_router_logits = output_router_logits - self.router_aux_loss_coef = router_aux_loss_coef - self.mlp_only_layers = [] if mlp_only_layers is None else mlp_only_layers - - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - super().__init__(**kwargs) + vocab_size: int = 3072 + hidden_size: int = 1024 + intermediate_size: int = 2048 + num_hidden_layers: int = 20 + num_attention_heads: int = 16 + num_key_value_heads: int = 2 + hidden_act: str = "silu" + max_position_embeddings: int = 32768 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-6 + use_cache: bool = True + tie_word_embeddings: bool = 
False + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + sliding_window: int | None = None + attention_dropout: float | int = 0.0 + decoder_sparse_step: int = 1 + moe_intermediate_size: int = 384 + num_experts_per_tok: int = 8 + num_experts: int = 128 + norm_topk_prob: bool = False + output_router_logits: bool = False + router_aux_loss_coef: float = 0.001 + mlp_only_layers: list[int] | None = None + pad_token_id: int | None = None + bos_token_id: int | None = None + eos_token_id: int | list[int] | None = None + + def __post_init__(self, **kwargs): + self.sliding_window = self.sliding_window + self.mlp_only_layers = [] if self.mlp_only_layers is None else self.mlp_only_layers + super().__post_init__(**kwargs) @auto_docstring(checkpoint="Qwen/Qwen3-30B-A3B-Base") +@strict(accept_kwargs=True) class Qwen3OmniMoeTalkerConfig(PreTrainedConfig): r""" code_predictor_config (`dict`, *optional*): @@ -592,66 +454,45 @@ class Qwen3OmniMoeTalkerConfig(PreTrainedConfig): "text_config": Qwen3OmniMoeTalkerTextConfig, } - def __init__( - self, - code_predictor_config=None, - text_config=None, - num_code_groups=32, - thinker_hidden_size=2048, - codec_eos_token_id=4198, - accept_hidden_layer=18, - codec_nothink_id=4203, - codec_think_bos_id=4204, - codec_think_eos_id=4205, - codec_pad_id=4196, - codec_bos_id=4197, - audio_token_id=151646, - image_token_id=151655, - video_token_id=151656, - vision_start_token_id=151652, - position_id_per_seconds=25, - audio_start_token_id=151669, - speaker_id=None, - **kwargs, - ): - if code_predictor_config is None: - code_predictor_config = {} + code_predictor_config: dict | PreTrainedConfig | None = None + text_config: dict | PreTrainedConfig | None = None + num_code_groups: int = 32 + thinker_hidden_size: int = 2048 + codec_eos_token_id: int = 4198 + accept_hidden_layer: int = 18 + codec_nothink_id: int = 4203 + codec_think_bos_id: int = 4204 + codec_think_eos_id: int = 4205 + codec_pad_id: int = 4196 + codec_bos_id: int = 4197 + audio_token_id: int = 151646 + image_token_id: int = 151655 + video_token_id: int = 151656 + vision_start_token_id: int = 151652 + position_id_per_seconds: int = 25 + audio_start_token_id: int = 151669 + speaker_id: dict | None = None + initializer_range: float = 0.02 + + def __post_init__(self, **kwargs): + if self.code_predictor_config is None: + self.code_predictor_config = {} self.code_predictor_config = Qwen3OmniMoeTalkerCodePredictorConfig() logger.info("code_predictor_config is None. Initializing code_predictor_config model with default values") - elif isinstance(code_predictor_config, Qwen3OmniMoeTalkerCodePredictorConfig): - self.code_predictor_config = code_predictor_config else: - self.code_predictor_config = Qwen3OmniMoeTalkerCodePredictorConfig(**code_predictor_config) + self.code_predictor_config = Qwen3OmniMoeTalkerCodePredictorConfig(**self.code_predictor_config) - if text_config is None: - text_config = {} + if self.text_config is None: + self.text_config = {} self.text_config = Qwen3OmniMoeTalkerTextConfig() logger.info("talker text_config is None. 
Initializing talker text model with default values") - elif isinstance(text_config, Qwen3OmniMoeTalkerTextConfig): - self.text_config = text_config else: - self.text_config = Qwen3OmniMoeTalkerTextConfig(**text_config) - self.num_code_groups = num_code_groups - self.thinker_hidden_size = thinker_hidden_size - self.codec_eos_token_id = codec_eos_token_id - self.accept_hidden_layer = accept_hidden_layer - self.codec_nothink_id = codec_nothink_id - self.codec_think_bos_id = codec_think_bos_id - self.codec_think_eos_id = codec_think_eos_id - self.codec_pad_id = codec_pad_id - self.codec_bos_id = codec_bos_id - self.audio_token_id = audio_token_id - self.image_token_id = image_token_id - self.video_token_id = video_token_id - self.position_id_per_seconds = position_id_per_seconds - self.audio_start_token_id = audio_start_token_id - self.vision_start_token_id = vision_start_token_id - self.speaker_id = speaker_id - self.initializer_range = self.text_config.initializer_range - super().__init__(**kwargs) + self.text_config = Qwen3OmniMoeTalkerTextConfig(**self.text_config) + super().__post_init__(**kwargs) @auto_docstring(checkpoint="Qwen/Qwen3-30B-A3B-Base") +@strict(accept_kwargs=True) class Qwen3OmniMoeCode2WavConfig(PreTrainedConfig): r""" num_quantizers (`int`, *optional*, defaults to 16): @@ -678,50 +519,25 @@ class Qwen3OmniMoeCode2WavConfig(PreTrainedConfig): >>> config = model.config ```""" - def __init__( - self, - codebook_size=2048, - hidden_size=1024, - max_position_embeddings=8000, - rope_parameters: RopeParameters | dict[RopeParameters] | None = None, - num_attention_heads=16, - num_key_value_heads=16, - attention_bias=False, - sliding_window=72, - intermediate_size=3072, - hidden_act="silu", - layer_scale_initial_scale=0.01, - rms_norm_eps=1e-5, - num_hidden_layers=8, - num_quantizers=16, - upsample_rates=(8, 5, 4, 3), - upsampling_ratios=(2, 2), - decoder_dim=1536, - attention_dropout=0.0, - initializer_range=0.02, - **kwargs, - ): - self.codebook_size = codebook_size - self.hidden_size = hidden_size - self.max_position_embeddings = max_position_embeddings - self.num_attention_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads - self.attention_bias = attention_bias - self.sliding_window = sliding_window - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.layer_scale_initial_scale = layer_scale_initial_scale - self.rms_norm_eps = rms_norm_eps - self.num_hidden_layers = num_hidden_layers - self.num_quantizers = num_quantizers - self.upsample_rates = upsample_rates - self.upsampling_ratios = upsampling_ratios - self.decoder_dim = decoder_dim - self.attention_dropout = attention_dropout - self.initializer_range = initializer_range - self.rope_parameters = rope_parameters - - super().__init__(**kwargs) + codebook_size: int = 2048 + hidden_size: int = 1024 + max_position_embeddings: int = 8000 + rope_parameters: RopeParameters | dict | None = None + num_attention_heads: int = 16 + num_key_value_heads: int = 16 + attention_bias: bool = False + sliding_window: int = 72 + intermediate_size: int = 3072 + hidden_act: str = "silu" + layer_scale_initial_scale: float = 0.01 + rms_norm_eps: float = 1e-5 + num_hidden_layers: int = 8 + num_quantizers: int = 16 + upsample_rates: list[int] | tuple[int, ...] = (8, 5, 4, 3) + upsampling_ratios: list[int] | tuple[int, ...] 
= (2, 2) + decoder_dim: int = 1536 + attention_dropout: float | int = 0.0 + initializer_range: float = 0.02 @property def layer_types(self): @@ -732,6 +548,7 @@ def layer_types(self): @auto_docstring(checkpoint="Qwen/Qwen3-30B-A3B-Base") +@strict(accept_kwargs=True) class Qwen3OmniMoeConfig(PreTrainedConfig): r""" thinker_config (`dict`, *optional*): Configuration of the underlying thinker sub-model. @@ -775,48 +592,39 @@ class Qwen3OmniMoeConfig(PreTrainedConfig): "code2wav_config": Qwen3OmniMoeCode2WavConfig, } - def __init__( - self, - thinker_config=None, - talker_config=None, - code2wav_config=None, - enable_audio_output=True, - im_start_token_id=151644, - im_end_token_id=151645, - tts_pad_token_id=151671, - tts_bos_token_id=151672, - tts_eos_token_id=151673, - system_token_id=8948, - user_token_id=872, - assistant_token_id=77091, - **kwargs, - ): - if thinker_config is None: - thinker_config = {} + thinker_config: dict | PreTrainedConfig | None = None + talker_config: dict | PreTrainedConfig | None = None + code2wav_config: dict | PreTrainedConfig | None = None + enable_audio_output: bool = True + im_start_token_id: int = 151644 + im_end_token_id: int = 151645 + tts_pad_token_id: int = 151671 + tts_bos_token_id: int = 151672 + tts_eos_token_id: int = 151673 + system_token_id: int = 8948 + user_token_id: int = 872 + assistant_token_id: int = 77091 + + def __post_init__(self, **kwargs): + if self.thinker_config is None: + self.thinker_config = Qwen3OmniMoeThinkerConfig() logger.info("thinker_config is None. Initializing thinker model with default values") + elif isinstance(self.thinker_config, dict): + self.thinker_config = Qwen3OmniMoeThinkerConfig(**self.thinker_config) - if talker_config is None: - talker_config = {} + if self.talker_config is None: + self.talker_config = Qwen3OmniMoeTalkerConfig() logger.info("talker_config is None. Initializing talker model with default values") + elif isinstance(self.talker_config, dict): + self.talker_config = Qwen3OmniMoeTalkerConfig(**self.talker_config) + + if self.code2wav_config is None: + self.code2wav_config = Qwen3OmniMoeCode2WavConfig() + logger.info("code2wav_config is None. Initializing code2wav_config model with default values") + elif isinstance(self.code2wav_config, dict): + self.code2wav_config = Qwen3OmniMoeCode2WavConfig(**self.code2wav_config) - if code2wav_config is None: - code2wav_config = {} - logger.info("code2wav_config is None. 
Initializing code2wav model with default values") - - self.thinker_config = Qwen3OmniMoeThinkerConfig(**thinker_config) - self.talker_config = Qwen3OmniMoeTalkerConfig(**talker_config) - self.code2wav_config = Qwen3OmniMoeCode2WavConfig(**code2wav_config) - self.initializer_range = self.thinker_config.initializer_range - self.enable_audio_output = enable_audio_output - self.im_start_token_id = im_start_token_id - self.im_end_token_id = im_end_token_id - self.tts_pad_token_id = tts_pad_token_id - self.tts_bos_token_id = tts_bos_token_id - self.tts_eos_token_id = tts_eos_token_id - self.system_token_id = system_token_id - self.user_token_id = user_token_id - self.assistant_token_id = assistant_token_id - super().__init__(**kwargs) + super().__post_init__(**kwargs) def get_text_config(self, decoder=False) -> "PreTrainedConfig": """ diff --git a/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py index ed8621cbbee0..a575b3e88dae 100644 --- a/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py @@ -3904,7 +3904,7 @@ def generate( use_audio_in_video: bool = False, return_audio: bool | None = None, thinker_max_new_tokens: int = 1024, - thinker_eos_token_id: int = 151645, + thinker_eos_token_id: int | list[int] | None = 151645, talker_max_new_tokens: int = 4096, talker_do_sample: bool = True, talker_top_k: int = 50, diff --git a/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py index 8e548f703845..6f5ec59f0bbd 100644 --- a/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py @@ -20,6 +20,7 @@ import numpy as np import torch +from huggingface_hub.dataclasses import strict from torch import nn from torch.nn import functional as F @@ -134,55 +135,19 @@ class Qwen3OmniMoeAudioEncoderConfig(Qwen2_5OmniAudioEncoderConfig): output_dim (`int`, *optional*, defaults to 3584): Dimensionality of the output """ - def __init__( - self, - num_mel_bins: int | None = 128, - encoder_layers: int | None = 32, - encoder_attention_heads: int | None = 20, - encoder_ffn_dim: int | None = 5120, - d_model: int | None = 1280, - dropout: int | None = 0, - attention_dropout: int | None = 0, - activation_function: int | None = "gelu", - activation_dropout: int | None = 0, - scale_embedding: int | None = False, - initializer_range: int | None = 0.02, - max_source_positions: int | None = 1500, - n_window: int | None = 100, - output_dim: int | None = 3584, - n_window_infer: int | None = 400, - conv_chunksize: int | None = 500, - downsample_hidden_size: int | None = 480, - **kwargs, - ): - super().__init__( - num_mel_bins, - encoder_layers, - encoder_attention_heads, - encoder_ffn_dim, - d_model, - dropout, - attention_dropout, - activation_function, - activation_dropout, - scale_embedding, - initializer_range, - max_source_positions, - n_window, - output_dim, - **kwargs, - ) - self.n_window_infer = n_window_infer - self.conv_chunksize = conv_chunksize - self.downsample_hidden_size = downsample_hidden_size + n_window_infer: int = 400 + conv_chunksize: int = 500 + downsample_hidden_size: int = 480 @auto_docstring(checkpoint="Qwen/Qwen3-30B-A3B-Base") +@strict(accept_kwargs=True) class Qwen3OmniMoeVisionEncoderConfig(Qwen3VLMoeVisionConfig): pass @auto_docstring(checkpoint="Qwen/Qwen3-30B-A3B-Base") 
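A recurring move in the composite configs above (thinker/talker/code2wav here, vision/text earlier): the sub-config fields are typed `dict | PreTrainedConfig | None` and coerced in `__post_init__`, so a caller may pass a plain dict, an already-built config object, or nothing at all and still end up with a config instance. A stripped-down sketch of the same three-way handling, with toy classes standing in for the real sub-configs:

    from dataclasses import dataclass
    from typing import Any

    @dataclass
    class ToyVisionConfig:
        depth: int = 27

    @dataclass
    class ToyCompositeConfig:
        vision_config: Any = None  # dict | ToyVisionConfig | None

        def __post_init__(self):
            # dict -> build a sub-config from it; None -> fall back to defaults;
            # an existing config instance is kept as-is.
            if isinstance(self.vision_config, dict):
                self.vision_config = ToyVisionConfig(**self.vision_config)
            elif self.vision_config is None:
                self.vision_config = ToyVisionConfig()

    assert ToyCompositeConfig().vision_config.depth == 27
    assert ToyCompositeConfig(vision_config={"depth": 12}).vision_config.depth == 12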
+@strict(accept_kwargs=True) class Qwen3OmniMoeTextConfig(PreTrainedConfig): r""" decoder_sparse_step (`int`, *optional*, defaults to 1): @@ -226,74 +191,43 @@ class Qwen3OmniMoeTextConfig(PreTrainedConfig): "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), "norm": (["hidden_states"], ["hidden_states"]), } - - def __init__( - self, - vocab_size: int | None = 3584, - hidden_size: int | None = 2048, - intermediate_size: int | None = 18944, - num_hidden_layers: int | None = 28, - num_attention_heads: int | None = 28, - num_key_value_heads: int | None = 4, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 32768, - initializer_range: float | None = 0.02, - rms_norm_eps: float | None = 1e-6, - use_cache: bool | None = True, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = False, - sliding_window: int | None = None, - attention_dropout: int | None = 0, - decoder_sparse_step: int | None = 1, - moe_intermediate_size: int | None = 768, - num_experts_per_tok: int | None = 8, - num_experts: int | None = 128, - norm_topk_prob: bool | None = True, - output_router_logits: bool | None = False, - router_aux_loss_coef: float | None = 0.001, - mlp_only_layers: list[int] | None = None, - pad_token_id: int | None = None, - bos_token_id: int | None = None, - eos_token_id: int | None = None, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.sliding_window = sliding_window - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.rope_parameters = rope_parameters - - # MoE arguments - self.decoder_sparse_step = decoder_sparse_step - self.moe_intermediate_size = moe_intermediate_size - self.num_experts_per_tok = num_experts_per_tok - self.num_experts = num_experts - self.norm_topk_prob = norm_topk_prob - self.output_router_logits = output_router_logits - self.router_aux_loss_coef = router_aux_loss_coef - self.mlp_only_layers = [] if mlp_only_layers is None else mlp_only_layers - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - - super().__init__( - ignore_keys_at_rope_validation={"mrope_section", "interleaved", "mrope_interleaved"}, - **kwargs, - ) + ignore_keys_at_rope_validation = {"mrope_section", "interleaved", "mrope_interleaved"} + + vocab_size: int = 3584 + hidden_size: int = 2048 + intermediate_size: int = 18944 + num_hidden_layers: int = 28 + num_attention_heads: int = 28 + num_key_value_heads: int = 4 + hidden_act: str = "silu" + max_position_embeddings: int = 32768 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-6 + use_cache: bool = True + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + sliding_window: int | None = None + attention_dropout: float | int = 0.0 + decoder_sparse_step: int = 1 + moe_intermediate_size: int = 768 + num_experts_per_tok: int = 8 + num_experts: int = 128 + norm_topk_prob: bool = True + output_router_logits: bool = False + router_aux_loss_coef: float = 0.001 + mlp_only_layers: list[int] | None = None + pad_token_id: int | 
None = None + bos_token_id: int | None = None + eos_token_id: int | list[int] | None = None + + def __post_init__(self, **kwargs): + self.mlp_only_layers = [] if self.mlp_only_layers is None else self.mlp_only_layers + + super().__post_init__(**kwargs) @auto_docstring(checkpoint="Qwen/Qwen3-30B-A3B-Base") +@strict(accept_kwargs=True) class Qwen3OmniMoeThinkerConfig(Qwen2_5OmniThinkerConfig): r""" position_id_per_seconds (`int`, *optional*, defaults to 25): @@ -322,44 +256,15 @@ class Qwen3OmniMoeThinkerConfig(Qwen2_5OmniThinkerConfig): # Override parent's attribute_map as we use audio_token_id directly, not audio_token_index attribute_map = {} - def __init__( - self, - audio_config=None, - vision_config=None, - text_config=None, - audio_token_id=151646, - image_token_id=151655, - video_token_id=151656, - position_id_per_seconds=25, - audio_start_token_id=151647, - user_token_id=872, - initializer_range=0.02, - tie_word_embeddings=False, - **kwargs, - ): - super().__init__( - audio_config, - vision_config, - text_config, - None, - None, - None, - position_id_per_seconds, - None, - audio_start_token_id, - None, - user_token_id, - initializer_range, - **kwargs, - ) - del self.seconds_per_chunk - del self.audio_token_index - del self.image_token_index - del self.video_token_index - del self.audio_end_token_id - self.audio_token_id = audio_token_id - self.image_token_id = image_token_id - self.video_token_id = video_token_id + audio_token_id: int = 151646 + image_token_id: int = 151655 + video_token_id: int = 151656 + + seconds_per_chunk = AttributeError() + audio_token_index = AttributeError() + image_token_index = AttributeError() + video_token_index = AttributeError() + audio_end_token_id = AttributeError() class Qwen3OmniMoeTalkerCodePredictorConfig(Qwen3Config): @@ -371,133 +276,39 @@ class Qwen3OmniMoeTalkerCodePredictorConfig(Qwen3Config): Number of codebook groups used in the predicted acoustic token sequence, corresponding to multi-codebook VQ representation. 
""" - def __init__( - self, - vocab_size: int | None = 2048, - hidden_size: int | None = 1024, - intermediate_size: int | None = 3072, - num_hidden_layers: int | None = 5, - num_attention_heads: int | None = 16, - num_key_value_heads: int | None = 8, - head_dim: int | None = 128, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 32768, - initializer_range: float | None = 0.02, - rms_norm_eps: float | None = 0.000001, - use_cache: bool | None = True, - tie_word_embeddings: bool | None = False, - rope_parameters: int | None = None, - attention_bias: bool | None = False, - sliding_window: int | None = None, - max_window_layers: int | None = 28, - layer_types: list[str] | None = None, - attention_dropout: int | None = 0, - num_code_groups: int | None = 32, - pad_token_id: int | None = None, - bos_token_id: int | None = None, - eos_token_id: int | None = None, - **kwargs, - ): - self.num_code_groups = num_code_groups - super().__init__( - vocab_size, - hidden_size, - intermediate_size, - num_hidden_layers, - num_attention_heads, - num_key_value_heads, - head_dim, - hidden_act, - max_position_embeddings, - initializer_range, - rms_norm_eps, - use_cache, - tie_word_embeddings, - rope_parameters, - attention_bias, - False, - sliding_window, - None, - layer_types, - attention_dropout, - pad_token_id, - bos_token_id, - eos_token_id, - **kwargs, - ) - del self.use_sliding_window - self.sliding_window = sliding_window - self.max_window_layers = max_window_layers + vocab_size: int = 2048 + hidden_size: int = 1024 + intermediate_size: int = 3072 + num_hidden_layers: int = 5 + num_attention_heads: int = 16 + num_key_value_heads: int = 8 + sliding_window: int | None = None + num_code_groups: int = 32 + use_sliding_window = AttributeError() + + def __post_init__(self, **kwargs): + super().__post_init__(**kwargs) + self.sliding_window = self.sliding_window class Qwen3OmniMoeTalkerTextConfig(Qwen3MoeConfig): - def __init__( - self, - vocab_size: int | None = 3072, - hidden_size: int | None = 1024, - intermediate_size: int | None = 2048, - num_hidden_layers: int | None = 20, - num_attention_heads: int | None = 16, - num_key_value_heads: int | None = 2, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 32768, - initializer_range: float | None = 0.02, - rms_norm_eps: float | None = 0.000001, - use_cache: int | None = True, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = False, - sliding_window: int | None = None, - attention_dropout: int | None = 0, - decoder_sparse_step: int | None = 1, - moe_intermediate_size: int | None = 384, - num_experts_per_tok: int | None = 8, - num_experts: int | None = 128, - norm_topk_prob: bool | None = False, - output_router_logits: bool | None = False, - router_aux_loss_coef: float | None = 0.001, - mlp_only_layers: list[int] | None = None, - pad_token_id: int | None = None, - bos_token_id: int | None = None, - eos_token_id: int | None = None, - **kwargs, - ): - super().__init__( - vocab_size, - hidden_size, - intermediate_size, - num_hidden_layers, - num_attention_heads, - num_key_value_heads, - hidden_act, - max_position_embeddings, - initializer_range, - rms_norm_eps, - use_cache, - tie_word_embeddings, - rope_parameters, - attention_bias, - False, - sliding_window, - attention_dropout, - decoder_sparse_step, - moe_intermediate_size, - num_experts_per_tok, - num_experts, - norm_topk_prob, - output_router_logits, - 
router_aux_loss_coef, - mlp_only_layers, - pad_token_id, - bos_token_id, - eos_token_id, - **kwargs, - ) - del self.use_sliding_window - self.sliding_window = sliding_window + vocab_size: int = 3072 + hidden_size: int = 1024 + intermediate_size: int = 2048 + num_hidden_layers: int = 20 + num_attention_heads: int = 16 + num_key_value_heads: int = 2 + sliding_window: int | None = None + moe_intermediate_size: int = 384 + use_sliding_window = AttributeError() + + def __post_init__(self, **kwargs): + super().__post_init__(**kwargs) + self.sliding_window = self.sliding_window @auto_docstring(checkpoint="Qwen/Qwen3-30B-A3B-Base") +@strict(accept_kwargs=True) class Qwen3OmniMoeTalkerConfig(PreTrainedConfig): r""" code_predictor_config (`dict`, *optional*): @@ -553,66 +364,45 @@ class Qwen3OmniMoeTalkerConfig(PreTrainedConfig): "text_config": Qwen3OmniMoeTalkerTextConfig, } - def __init__( - self, - code_predictor_config=None, - text_config=None, - num_code_groups=32, - thinker_hidden_size=2048, - codec_eos_token_id=4198, - accept_hidden_layer=18, - codec_nothink_id=4203, - codec_think_bos_id=4204, - codec_think_eos_id=4205, - codec_pad_id=4196, - codec_bos_id=4197, - audio_token_id=151646, - image_token_id=151655, - video_token_id=151656, - vision_start_token_id=151652, - position_id_per_seconds=25, - audio_start_token_id=151669, - speaker_id=None, - **kwargs, - ): - if code_predictor_config is None: - code_predictor_config = {} + code_predictor_config: dict | PreTrainedConfig | None = None + text_config: dict | PreTrainedConfig | None = None + num_code_groups: int = 32 + thinker_hidden_size: int = 2048 + codec_eos_token_id: int = 4198 + accept_hidden_layer: int = 18 + codec_nothink_id: int = 4203 + codec_think_bos_id: int = 4204 + codec_think_eos_id: int = 4205 + codec_pad_id: int = 4196 + codec_bos_id: int = 4197 + audio_token_id: int = 151646 + image_token_id: int = 151655 + video_token_id: int = 151656 + vision_start_token_id: int = 151652 + position_id_per_seconds: int = 25 + audio_start_token_id: int = 151669 + speaker_id: dict | None = None + initializer_range: float = 0.02 + + def __post_init__(self, **kwargs): + if self.code_predictor_config is None: + self.code_predictor_config = {} self.code_predictor_config = Qwen3OmniMoeTalkerCodePredictorConfig() logger.info("code_predictor_config is None. Initializing code_predictor_config model with default values") - elif isinstance(code_predictor_config, Qwen3OmniMoeTalkerCodePredictorConfig): - self.code_predictor_config = code_predictor_config else: - self.code_predictor_config = Qwen3OmniMoeTalkerCodePredictorConfig(**code_predictor_config) + self.code_predictor_config = Qwen3OmniMoeTalkerCodePredictorConfig(**self.code_predictor_config) - if text_config is None: - text_config = {} + if self.text_config is None: + self.text_config = {} self.text_config = Qwen3OmniMoeTalkerTextConfig() logger.info("talker text_config is None. 
Initializing talker text model with default values") - elif isinstance(text_config, Qwen3OmniMoeTalkerTextConfig): - self.text_config = text_config else: - self.text_config = Qwen3OmniMoeTalkerTextConfig(**text_config) - self.num_code_groups = num_code_groups - self.thinker_hidden_size = thinker_hidden_size - self.codec_eos_token_id = codec_eos_token_id - self.accept_hidden_layer = accept_hidden_layer - self.codec_nothink_id = codec_nothink_id - self.codec_think_bos_id = codec_think_bos_id - self.codec_think_eos_id = codec_think_eos_id - self.codec_pad_id = codec_pad_id - self.codec_bos_id = codec_bos_id - self.audio_token_id = audio_token_id - self.image_token_id = image_token_id - self.video_token_id = video_token_id - self.position_id_per_seconds = position_id_per_seconds - self.audio_start_token_id = audio_start_token_id - self.vision_start_token_id = vision_start_token_id - self.speaker_id = speaker_id - self.initializer_range = self.text_config.initializer_range - super().__init__(**kwargs) + self.text_config = Qwen3OmniMoeTalkerTextConfig(**self.text_config) + super().__post_init__(**kwargs) @auto_docstring(checkpoint="Qwen/Qwen3-30B-A3B-Base") +@strict(accept_kwargs=True) class Qwen3OmniMoeCode2WavConfig(PreTrainedConfig): r""" num_quantizers (`int`, *optional*, defaults to 16): @@ -639,50 +429,25 @@ class Qwen3OmniMoeCode2WavConfig(PreTrainedConfig): >>> config = model.config ```""" - def __init__( - self, - codebook_size=2048, - hidden_size=1024, - max_position_embeddings=8000, - rope_parameters: RopeParameters | dict[RopeParameters] | None = None, - num_attention_heads=16, - num_key_value_heads=16, - attention_bias=False, - sliding_window=72, - intermediate_size=3072, - hidden_act="silu", - layer_scale_initial_scale=0.01, - rms_norm_eps=1e-5, - num_hidden_layers=8, - num_quantizers=16, - upsample_rates=(8, 5, 4, 3), - upsampling_ratios=(2, 2), - decoder_dim=1536, - attention_dropout=0.0, - initializer_range=0.02, - **kwargs, - ): - self.codebook_size = codebook_size - self.hidden_size = hidden_size - self.max_position_embeddings = max_position_embeddings - self.num_attention_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads - self.attention_bias = attention_bias - self.sliding_window = sliding_window - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.layer_scale_initial_scale = layer_scale_initial_scale - self.rms_norm_eps = rms_norm_eps - self.num_hidden_layers = num_hidden_layers - self.num_quantizers = num_quantizers - self.upsample_rates = upsample_rates - self.upsampling_ratios = upsampling_ratios - self.decoder_dim = decoder_dim - self.attention_dropout = attention_dropout - self.initializer_range = initializer_range - self.rope_parameters = rope_parameters - - super().__init__(**kwargs) + codebook_size: int = 2048 + hidden_size: int = 1024 + max_position_embeddings: int = 8000 + rope_parameters: RopeParameters | dict | None = None + num_attention_heads: int = 16 + num_key_value_heads: int = 16 + attention_bias: bool = False + sliding_window: int = 72 + intermediate_size: int = 3072 + hidden_act: str = "silu" + layer_scale_initial_scale: float = 0.01 + rms_norm_eps: float = 1e-5 + num_hidden_layers: int = 8 + num_quantizers: int = 16 + upsample_rates: list[int] | tuple[int, ...] = (8, 5, 4, 3) + upsampling_ratios: list[int] | tuple[int, ...] 
= (2, 2) + decoder_dim: int = 1536 + attention_dropout: float | int = 0.0 + initializer_range: float = 0.02 @property def layer_types(self): @@ -693,6 +458,7 @@ def layer_types(self): @auto_docstring(checkpoint="Qwen/Qwen3-30B-A3B-Base") +@strict(accept_kwargs=True) class Qwen3OmniMoeConfig(PreTrainedConfig): r""" thinker_config (`dict`, *optional*): Configuration of the underlying thinker sub-model. @@ -736,48 +502,39 @@ class Qwen3OmniMoeConfig(PreTrainedConfig): "code2wav_config": Qwen3OmniMoeCode2WavConfig, } - def __init__( - self, - thinker_config=None, - talker_config=None, - code2wav_config=None, - enable_audio_output=True, - im_start_token_id=151644, - im_end_token_id=151645, - tts_pad_token_id=151671, - tts_bos_token_id=151672, - tts_eos_token_id=151673, - system_token_id=8948, - user_token_id=872, - assistant_token_id=77091, - **kwargs, - ): - if thinker_config is None: - thinker_config = {} + thinker_config: dict | PreTrainedConfig | None = None + talker_config: dict | PreTrainedConfig | None = None + code2wav_config: dict | PreTrainedConfig | None = None + enable_audio_output: bool = True + im_start_token_id: int = 151644 + im_end_token_id: int = 151645 + tts_pad_token_id: int = 151671 + tts_bos_token_id: int = 151672 + tts_eos_token_id: int = 151673 + system_token_id: int = 8948 + user_token_id: int = 872 + assistant_token_id: int = 77091 + + def __post_init__(self, **kwargs): + if self.thinker_config is None: + self.thinker_config = Qwen3OmniMoeThinkerConfig() logger.info("thinker_config is None. Initializing thinker model with default values") + elif isinstance(self.thinker_config, dict): + self.thinker_config = Qwen3OmniMoeThinkerConfig(**self.thinker_config) - if talker_config is None: - talker_config = {} + if self.talker_config is None: + self.talker_config = Qwen3OmniMoeTalkerConfig() logger.info("talker_config is None. Initializing talker model with default values") + elif isinstance(self.talker_config, dict): + self.talker_config = Qwen3OmniMoeTalkerConfig(**self.talker_config) + + if self.code2wav_config is None: + self.code2wav_config = Qwen3OmniMoeCode2WavConfig() + logger.info("code2wav_config is None. Initializing code2wav_config model with default values") + elif isinstance(self.code2wav_config, dict): + self.code2wav_config = Qwen3OmniMoeCode2WavConfig(**self.code2wav_config) - if code2wav_config is None: - code2wav_config = {} - logger.info("code2wav_config is None. 
Initializing code2wav model with default values") - - self.thinker_config = Qwen3OmniMoeThinkerConfig(**thinker_config) - self.talker_config = Qwen3OmniMoeTalkerConfig(**talker_config) - self.code2wav_config = Qwen3OmniMoeCode2WavConfig(**code2wav_config) - self.initializer_range = self.thinker_config.initializer_range - self.enable_audio_output = enable_audio_output - self.im_start_token_id = im_start_token_id - self.im_end_token_id = im_end_token_id - self.tts_pad_token_id = tts_pad_token_id - self.tts_bos_token_id = tts_bos_token_id - self.tts_eos_token_id = tts_eos_token_id - self.system_token_id = system_token_id - self.user_token_id = user_token_id - self.assistant_token_id = assistant_token_id - super().__init__(**kwargs) + super().__post_init__(**kwargs) def get_text_config(self, decoder=False) -> "PreTrainedConfig": """ @@ -2482,7 +2239,7 @@ def generate( use_audio_in_video: bool = False, return_audio: bool | None = None, thinker_max_new_tokens: int = 1024, - thinker_eos_token_id: int = 151645, + thinker_eos_token_id: int | list[int] | None = 151645, talker_max_new_tokens: int = 4096, talker_do_sample: bool = True, talker_top_k: int = 50, diff --git a/src/transformers/models/qwen3_vl/configuration_qwen3_vl.py b/src/transformers/models/qwen3_vl/configuration_qwen3_vl.py index 69a411fe0783..d20d85ec7945 100644 --- a/src/transformers/models/qwen3_vl/configuration_qwen3_vl.py +++ b/src/transformers/models/qwen3_vl/configuration_qwen3_vl.py @@ -17,12 +17,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring @auto_docstring(checkpoint="Qwen/Qwen3-VL-4B-Instruct") +@strict(accept_kwargs=True) class Qwen3VLVisionConfig(PreTrainedConfig): r""" num_position_embeddings (`int`, *optional*, defaults to 2304): @@ -36,41 +39,23 @@ class Qwen3VLVisionConfig(PreTrainedConfig): model_type = "qwen3_vl" base_config_key = "vision_config" - def __init__( - self, - depth=27, - hidden_size=1152, - hidden_act="gelu_pytorch_tanh", - intermediate_size=4304, - num_heads=16, - in_channels=3, - patch_size=16, - spatial_merge_size=2, - temporal_patch_size=2, - out_hidden_size=3584, - num_position_embeddings=2304, - deepstack_visual_indexes=[8, 16, 24], - initializer_range=0.02, - **kwargs, - ): - super().__init__(**kwargs) - - self.depth = depth - self.hidden_size = hidden_size - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.num_heads = num_heads - self.in_channels = in_channels - self.patch_size = patch_size - self.spatial_merge_size = spatial_merge_size - self.temporal_patch_size = temporal_patch_size - self.out_hidden_size = out_hidden_size - self.num_position_embeddings = num_position_embeddings - self.initializer_range = initializer_range - self.deepstack_visual_indexes = deepstack_visual_indexes + depth: int = 27 + hidden_size: int = 1152 + hidden_act: str = "gelu_pytorch_tanh" + intermediate_size: int = 4304 + num_heads: int = 16 + in_channels: int = 3 + patch_size: int | list[int] | tuple[int, int] = 16 + spatial_merge_size: int = 2 + temporal_patch_size: int | list[int] | tuple[int, int] = 2 + out_hidden_size: int = 3584 + num_position_embeddings: int = 2304 + deepstack_visual_indexes: list[int] | tuple[int, ...] 
= (8, 16, 24) + initializer_range: float = 0.02 @auto_docstring(checkpoint="Qwen/Qwen3-VL-4B-Instruct") +@strict(accept_kwargs=True) class Qwen3VLTextConfig(PreTrainedConfig): r""" Example: @@ -91,56 +76,34 @@ class Qwen3VLTextConfig(PreTrainedConfig): model_type = "qwen3_vl_text" base_config_key = "text_config" default_theta = 500000.0 - - def __init__( - self, - vocab_size: int | None = 151936, - hidden_size: int | None = 4096, - intermediate_size: int | None = 22016, - num_hidden_layers: int | None = 32, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = 32, - head_dim: int | None = 128, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 128000, - initializer_range: float | None = 0.02, - rms_norm_eps: float | None = 1e-6, - use_cache: bool | None = True, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - pad_token_id: int | None = None, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.head_dim = head_dim - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.rope_parameters = rope_parameters - self.pad_token_id = pad_token_id - - super().__init__( - ignore_keys_at_rope_validation={"mrope_section", "mrope_interleaved"}, - **kwargs, - ) + ignore_keys_at_rope_validation = {"mrope_section", "mrope_interleaved"} + + vocab_size: int = 151936 + hidden_size: int = 4096 + intermediate_size: int = 22016 + num_hidden_layers: int = 32 + num_attention_heads: int = 32 + num_key_value_heads: int | None = 32 + head_dim: int = 128 + hidden_act: str = "silu" + max_position_embeddings: int = 128000 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-6 + use_cache: bool = True + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + attention_dropout: float | int = 0.0 + pad_token_id: int | None = None + + def __post_init__(self, **kwargs): + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads + + super().__post_init__(**kwargs) @auto_docstring(checkpoint="Qwen/Qwen3-VL-4B-Instruct") +@strict(accept_kwargs=True) class Qwen3VLConfig(PreTrainedConfig): r""" Example: @@ -162,33 +125,26 @@ class Qwen3VLConfig(PreTrainedConfig): sub_configs = {"vision_config": Qwen3VLVisionConfig, "text_config": Qwen3VLTextConfig} keys_to_ignore_at_inference = ["past_key_values"] - def __init__( - self, - text_config=None, - vision_config=None, - image_token_id=151655, - video_token_id=151656, - vision_start_token_id=151652, - vision_end_token_id=151653, - tie_word_embeddings=False, - **kwargs, - ): - if isinstance(vision_config, dict): - self.vision_config = self.sub_configs["vision_config"](**vision_config) - elif vision_config is None: + text_config: dict | PreTrainedConfig | None = None + vision_config: dict | PreTrainedConfig | None = None + image_token_id: int = 151655 + video_token_id: int = 151656 + vision_start_token_id: int 
= 151652 + vision_end_token_id: int = 151653 + tie_word_embeddings: bool = False + + def __post_init__(self, **kwargs): + if isinstance(self.vision_config, dict): + self.vision_config = self.sub_configs["vision_config"](**self.vision_config) + elif self.vision_config is None: self.vision_config = self.sub_configs["vision_config"]() - if isinstance(text_config, dict): - self.text_config = self.sub_configs["text_config"](**text_config) - elif text_config is None: + if isinstance(self.text_config, dict): + self.text_config = self.sub_configs["text_config"](**self.text_config) + elif self.text_config is None: self.text_config = self.sub_configs["text_config"]() - self.image_token_id = image_token_id - self.video_token_id = video_token_id - self.vision_start_token_id = vision_start_token_id - self.vision_end_token_id = vision_end_token_id - self.tie_word_embeddings = tie_word_embeddings - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["Qwen3VLConfig", "Qwen3VLTextConfig"] diff --git a/src/transformers/models/qwen3_vl/modular_qwen3_vl.py b/src/transformers/models/qwen3_vl/modular_qwen3_vl.py index 398b9ac24a1f..3d6aefae3d8a 100644 --- a/src/transformers/models/qwen3_vl/modular_qwen3_vl.py +++ b/src/transformers/models/qwen3_vl/modular_qwen3_vl.py @@ -21,6 +21,7 @@ import torch import torch.nn as nn import torch.nn.functional as F +from huggingface_hub.dataclasses import strict from ... import initialization as init from ...activations import ACT2FN @@ -79,6 +80,7 @@ class BaseModelOutputWithDeepstackFeatures(BaseModelOutputWithPooling): @auto_docstring(checkpoint="Qwen/Qwen3-VL-4B-Instruct") +@strict(accept_kwargs=True) class Qwen3VLVisionConfig(PreTrainedConfig): r""" num_position_embeddings (`int`, *optional*, defaults to 2304): @@ -92,41 +94,23 @@ class Qwen3VLVisionConfig(PreTrainedConfig): model_type = "qwen3_vl" base_config_key = "vision_config" - def __init__( - self, - depth=27, - hidden_size=1152, - hidden_act="gelu_pytorch_tanh", - intermediate_size=4304, - num_heads=16, - in_channels=3, - patch_size=16, - spatial_merge_size=2, - temporal_patch_size=2, - out_hidden_size=3584, - num_position_embeddings=2304, - deepstack_visual_indexes=[8, 16, 24], - initializer_range=0.02, - **kwargs, - ): - super().__init__(**kwargs) - - self.depth = depth - self.hidden_size = hidden_size - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.num_heads = num_heads - self.in_channels = in_channels - self.patch_size = patch_size - self.spatial_merge_size = spatial_merge_size - self.temporal_patch_size = temporal_patch_size - self.out_hidden_size = out_hidden_size - self.num_position_embeddings = num_position_embeddings - self.initializer_range = initializer_range - self.deepstack_visual_indexes = deepstack_visual_indexes + depth: int = 27 + hidden_size: int = 1152 + hidden_act: str = "gelu_pytorch_tanh" + intermediate_size: int = 4304 + num_heads: int = 16 + in_channels: int = 3 + patch_size: int | list[int] | tuple[int, int] = 16 + spatial_merge_size: int = 2 + temporal_patch_size: int | list[int] | tuple[int, int] = 2 + out_hidden_size: int = 3584 + num_position_embeddings: int = 2304 + deepstack_visual_indexes: list[int] | tuple[int, ...] 
= (8, 16, 24) + initializer_range: float = 0.02 @auto_docstring(checkpoint="Qwen/Qwen3-VL-4B-Instruct") +@strict(accept_kwargs=True) class Qwen3VLTextConfig(PreTrainedConfig): r""" Example: @@ -147,56 +131,34 @@ class Qwen3VLTextConfig(PreTrainedConfig): model_type = "qwen3_vl_text" base_config_key = "text_config" default_theta = 500000.0 - - def __init__( - self, - vocab_size: int | None = 151936, - hidden_size: int | None = 4096, - intermediate_size: int | None = 22016, - num_hidden_layers: int | None = 32, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = 32, - head_dim: int | None = 128, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 128000, - initializer_range: float | None = 0.02, - rms_norm_eps: float | None = 1e-6, - use_cache: bool | None = True, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - pad_token_id: int | None = None, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.head_dim = head_dim - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.rope_parameters = rope_parameters - self.pad_token_id = pad_token_id - - super().__init__( - ignore_keys_at_rope_validation={"mrope_section", "mrope_interleaved"}, - **kwargs, - ) + ignore_keys_at_rope_validation = {"mrope_section", "mrope_interleaved"} + + vocab_size: int = 151936 + hidden_size: int = 4096 + intermediate_size: int = 22016 + num_hidden_layers: int = 32 + num_attention_heads: int = 32 + num_key_value_heads: int | None = 32 + head_dim: int = 128 + hidden_act: str = "silu" + max_position_embeddings: int = 128000 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-6 + use_cache: bool = True + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + attention_dropout: float | int = 0.0 + pad_token_id: int | None = None + + def __post_init__(self, **kwargs): + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads + + super().__post_init__(**kwargs) @auto_docstring(checkpoint="Qwen/Qwen3-VL-4B-Instruct") +@strict(accept_kwargs=True) class Qwen3VLConfig(PreTrainedConfig): r""" Example: @@ -218,33 +180,26 @@ class Qwen3VLConfig(PreTrainedConfig): sub_configs = {"vision_config": Qwen3VLVisionConfig, "text_config": Qwen3VLTextConfig} keys_to_ignore_at_inference = ["past_key_values"] - def __init__( - self, - text_config=None, - vision_config=None, - image_token_id=151655, - video_token_id=151656, - vision_start_token_id=151652, - vision_end_token_id=151653, - tie_word_embeddings=False, - **kwargs, - ): - if isinstance(vision_config, dict): - self.vision_config = self.sub_configs["vision_config"](**vision_config) - elif vision_config is None: + text_config: dict | PreTrainedConfig | None = None + vision_config: dict | PreTrainedConfig | None = None + image_token_id: int = 151655 + video_token_id: int = 151656 + vision_start_token_id: 
int = 151652 + vision_end_token_id: int = 151653 + tie_word_embeddings: bool = False + + def __post_init__(self, **kwargs): + if isinstance(self.vision_config, dict): + self.vision_config = self.sub_configs["vision_config"](**self.vision_config) + elif self.vision_config is None: self.vision_config = self.sub_configs["vision_config"]() - if isinstance(text_config, dict): - self.text_config = self.sub_configs["text_config"](**text_config) - elif text_config is None: + if isinstance(self.text_config, dict): + self.text_config = self.sub_configs["text_config"](**self.text_config) + elif self.text_config is None: self.text_config = self.sub_configs["text_config"]() - self.image_token_id = image_token_id - self.video_token_id = video_token_id - self.vision_start_token_id = vision_start_token_id - self.vision_end_token_id = vision_end_token_id - self.tie_word_embeddings = tie_word_embeddings - super().__init__(**kwargs) + super().__post_init__(**kwargs) class Qwen3VLVisionMLP(nn.Module): diff --git a/src/transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py b/src/transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py index 89da9eb48f63..7ce9be35e7e2 100644 --- a/src/transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +++ b/src/transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py @@ -17,12 +17,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring @auto_docstring(checkpoint="Qwen/Qwen3-VL-30B-A3B-Instruct") +@strict(accept_kwargs=True) class Qwen3VLMoeTextConfig(PreTrainedConfig): r""" decoder_sparse_step (`int`, *optional*, defaults to 1): @@ -46,9 +49,11 @@ class Qwen3VLMoeTextConfig(PreTrainedConfig): ```""" model_type = "qwen3_vl_moe_text" - base_config_key = "text_config" keys_to_ignore_at_inference = ["past_key_values"] - default_theta = 500000.0 + + attribute_map = { + "num_experts": "num_local_experts", + } # Default tensor parallel plan for base model `Qwen3VLMoe` base_model_tp_plan = { "layers.*.self_attn.q_proj": "colwise", @@ -65,67 +70,48 @@ class Qwen3VLMoeTextConfig(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size: int | None = 151936, - hidden_size: int | None = 2048, - intermediate_size: int | None = 5632, - num_hidden_layers: int | None = 24, - num_attention_heads: int | None = 16, - num_key_value_heads: int | None = 16, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 128000, - initializer_range: float | None = 0.02, - rms_norm_eps: float | None = 1e-6, - use_cache: bool | None = True, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - decoder_sparse_step: int | None = 1, - moe_intermediate_size: int | None = 1408, - num_experts_per_tok: int | None = 4, - num_experts: int | None = 60, - mlp_only_layers: list[int] | None = None, - rope_parameters: RopeParameters | None = None, - head_dim: int | None = None, - pad_token_id: int | None = None, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - # 
for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.head_dim = head_dim or hidden_size // num_attention_heads - self.rope_parameters = rope_parameters - - # MoE arguments - self.decoder_sparse_step = decoder_sparse_step - self.moe_intermediate_size = moe_intermediate_size - self.num_experts_per_tok = num_experts_per_tok - self.num_experts = num_experts - self.mlp_only_layers = [] if mlp_only_layers is None else mlp_only_layers - self.pad_token_id = pad_token_id - - super().__init__( - ignore_keys_at_rope_validation={"mrope_section", "mrope_interleaved"}, - **kwargs, - ) + vocab_size: int = 151936 + hidden_size: int = 2048 + + intermediate_size: int = 5632 + num_hidden_layers: int = 24 + num_attention_heads: int = 16 + num_key_value_heads: int = 16 + hidden_act: str = "silu" + max_position_embeddings: int = 128000 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-6 + use_cache: bool = True + tie_word_embeddings: bool = True + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + attention_dropout: float | int = 0.0 + decoder_sparse_step: int = 1 + moe_intermediate_size: int = 1408 + num_experts_per_tok: int = 4 + num_experts: int = 60 + router_aux_loss_coef: float = 0.001 + mlp_only_layers: list[int] | None = None + pad_token_id: int | None = None + bos_token_id: int | None = None + eos_token_id: int | list[int] | None = None + base_config_key = "text_config" + default_theta = 500000.0 + ignore_keys_at_rope_validation = {"mrope_section", "mrope_interleaved"} + head_dim: int | None = None + + def __post_init__(self, **kwargs): + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads + + self.head_dim = self.head_dim or self.hidden_size // self.num_attention_heads + self.sliding_window = None + self.mlp_only_layers = [] if self.mlp_only_layers is None else self.mlp_only_layers + super().__post_init__(**kwargs) @auto_docstring(checkpoint="Qwen/Qwen3-VL-30B-A3B-Instruct") +@strict(accept_kwargs=True) class Qwen3VLMoeVisionConfig(PreTrainedConfig): r""" num_position_embeddings (`int`, *optional*, defaults to 2304): @@ -139,41 +125,23 @@ class Qwen3VLMoeVisionConfig(PreTrainedConfig): model_type = "qwen3_vl_moe" base_config_key = "vision_config" - def __init__( - self, - depth=27, - hidden_size=1152, - hidden_act="gelu_pytorch_tanh", - intermediate_size=4304, - num_heads=16, - in_channels=3, - patch_size=16, - spatial_merge_size=2, - temporal_patch_size=2, - out_hidden_size=3584, - num_position_embeddings=2304, - deepstack_visual_indexes=[8, 16, 24], - initializer_range=0.02, - **kwargs, - ): - super().__init__(**kwargs) - - self.depth = depth - self.hidden_size = hidden_size - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.num_heads = num_heads - self.in_channels = in_channels - self.patch_size = patch_size - self.spatial_merge_size = spatial_merge_size - self.temporal_patch_size = temporal_patch_size - self.out_hidden_size = out_hidden_size - self.num_position_embeddings = num_position_embeddings - self.initializer_range = initializer_range - self.deepstack_visual_indexes = deepstack_visual_indexes + depth: int = 27 + hidden_size: int = 1152 + hidden_act: 
str = "gelu_pytorch_tanh" + intermediate_size: int = 4304 + num_heads: int = 16 + in_channels: int = 3 + patch_size: int | list[int] | tuple[int, int] = 16 + spatial_merge_size: int = 2 + temporal_patch_size: int | list[int] | tuple[int, int] = 2 + out_hidden_size: int = 3584 + num_position_embeddings: int = 2304 + deepstack_visual_indexes: list[int] | tuple[int, ...] = (8, 16, 24) + initializer_range: float = 0.02 @auto_docstring(checkpoint="Qwen/Qwen3-VL-30B-A3B-Instruct") +@strict(accept_kwargs=True) class Qwen3VLMoeConfig(PreTrainedConfig): r""" Example: @@ -195,33 +163,26 @@ class Qwen3VLMoeConfig(PreTrainedConfig): sub_configs = {"vision_config": Qwen3VLMoeVisionConfig, "text_config": Qwen3VLMoeTextConfig} keys_to_ignore_at_inference = ["past_key_values"] - def __init__( - self, - text_config=None, - vision_config=None, - image_token_id=151655, - video_token_id=151656, - vision_start_token_id=151652, - vision_end_token_id=151653, - tie_word_embeddings=False, - **kwargs, - ): - if isinstance(vision_config, dict): - self.vision_config = self.sub_configs["vision_config"](**vision_config) - elif vision_config is None: + text_config: dict | PreTrainedConfig | None = None + vision_config: dict | PreTrainedConfig | None = None + image_token_id: int = 151655 + video_token_id: int = 151656 + vision_start_token_id: int = 151652 + vision_end_token_id: int = 151653 + tie_word_embeddings: bool = False + + def __post_init__(self, **kwargs): + if isinstance(self.vision_config, dict): + self.vision_config = self.sub_configs["vision_config"](**self.vision_config) + elif self.vision_config is None: self.vision_config = self.sub_configs["vision_config"]() - if isinstance(text_config, dict): - self.text_config = self.sub_configs["text_config"](**text_config) - elif text_config is None: + if isinstance(self.text_config, dict): + self.text_config = self.sub_configs["text_config"](**self.text_config) + elif self.text_config is None: self.text_config = self.sub_configs["text_config"]() - self.image_token_id = image_token_id - self.video_token_id = video_token_id - self.vision_start_token_id = vision_start_token_id - self.vision_end_token_id = vision_end_token_id - self.tie_word_embeddings = tie_word_embeddings - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["Qwen3VLMoeConfig", "Qwen3VLMoeTextConfig"] diff --git a/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py b/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py index b1219517eb18..f0f862391b0f 100644 --- a/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +++ b/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py @@ -16,18 +16,18 @@ import torch import torch.nn as nn import torch.nn.functional as F +from huggingface_hub.dataclasses import strict from ... 
import initialization as init from ...cache_utils import Cache, DynamicCache -from ...configuration_utils import PreTrainedConfig from ...masking_utils import create_causal_mask from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_outputs import MoeModelOutputWithPast -from ...modeling_rope_utils import RopeParameters from ...modeling_utils import PreTrainedModel from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging from ...utils.output_capturing import OutputRecorder +from ..qwen3_moe.configuration_qwen3_moe import Qwen3MoeConfig from ..qwen3_moe.modeling_qwen3_moe import ( Qwen3MoeDecoderLayer, Qwen3MoeExperts, @@ -54,7 +54,8 @@ @auto_docstring(checkpoint="Qwen/Qwen3-VL-30B-A3B-Instruct") -class Qwen3VLMoeTextConfig(PreTrainedConfig): +@strict(accept_kwargs=True) +class Qwen3VLMoeTextConfig(Qwen3MoeConfig): r""" decoder_sparse_step (`int`, *optional*, defaults to 1): The frequency of the MoE layer. @@ -95,73 +96,41 @@ class Qwen3VLMoeTextConfig(PreTrainedConfig): "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), "norm": (["hidden_states"], ["hidden_states"]), } + ignore_keys_at_rope_validation = {"mrope_section", "mrope_interleaved"} - def __init__( - self, - vocab_size: int | None = 151936, - hidden_size: int | None = 2048, - intermediate_size: int | None = 5632, - num_hidden_layers: int | None = 24, - num_attention_heads: int | None = 16, - num_key_value_heads: int | None = 16, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 128000, - initializer_range: float | None = 0.02, - rms_norm_eps: float | None = 1e-6, - use_cache: bool | None = True, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - decoder_sparse_step: int | None = 1, - moe_intermediate_size: int | None = 1408, - num_experts_per_tok: int | None = 4, - num_experts: int | None = 60, - mlp_only_layers: list[int] | None = None, - rope_parameters: RopeParameters | None = None, - head_dim: int | None = None, - pad_token_id: int | None = None, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.head_dim = head_dim or hidden_size // num_attention_heads - self.rope_parameters = rope_parameters - - # MoE arguments - self.decoder_sparse_step = decoder_sparse_step - self.moe_intermediate_size = moe_intermediate_size - self.num_experts_per_tok = num_experts_per_tok - self.num_experts = num_experts - self.mlp_only_layers = [] if mlp_only_layers is None else mlp_only_layers - self.pad_token_id = pad_token_id - - super().__init__( - ignore_keys_at_rope_validation={"mrope_section", "mrope_interleaved"}, - **kwargs, - ) + intermediate_size: int = 5632 + num_hidden_layers: int = 24 + num_attention_heads: int = 16 + num_key_value_heads: int = 16 + max_position_embeddings: int = 128000 + moe_intermediate_size: int = 1408 + num_experts_per_tok: int = 4 + num_experts: 
int = 60
+    head_dim: int | None = None
+    tie_word_embeddings: bool = True
+
+    norm_topk_prob = AttributeError()
+    output_router_logits = AttributeError()
+    use_sliding_window = AttributeError()
+    sliding_window = AttributeError()
+
+    def __post_init__(self, **kwargs):
+        if self.num_key_value_heads is None:
+            self.num_key_value_heads = self.num_attention_heads
+
+        self.head_dim = self.head_dim or self.hidden_size // self.num_attention_heads
+        super().__post_init__(**kwargs)
+        self.sliding_window = None
 
 
 @auto_docstring(checkpoint="Qwen/Qwen3-VL-30B-A3B-Instruct")
+@strict(accept_kwargs=True)
 class Qwen3VLMoeVisionConfig(Qwen3VLVisionConfig):
     pass
 
 
 @auto_docstring(checkpoint="Qwen/Qwen3-VL-30B-A3B-Instruct")
+@strict(accept_kwargs=True)
 class Qwen3VLMoeConfig(Qwen3VLConfig):
     r"""
     Example:
@@ -179,8 +148,7 @@ class Qwen3VLMoeConfig(Qwen3VLConfig):
     >>> configuration = model.config
     ```"""
 
-    model_type = "qwen3_vl_moe"
-    sub_configs = {"vision_config": Qwen3VLMoeVisionConfig, "text_config": Qwen3VLMoeTextConfig}
+    pass
 
 
 class Qwen3VLMoeTextRMSNorm(Qwen3MoeRMSNorm):
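In the converted `Qwen3VLMoeTextConfig` above, `head_dim` is now an ordinary optional field that `__post_init__` resolves from `hidden_size` and `num_attention_heads` when it is left unset. A small usage sketch (illustrative only; the concrete numbers come from the defaults declared in the configuration file):

    cfg = Qwen3VLMoeTextConfig()  # head_dim stays None until __post_init__ runs
    assert cfg.head_dim == cfg.hidden_size // cfg.num_attention_heads  # e.g. 2048 // 16 == 128
    assert Qwen3VLMoeTextConfig(head_dim=96).head_dim == 96  # an explicit value is kept as-is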
diff --git a/src/transformers/models/rag/configuration_rag.py b/src/transformers/models/rag/configuration_rag.py
index f9908b3200f9..6856292e3e61 100644
--- a/src/transformers/models/rag/configuration_rag.py
+++ b/src/transformers/models/rag/configuration_rag.py
@@ -13,11 +13,15 @@
 # limitations under the License.
 """RAG model configuration"""
 
+from huggingface_hub.dataclasses import strict
+
 from ...configuration_utils import PreTrainedConfig
 from ...utils import auto_docstring
+from ..auto.configuration_auto import AutoConfig
 
 
 @auto_docstring(checkpoint="")
+@strict(accept_kwargs=True)
 class RagConfig(PreTrainedConfig):
     r"""
     title_sep (`str`, *optional*, defaults to `" / "`):
@@ -74,88 +78,50 @@ class RagConfig(PreTrainedConfig):
     model_type = "rag"
     has_no_defaults_at_init = True
 
-    def __init__(
-        self,
-        vocab_size=None,
-        is_encoder_decoder=True,
-        prefix=None,
-        bos_token_id=None,
-        pad_token_id=None,
-        eos_token_id=None,
-        decoder_start_token_id=None,
-        title_sep=" / ",
-        doc_sep=" // ",
-        n_docs=5,
-        max_combined_length=300,
-        retrieval_vector_size=768,
-        retrieval_batch_size=8,
-        dataset="wiki_dpr",
-        dataset_split="train",
-        index_name="compressed",
-        index_path=None,
-        passages_path=None,
-        use_dummy_dataset=False,
-        reduce_loss=False,
-        label_smoothing=0.0,
-        do_deduplication=True,
-        exclude_bos_score=False,
-        do_marginalize=False,
-        output_retrieved=False,
-        use_cache=True,
-        dataset_revision=None,
-        **kwargs,
-    ):
-        self.bos_token_id = bos_token_id
-        self.pad_token_id = pad_token_id
-        self.eos_token_id = eos_token_id
-        self.decoder_start_token_id = decoder_start_token_id
-        self.prefix = prefix
-        self.vocab_size = vocab_size
-        super().__init__(
-            is_encoder_decoder=is_encoder_decoder,
-            **kwargs,
-        )
+    vocab_size: int | None = None
+    is_encoder_decoder: bool = True
+    prefix: str | None = None
+    bos_token_id: int | None = None
+    pad_token_id: int | None = None
+    eos_token_id: int | list[int] | None = None
+    decoder_start_token_id: int | None = None
+    title_sep: str = " / "
+    doc_sep: str = " // "
+    n_docs: int = 5
+    max_combined_length: int = 300
+    retrieval_vector_size: int = 768
+    retrieval_batch_size: int = 8
+    dataset: str = "wiki_dpr"
+    dataset_split: str = "train"
+    index_name: str = "compressed"
+    index_path: str | None = None
+    passages_path: str | None = None
+    use_dummy_dataset: bool = False
+    reduce_loss: bool = False
+    label_smoothing: float = 0.0
+    do_deduplication: bool = True
+    exclude_bos_score: bool = False
+    do_marginalize: bool = False
+    output_retrieved: bool = False
+    use_cache: bool = True
+    dataset_revision: str | None = None
+
+    def __post_init__(self, **kwargs):
         if "question_encoder" not in kwargs or "generator" not in kwargs:
             raise ValueError(
-                f"A configuration of type {self.model_type} cannot be instantiated because "
-                f"both `question_encoder` and `generator` sub-configurations were not passed, only {kwargs}"
+                f"A configuration of type {self.model_type} cannot be instantiated because both `question_encoder` and"
+                f" `generator` sub-configurations must be passed, but only {kwargs} was given"
             )
+
         question_encoder_config = kwargs.pop("question_encoder")
         question_encoder_model_type = question_encoder_config.pop("model_type")
         decoder_config = kwargs.pop("generator")
         decoder_model_type = decoder_config.pop("model_type")
 
-        from ..auto.configuration_auto import AutoConfig
-
         self.question_encoder = AutoConfig.for_model(question_encoder_model_type, **question_encoder_config)
         self.generator = AutoConfig.for_model(decoder_model_type, **decoder_config)
 
-        self.reduce_loss = reduce_loss
-        self.label_smoothing = label_smoothing
-        self.exclude_bos_score = exclude_bos_score
-        self.do_marginalize = do_marginalize
-
-        self.title_sep = title_sep
-        self.doc_sep = doc_sep
-        self.n_docs = n_docs
-        self.max_combined_length = max_combined_length
-
-        self.dataset = dataset
-        self.dataset_split = dataset_split
-        self.index_name = index_name
-
-        self.retrieval_vector_size = retrieval_vector_size
-        self.retrieval_batch_size = retrieval_batch_size
-        self.passages_path = passages_path
-        self.index_path = index_path
-        self.use_dummy_dataset = use_dummy_dataset
-        self.dataset_revision = dataset_revision
-
-        self.output_retrieved = output_retrieved
-
-        self.do_deduplication = do_deduplication
-
-        self.use_cache = use_cache
+        super().__post_init__(**kwargs)
 
     @classmethod
     def from_question_encoder_generator_configs(
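As the `__post_init__` above shows, `RagConfig` still expects both sub-configurations to arrive as extra keyword arguments and materializes them through `AutoConfig.for_model`. A usage sketch (the sub-config contents are illustrative, not taken from this diff):

    config = RagConfig(
        question_encoder={"model_type": "dpr"},
        generator={"model_type": "bart"},
        n_docs=5,
    )
    assert config.question_encoder.model_type == "dpr"
    assert config.generator.model_type == "bart"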
"""RecurrentGemma model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters -from ...utils import auto_docstring, logging - - -logger = logging.get_logger(__name__) +from ...utils import auto_docstring @auto_docstring(checkpoint="google/recurrentgemma-2b") +@strict(accept_kwargs=True) class RecurrentGemmaConfig(PreTrainedConfig): r""" lru_width (`int` or `None`, *optional*): @@ -54,60 +54,44 @@ class RecurrentGemmaConfig(PreTrainedConfig): model_type = "recurrent_gemma" - def __init__( - self, - num_hidden_layers: int | None = 26, - vocab_size: int | None = 256000, - hidden_size: int | None = 2560, - intermediate_size: int | None = 3 * 2560, - num_attention_heads: int | None = 10, - lru_width: int | None = None, - attention_window_size: int | None = 2048, - conv1d_width: int | None = 4, - logits_soft_cap: float | None = 30.0, - rms_norm_eps: int | None = 1e-6, - use_cache: bool | None = True, - pad_token_id: int | None = 0, - eos_token_id: int | None = 1, - bos_token_id: int | None = 2, - hidden_activation: str | None = "gelu_pytorch_tanh", - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - block_types: list[str] | None = ("recurrent", "recurrent", "attention"), - attention_dropout: float | None = 0.0, - num_key_value_heads: int | None = None, - attention_bias: str | None = False, - w_init_variance_scale: float | None = 0.01, - tie_word_embeddings: bool | None = True, - **kwargs, - ): - self.num_hidden_layers = num_hidden_layers - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_attention_heads = num_attention_heads - self.lru_width = lru_width if lru_width is not None else hidden_size - self.attention_window_size = attention_window_size - self.conv1d_width = conv1d_width - self.logits_soft_cap = logits_soft_cap - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.block_types = list(block_types) - self.hidden_activation = hidden_activation + num_hidden_layers: int = 26 + vocab_size: int = 256000 + hidden_size: int = 2560 + intermediate_size: int = 3 * 2560 + num_attention_heads: int = 10 + lru_width: int | None = None + attention_window_size: int = 2048 + conv1d_width: int = 4 + logits_soft_cap: float = 30.0 + rms_norm_eps: float = 1e-6 + use_cache: bool = True + pad_token_id: int | None = 0 + eos_token_id: int | list[int] | None = 1 + bos_token_id: int | None = 2 + hidden_activation: str = "gelu_pytorch_tanh" + rope_parameters: RopeParameters | dict | None = None + block_types: list[str] | tuple[str, ...] 
+    attention_dropout: float | int = 0.0
+    num_key_value_heads: int | None = None
+    attention_bias: bool = False
+    w_init_variance_scale: float = 0.01
+    tie_word_embeddings: bool = True
+
+    def __post_init__(self, **kwargs):
+        self.lru_width = self.lru_width if self.lru_width is not None else self.hidden_size
+        self.block_types = list(self.block_types)
         self.head_dim = self.hidden_size // self.num_attention_heads
-        self.num_key_value_heads = num_key_value_heads if num_key_value_heads is not None else num_attention_heads
-        if self.num_key_value_heads > self.num_attention_heads:
-            raise ValueError("The number of `num_key_value_heads` must be smaller than `num_attention_heads`")
-        self.attention_dropout = attention_dropout
-        self.attention_bias = attention_bias
-        self.w_init_variance_scale = w_init_variance_scale
+        self.num_key_value_heads = (
+            self.num_key_value_heads if self.num_key_value_heads is not None else self.num_attention_heads
+        )
         self.final_w_init_variance_scale = 2.0 / self.num_hidden_layers
-        self.pad_token_id = pad_token_id
-        self.bos_token_id = bos_token_id
-        self.eos_token_id = eos_token_id
-        self.tie_word_embeddings = tie_word_embeddings
-        self.rope_parameters = rope_parameters
         kwargs.setdefault("partial_rotary_factor", 0.5)  # assign default for BC
-        super().__init__(**kwargs)
+        super().__post_init__(**kwargs)
+
+    def validate_architecture(self):
+        """Part of `@strict`-powered validation. Validates the architecture of the config."""
+        if self.num_key_value_heads > self.num_attention_heads:
+            raise ValueError("The number of `num_key_value_heads` must be smaller than `num_attention_heads`")
 
     @property
     def layers_block_type(self):
diff --git a/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py b/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py
index c1840b3dec82..0bf747c3405f 100644
--- a/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py
+++ b/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py
@@ -658,7 +658,7 @@ def forward(
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
         use_cache = use_cache if use_cache is not None else self.config.use_cache
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
 
         if (input_ids is None) ^ (inputs_embeds is not None):
             raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
@@ -792,7 +792,7 @@ def forward(
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
         output_hidden_states = True
         outputs = self.model(
             input_ids=input_ids,
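`validate_architecture` replaces the inline check that used to sit in `__init__`. Assuming the `@strict` machinery invokes the hook when the config is built, an invalid head layout should now fail at construction time (values below are illustrative):

    RecurrentGemmaConfig(num_attention_heads=10, num_key_value_heads=5)   # passes validation
    RecurrentGemmaConfig(num_attention_heads=10, num_key_value_heads=16)  # expected to raise ValueError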
"""Reformer model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="google/reformer-crime-and-punishment") +@strict(accept_kwargs=True) class ReformerConfig(PreTrainedConfig): r""" attention_head_size (`int`, *optional*, defaults to 64): @@ -113,80 +113,46 @@ class ReformerConfig(PreTrainedConfig): keys_to_ignore_at_inference = ["past_buckets_states"] attribute_map = {} - def __init__( - self, - attention_head_size=64, - attn_layers=["local", "lsh", "local", "lsh", "local", "lsh"], - axial_norm_std=1.0, - axial_pos_embds=True, - axial_pos_shape=[64, 64], - axial_pos_embds_dim=[64, 192], - chunk_size_lm_head=0, - eos_token_id=2, - feed_forward_size=512, - hash_seed=None, - hidden_act="relu", - hidden_dropout_prob=0.05, - hidden_size=256, - initializer_range=0.02, - is_decoder=False, - layer_norm_eps=1e-12, - local_num_chunks_before=1, - local_num_chunks_after=0, - local_attention_probs_dropout_prob=0.05, - local_attn_chunk_length=64, - lsh_attn_chunk_length=64, - lsh_attention_probs_dropout_prob=0.0, - lsh_num_chunks_before=1, - lsh_num_chunks_after=0, - max_position_embeddings=4096, - num_attention_heads=12, - num_buckets=None, - num_hashes=1, - pad_token_id=0, - vocab_size=320, - tie_word_embeddings=False, - use_cache=True, - classifier_dropout=None, - bos_token_id=None, - **kwargs, - ): - self.hash_seed = hash_seed - self.vocab_size = vocab_size - self.attention_head_size = attention_head_size - self.hidden_size = hidden_size - self.num_attention_heads = num_attention_heads - self.num_hashes = num_hashes - self.num_hidden_layers = len(attn_layers) - self.num_buckets = tuple(num_buckets) if isinstance(num_buckets, list) else num_buckets - self.lsh_attn_chunk_length = lsh_attn_chunk_length - self.local_attn_chunk_length = local_attn_chunk_length - self.lsh_num_chunks_after = lsh_num_chunks_after - self.lsh_num_chunks_before = lsh_num_chunks_before - self.local_num_chunks_after = local_num_chunks_after - self.local_num_chunks_before = local_num_chunks_before - self.hidden_act = hidden_act - self.feed_forward_size = feed_forward_size - self.hidden_dropout_prob = hidden_dropout_prob - self.lsh_attention_probs_dropout_prob = lsh_attention_probs_dropout_prob - self.local_attention_probs_dropout_prob = local_attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.axial_pos_embds = axial_pos_embds - self.axial_pos_shape = tuple(axial_pos_shape) - self.axial_pos_embds_dim = tuple(axial_pos_embds_dim) - self.axial_norm_std = axial_norm_std - self.chunk_size_lm_head = chunk_size_lm_head - self.attn_layers = attn_layers - self.use_cache = use_cache - self.classifier_dropout = classifier_dropout - self.pad_token_id = pad_token_id - self.eos_token_id = eos_token_id - self.bos_token_id = bos_token_id - self.is_decoder = is_decoder - self.tie_word_embeddings = tie_word_embeddings - super().__init__(**kwargs) + attention_head_size: int = 64 + attn_layers: list[str] | tuple[str, ...] = ("local", "lsh", "local", "lsh", "local", "lsh") + axial_norm_std: float = 1.0 + axial_pos_embds: bool = True + axial_pos_shape: list[int] | tuple[int, ...] = (64, 64) + axial_pos_embds_dim: list[int] | tuple[int, ...] 
= (64, 192) + chunk_size_lm_head: int = 0 + eos_token_id: int | None = 2 + feed_forward_size: int = 512 + hash_seed: int | None = None + hidden_act: str = "relu" + hidden_dropout_prob: float = 0.05 + hidden_size: int = 256 + initializer_range: float = 0.02 + is_decoder: bool = False + layer_norm_eps: float = 1e-12 + local_num_chunks_before: int = 1 + local_num_chunks_after: int = 0 + local_attention_probs_dropout_prob: float = 0.05 + local_attn_chunk_length: int = 64 + lsh_attn_chunk_length: int | None = 64 + lsh_attention_probs_dropout_prob: float | None = 0.0 + lsh_num_chunks_before: int | None = 1 + lsh_num_chunks_after: int | None = 0 + max_position_embeddings: int = 4096 + num_attention_heads: int = 12 + num_buckets: int | list[int] | None = None + num_hashes: int = 1 + vocab_size: int = 320 + tie_word_embeddings: bool = False + use_cache: bool = True + classifier_dropout: float | int | None = None + bos_token_id: int | None = None + pad_token_id: int | None = 0 + + def __post_init__(self, **kwargs): + self.num_hidden_layers = len(self.attn_layers) + self.axial_pos_shape = tuple(self.axial_pos_shape) + self.axial_pos_embds_dim = tuple(self.axial_pos_embds_dim) + super().__post_init__(**kwargs) __all__ = ["ReformerConfig"] diff --git a/src/transformers/models/reformer/modeling_reformer.py b/src/transformers/models/reformer/modeling_reformer.py index 6fd05df1c666..648abb955194 100755 --- a/src/transformers/models/reformer/modeling_reformer.py +++ b/src/transformers/models/reformer/modeling_reformer.py @@ -1983,7 +1983,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -2209,7 +2209,7 @@ def forward( config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict reformer_outputs = self.reformer( input_ids, @@ -2375,7 +2375,7 @@ def forward( >>> loss = round(outputs.loss.item(), 2) ``` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict reformer_outputs = self.reformer( input_ids, @@ -2492,7 +2492,7 @@ def forward( >>> loss = model(**inputs, labels=labels).loss ``` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.reformer( input_ids, @@ -2609,7 +2609,7 @@ def forward( For more information, see `num_hashes` in [`ReformerConfig`]. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict reformer_outputs = self.reformer( input_ids, diff --git a/src/transformers/models/regnet/configuration_regnet.py b/src/transformers/models/regnet/configuration_regnet.py index a39233bdb200..34deddfad1f5 100644 --- a/src/transformers/models/regnet/configuration_regnet.py +++ b/src/transformers/models/regnet/configuration_regnet.py @@ -13,14 +13,14 @@ # limitations under the License. """RegNet model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="facebook/regnet-y-040") +@strict(accept_kwargs=True) class RegNetConfig(PreTrainedConfig): r""" layer_type (`str`, *optional*, defaults to `"y"`): @@ -48,29 +48,19 @@ class RegNetConfig(PreTrainedConfig): model_type = "regnet" layer_types = ["x", "y"] - def __init__( - self, - num_channels=3, - embedding_size=32, - hidden_sizes=[128, 192, 512, 1088], - depths=[2, 6, 12, 2], - groups_width=64, - layer_type="y", - hidden_act="relu", - **kwargs, - ): - super().__init__(**kwargs) - if layer_type not in self.layer_types: - raise ValueError(f"layer_type={layer_type} is not one of {','.join(self.layer_types)}") - self.num_channels = num_channels - self.embedding_size = embedding_size - self.hidden_sizes = hidden_sizes - self.depths = depths - self.groups_width = groups_width - self.layer_type = layer_type - self.hidden_act = hidden_act - # always downsample in the first stage - self.downsample_in_first_stage = True + num_channels: int = 3 + embedding_size: int = 32 + hidden_sizes: list[int] | tuple[int, ...] = (128, 192, 512, 1088) + depths: list[int] | tuple[int, ...] = (2, 6, 12, 2) + groups_width: int = 64 + layer_type: str = "y" + hidden_act: str = "relu" + downsample_in_first_stage: bool = True + + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + if self.layer_type not in self.layer_types: + raise ValueError(f"layer_type={self.layer_type} is not one of {','.join(self.layer_types)}") __all__ = ["RegNetConfig"] diff --git a/src/transformers/models/regnet/modeling_regnet.py b/src/transformers/models/regnet/modeling_regnet.py index f8db0b166b92..6c43d79d7894 100644 --- a/src/transformers/models/regnet/modeling_regnet.py +++ b/src/transformers/models/regnet/modeling_regnet.py @@ -305,7 +305,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict embedding_output = self.embedder(pixel_values) @@ -361,7 +361,7 @@ def forward( Labels for computing the image classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.regnet(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict) diff --git a/src/transformers/models/rembert/configuration_rembert.py b/src/transformers/models/rembert/configuration_rembert.py index f0ea6fbd3f5b..7a6b51e0f6d4 100644 --- a/src/transformers/models/rembert/configuration_rembert.py +++ b/src/transformers/models/rembert/configuration_rembert.py @@ -13,14 +13,14 @@ # limitations under the License. """RemBERT model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="google/rembert") +@strict(accept_kwargs=True) class RemBertConfig(PreTrainedConfig): r""" input_embedding_size (`int`, *optional*, defaults to 256): @@ -45,55 +45,28 @@ class RemBertConfig(PreTrainedConfig): model_type = "rembert" - def __init__( - self, - vocab_size=250300, - hidden_size=1152, - num_hidden_layers=32, - num_attention_heads=18, - input_embedding_size=256, - output_embedding_size=1664, - intermediate_size=4608, - hidden_act="gelu", - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - classifier_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=2, - initializer_range=0.02, - layer_norm_eps=1e-12, - use_cache=True, - pad_token_id=0, - bos_token_id=312, - eos_token_id=313, - is_decoder=False, - add_cross_attention=False, - **kwargs, - ): - super().__init__(**kwargs) - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - - self.is_decoder = is_decoder - self.add_cross_attention = add_cross_attention - self.vocab_size = vocab_size - self.input_embedding_size = input_embedding_size - self.output_embedding_size = output_embedding_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.classifier_dropout_prob = classifier_dropout_prob - self.initializer_range = initializer_range - self.type_vocab_size = type_vocab_size - self.layer_norm_eps = layer_norm_eps - self.use_cache = use_cache - self.tie_word_embeddings = False + vocab_size: int = 250300 + hidden_size: int = 1152 + num_hidden_layers: int = 32 + num_attention_heads: int = 18 + input_embedding_size: int = 256 + output_embedding_size: int = 1664 + intermediate_size: int = 4608 + hidden_act: str = "gelu" + hidden_dropout_prob: float | int = 0.0 + attention_probs_dropout_prob: float | int = 0.0 + classifier_dropout_prob: float | int = 0.1 + max_position_embeddings: int = 512 + type_vocab_size: int = 2 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + use_cache: bool = True + pad_token_id: int | None = 0 + bos_token_id: int | None = 312 + eos_token_id: int | None = 313 + is_decoder: bool = False + add_cross_attention: bool = False + tie_word_embeddings: bool = False __all__ = ["RemBertConfig"] diff --git a/src/transformers/models/rembert/modeling_rembert.py 
b/src/transformers/models/rembert/modeling_rembert.py index 666aa080cc22..be037cc4f9e7 100755 --- a/src/transformers/models/rembert/modeling_rembert.py +++ b/src/transformers/models/rembert/modeling_rembert.py @@ -550,7 +550,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache @@ -672,7 +672,7 @@ def forward( config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.rembert( input_ids, @@ -772,7 +772,7 @@ def forward( >>> prediction_logits = outputs.logits ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.rembert( input_ids, @@ -849,7 +849,7 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.rembert( input_ids, @@ -957,7 +957,7 @@ def forward( num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above) """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None @@ -1035,7 +1035,7 @@ def forward( labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.rembert( input_ids, @@ -1098,7 +1098,7 @@ def forward( return_dict: bool | None = None, **kwargs, ) -> tuple | QuestionAnsweringModelOutput: - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.rembert( input_ids, diff --git a/src/transformers/models/resnet/configuration_resnet.py b/src/transformers/models/resnet/configuration_resnet.py index 5a2b7a7908d1..c862f6b4934b 100644 --- a/src/transformers/models/resnet/configuration_resnet.py +++ b/src/transformers/models/resnet/configuration_resnet.py @@ -13,15 +13,17 @@ # limitations under the License. 
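# Illustrative sketch, not part of the patch: after the RemBERT conversion above, config
# hyperparameters are declared as annotated class-level fields with defaults instead of an
# `__init__` that copies arguments onto `self`. Assuming `@strict(accept_kwargs=True)` keeps
# keyword-style construction working (type-checked fields plus tolerated extra kwargs), call
# sites such as `RemBertConfig(hidden_size=1152)` are unchanged. A stdlib-only analogue of the
# declarative style, using a handful of the RemBERT defaults:
from dataclasses import dataclass


@dataclass
class ToyRemBertLikeConfig:
    vocab_size: int = 250300
    hidden_size: int = 1152
    num_attention_heads: int = 18
    layer_norm_eps: float = 1e-12


cfg = ToyRemBertLikeConfig(hidden_size=768)
assert (cfg.hidden_size, cfg.vocab_size) == (768, 250300)  # explicit override vs. field default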
"""ResNet model configuration""" -from ...backbone_utils import BackboneConfigMixin -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging +from typing import ClassVar +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...backbone_utils import BackboneConfigMixin +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="microsoft/resnet-50") +@strict(accept_kwargs=True) class ResNetConfig(BackboneConfigMixin, PreTrainedConfig): r""" layer_type (`str`, *optional*, defaults to `"bottleneck"`): @@ -48,35 +50,29 @@ class ResNetConfig(BackboneConfigMixin, PreTrainedConfig): """ model_type = "resnet" - layer_types = ["basic", "bottleneck"] + layer_types: ClassVar[list[str]] = ["basic", "bottleneck"] + + num_channels: int = 3 + embedding_size: int = 64 + hidden_sizes: list[int] | tuple[int, ...] | None = (256, 512, 1024, 2048) + depths: list[int] | tuple[int, ...] | None = (3, 4, 6, 3) + layer_type: str = "bottleneck" + hidden_act: str = "relu" + downsample_in_first_stage: bool = False + downsample_in_bottleneck: bool = False + + def __post_init__(self, **kwargs): + self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(self.depths) + 1)] + self.set_output_features_output_indices( + out_indices=kwargs.pop("out_indices", None), out_features=kwargs.pop("out_features", None) + ) + self.hidden_sizes = list(self.hidden_sizes) + super().__post_init__(**kwargs) - def __init__( - self, - num_channels=3, - embedding_size=64, - hidden_sizes=[256, 512, 1024, 2048], - depths=[3, 4, 6, 3], - layer_type="bottleneck", - hidden_act="relu", - downsample_in_first_stage=False, - downsample_in_bottleneck=False, - out_features=None, - out_indices=None, - **kwargs, - ): - super().__init__(**kwargs) - if layer_type not in self.layer_types: - raise ValueError(f"layer_type={layer_type} is not one of {','.join(self.layer_types)}") - self.num_channels = num_channels - self.embedding_size = embedding_size - self.hidden_sizes = hidden_sizes - self.depths = depths - self.layer_type = layer_type - self.hidden_act = hidden_act - self.downsample_in_first_stage = downsample_in_first_stage - self.downsample_in_bottleneck = downsample_in_bottleneck - self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)] - self.set_output_features_output_indices(out_indices=out_indices, out_features=out_features) + def validate_layer_type(self): + """Check that `layer_types` is correctly defined.""" + if self.layer_type not in self.layer_types: + raise ValueError(f"layer_type={self.layer_type} is not one of {','.join(self.layer_types)}") __all__ = ["ResNetConfig"] diff --git a/src/transformers/models/resnet/modeling_resnet.py b/src/transformers/models/resnet/modeling_resnet.py index 283d6f80c818..9e4ed8aecfa7 100644 --- a/src/transformers/models/resnet/modeling_resnet.py +++ b/src/transformers/models/resnet/modeling_resnet.py @@ -293,7 +293,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict embedding_output = self.embedder(pixel_values) @@ -348,7 +348,7 @@ def forward( Labels for computing the image classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. 
If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.resnet(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict) @@ -422,7 +422,7 @@ def forward( >>> list(feature_maps[-1].shape) [1, 2048, 7, 7] ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) diff --git a/src/transformers/models/roberta/configuration_roberta.py b/src/transformers/models/roberta/configuration_roberta.py index 305b6681f9b6..0c064b81b030 100644 --- a/src/transformers/models/roberta/configuration_roberta.py +++ b/src/transformers/models/roberta/configuration_roberta.py @@ -14,14 +14,14 @@ # limitations under the License. """RoBERTa configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="FacebookAI/roberta-base") +@strict(accept_kwargs=True) class RobertaConfig(PreTrainedConfig): r""" Examples: @@ -41,52 +41,26 @@ class RobertaConfig(PreTrainedConfig): model_type = "roberta" - def __init__( - self, - vocab_size=50265, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=2, - initializer_range=0.02, - layer_norm_eps=1e-12, - pad_token_id=1, - bos_token_id=0, - eos_token_id=2, - use_cache=True, - classifier_dropout=None, - is_decoder=False, - add_cross_attention=False, - tie_word_embeddings=True, - **kwargs, - ): - super().__init__(**kwargs) - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - - self.is_decoder = is_decoder - self.add_cross_attention = add_cross_attention - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.use_cache = use_cache - self.classifier_dropout = classifier_dropout + vocab_size: int = 50265 + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.1 + attention_probs_dropout_prob: float = 0.1 + max_position_embeddings: int = 512 + type_vocab_size: int = 2 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + pad_token_id: int | None = 1 + bos_token_id: int | None = 0 + eos_token_id: int | None = 2 + use_cache: bool = True + classifier_dropout: float | int | None = None + 
is_decoder: bool = False + add_cross_attention: bool = False + tie_word_embeddings: bool = True __all__ = ["RobertaConfig"] diff --git a/src/transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py b/src/transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py index b9bd8ec65abc..04935b8f23d0 100644 --- a/src/transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +++ b/src/transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py @@ -14,13 +14,13 @@ # limitations under the License. """RoBERTa-PreLayerNorm configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring +@strict(accept_kwargs=True) @auto_docstring(checkpoint="andreasmadsen/efficient_mlm_m0.40") # Copied from transformers.models.roberta.configuration_roberta.RobertaConfig with FacebookAI/roberta-base->andreasmadsen/efficient_mlm_m0.40,RoBERTa->RoBERTa-PreLayerNorm,Roberta->RobertaPreLayerNorm,roberta->roberta-prelayernorm class RobertaPreLayerNormConfig(PreTrainedConfig): @@ -42,52 +42,26 @@ class RobertaPreLayerNormConfig(PreTrainedConfig): model_type = "roberta-prelayernorm" - def __init__( - self, - vocab_size=50265, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=2, - initializer_range=0.02, - layer_norm_eps=1e-12, - pad_token_id=1, - bos_token_id=0, - eos_token_id=2, - use_cache=True, - classifier_dropout=None, - is_decoder=False, - add_cross_attention=False, - tie_word_embeddings=True, - **kwargs, - ): - super().__init__(**kwargs) - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - - self.is_decoder = is_decoder - self.add_cross_attention = add_cross_attention - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.use_cache = use_cache - self.classifier_dropout = classifier_dropout + vocab_size: int = 50265 + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.1 + attention_probs_dropout_prob: float = 0.1 + max_position_embeddings: int = 512 + type_vocab_size: int = 2 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + pad_token_id: int | None = 1 + bos_token_id: int | None = 0 + eos_token_id: int | None = 2 + use_cache: bool = True + classifier_dropout: float | int | None = None + is_decoder: bool = False + add_cross_attention: bool = False + tie_word_embeddings: bool = True __all__ = ["RobertaPreLayerNormConfig"] diff --git a/src/transformers/models/roc_bert/configuration_roc_bert.py 
b/src/transformers/models/roc_bert/configuration_roc_bert.py index ed888465ec3e..cf03f19c9423 100644 --- a/src/transformers/models/roc_bert/configuration_roc_bert.py +++ b/src/transformers/models/roc_bert/configuration_roc_bert.py @@ -13,14 +13,14 @@ # limitations under the License. """RoCBert model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="weiweishi/roc-bert-base-zh") +@strict(accept_kwargs=True) class RoCBertConfig(PreTrainedConfig): r""" enable_pronunciation (`bool`, *optional*, defaults to `True`): @@ -59,61 +59,31 @@ class RoCBertConfig(PreTrainedConfig): model_type = "roc_bert" - def __init__( - self, - vocab_size=30522, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=2, - initializer_range=0.02, - layer_norm_eps=1e-12, - use_cache=True, - pad_token_id=0, - classifier_dropout=None, - enable_pronunciation=True, - enable_shape=True, - pronunciation_embed_dim=768, - pronunciation_vocab_size=910, - shape_embed_dim=512, - shape_vocab_size=24858, - concat_input=True, - is_decoder=False, - add_cross_attention=False, - tie_word_embeddings=True, - **kwargs, - ): - self.is_decoder = is_decoder - self.add_cross_attention = add_cross_attention - self.tie_word_embeddings = tie_word_embeddings - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.initializer_range = initializer_range - self.type_vocab_size = type_vocab_size - self.layer_norm_eps = layer_norm_eps - self.use_cache = use_cache - self.enable_pronunciation = enable_pronunciation - self.enable_shape = enable_shape - self.pronunciation_embed_dim = pronunciation_embed_dim - self.pronunciation_vocab_size = pronunciation_vocab_size - self.shape_embed_dim = shape_embed_dim - self.shape_vocab_size = shape_vocab_size - self.concat_input = concat_input - self.classifier_dropout = classifier_dropout - super().__init__(**kwargs) - self.pad_token_id = pad_token_id + vocab_size: int = 30522 + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.1 + attention_probs_dropout_prob: float = 0.1 + max_position_embeddings: int = 512 + type_vocab_size: int = 2 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + use_cache: bool = True + pad_token_id: int | None = 0 + classifier_dropout: float | int | None = None + enable_pronunciation: bool = True + enable_shape: bool = True + pronunciation_embed_dim: int = 768 + pronunciation_vocab_size: int = 910 + shape_embed_dim: int = 512 + shape_vocab_size: int = 24858 + concat_input: bool = True + is_decoder: bool = False + add_cross_attention: bool = False + tie_word_embeddings: bool = True __all__ = ["RoCBertConfig"] diff --git 
a/src/transformers/models/roformer/configuration_roformer.py b/src/transformers/models/roformer/configuration_roformer.py index 6cf646548f42..b05188fc450b 100644 --- a/src/transformers/models/roformer/configuration_roformer.py +++ b/src/transformers/models/roformer/configuration_roformer.py @@ -13,14 +13,14 @@ # limitations under the License. """RoFormer model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="junnyu/roformer_chinese_base") +@strict(accept_kwargs=True) class RoFormerConfig(PreTrainedConfig): r""" rotary_value (`bool`, *optional*, defaults to `False`): @@ -43,54 +43,31 @@ class RoFormerConfig(PreTrainedConfig): model_type = "roformer" - def __init__( - self, - vocab_size=50000, - embedding_size=None, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=1536, - type_vocab_size=2, - initializer_range=0.02, - layer_norm_eps=1e-12, - pad_token_id=0, - bos_token_id=None, - eos_token_id=None, - rotary_value=False, - use_cache=True, - is_decoder=False, - add_cross_attention=False, - tie_word_embeddings=True, - **kwargs, - ): - super().__init__(**kwargs) - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings + vocab_size: int = 50000 + embedding_size: int | None = None + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.1 + attention_probs_dropout_prob: float = 0.1 + max_position_embeddings: int = 1536 + type_vocab_size: int = 2 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + pad_token_id: int | None = 0 + bos_token_id: int | None = None + eos_token_id: int | list[int] | None = None + rotary_value: bool = False + use_cache: bool = True + is_decoder: bool = False + add_cross_attention: bool = False + tie_word_embeddings: bool = True - self.is_decoder = is_decoder - self.add_cross_attention = add_cross_attention - self.vocab_size = vocab_size - self.embedding_size = hidden_size if embedding_size is None else embedding_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.rotary_value = rotary_value - self.use_cache = use_cache + def __post_init__(self, **kwargs): + self.embedding_size = self.hidden_size if self.embedding_size is None else self.embedding_size + super().__post_init__(**kwargs) __all__ = ["RoFormerConfig"] diff --git a/src/transformers/models/roformer/modeling_roformer.py b/src/transformers/models/roformer/modeling_roformer.py index fb374220a97f..2e522d816354 100644 --- a/src/transformers/models/roformer/modeling_roformer.py +++ 
b/src/transformers/models/roformer/modeling_roformer.py @@ -697,7 +697,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache @@ -828,7 +828,7 @@ def forward( config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.roformer( input_ids, @@ -933,7 +933,7 @@ def forward( >>> prediction_logits = outputs.logits ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.roformer( input_ids, @@ -1029,7 +1029,7 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.roformer( input_ids, @@ -1128,7 +1128,7 @@ def forward( num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above) """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None @@ -1204,7 +1204,7 @@ def forward( labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.roformer( input_ids, @@ -1266,7 +1266,7 @@ def forward( return_dict: bool | None = None, **kwargs, ) -> QuestionAnsweringModelOutput | tuple[torch.Tensor]: - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.roformer( input_ids, diff --git a/src/transformers/models/rt_detr/configuration_rt_detr.py b/src/transformers/models/rt_detr/configuration_rt_detr.py index c8716b3b1efe..c36cbf76df4e 100644 --- a/src/transformers/models/rt_detr/configuration_rt_detr.py +++ b/src/transformers/models/rt_detr/configuration_rt_detr.py @@ -13,16 +13,16 @@ # limitations under the License. 
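# Illustrative note, not part of the patch: alongside the config conversions, every modeling
# file in this diff swaps the fallback attribute from `config.use_return_dict` to
# `config.return_dict` (see the RoFormer hunks above). The call-site logic is unchanged;
# assuming a config object exposing a boolean `return_dict`, the resolution is simply:
def resolve_return_dict(return_dict, config):
    # An explicit argument wins; otherwise fall back to the config-level default.
    return return_dict if return_dict is not None else config.return_dict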
"""RT-DETR model configuration""" +from huggingface_hub.dataclasses import strict + from ...backbone_utils import consolidate_backbone_kwargs_to_config from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging +from ...utils import auto_docstring from ..auto import AutoConfig -logger = logging.get_logger(__name__) - - @auto_docstring(checkpoint="PekingU/rtdetr_r50vd") +@strict(accept_kwargs=True) class RTDetrConfig(PreTrainedConfig): r""" initializer_bias_prior_prob (`float`, *optional*): @@ -125,131 +125,67 @@ class RTDetrConfig(PreTrainedConfig): "num_attention_heads": "encoder_attention_heads", } - def __init__( - self, - initializer_range=0.01, - initializer_bias_prior_prob=None, - layer_norm_eps=1e-5, - batch_norm_eps=1e-5, - # backbone - backbone_config=None, - freeze_backbone_batch_norms=True, - # encoder HybridEncoder - encoder_hidden_dim=256, - encoder_in_channels=[512, 1024, 2048], - feat_strides=[8, 16, 32], - encoder_layers=1, - encoder_ffn_dim=1024, - encoder_attention_heads=8, - dropout=0.0, - activation_dropout=0.0, - encode_proj_layers=[2], - positional_encoding_temperature=10000, - encoder_activation_function="gelu", - activation_function="silu", - eval_size=None, - normalize_before=False, - hidden_expansion=1.0, - # decoder RTDetrTransformer - d_model=256, - num_queries=300, - decoder_in_channels=[256, 256, 256], - decoder_ffn_dim=1024, - num_feature_levels=3, - decoder_n_points=4, - decoder_layers=6, - decoder_attention_heads=8, - decoder_activation_function="relu", - attention_dropout=0.0, - num_denoising=100, - label_noise_ratio=0.5, - box_noise_scale=1.0, - learn_initial_query=False, - anchor_image_size=None, - disable_custom_kernels=True, - with_box_refine=True, - is_encoder_decoder=True, - # Loss - matcher_alpha=0.25, - matcher_gamma=2.0, - matcher_class_cost=2.0, - matcher_bbox_cost=5.0, - matcher_giou_cost=2.0, - use_focal_loss=True, - auxiliary_loss=True, - focal_loss_alpha=0.75, - focal_loss_gamma=2.0, - weight_loss_vfl=1.0, - weight_loss_bbox=5.0, - weight_loss_giou=2.0, - eos_coefficient=1e-4, - **kwargs, - ): - self.initializer_range = initializer_range - self.initializer_bias_prior_prob = initializer_bias_prior_prob - self.layer_norm_eps = layer_norm_eps - self.batch_norm_eps = batch_norm_eps - - backbone_config, kwargs = consolidate_backbone_kwargs_to_config( - backbone_config=backbone_config, + initializer_range: float = 0.01 + initializer_bias_prior_prob: float | None = None + layer_norm_eps: float = 1e-5 + batch_norm_eps: float = 1e-5 + backbone_config: dict | PreTrainedConfig | None = None + freeze_backbone_batch_norms: bool = True + encoder_hidden_dim: int = 256 + encoder_in_channels: list[int] | tuple[int, ...] = (512, 1024, 2048) + feat_strides: list[int] | tuple[int, ...] = (8, 16, 32) + encoder_layers: int = 1 + encoder_ffn_dim: int = 1024 + encoder_attention_heads: int = 8 + dropout: float | int = 0.0 + activation_dropout: float | int = 0.0 + encode_proj_layers: list[int] | tuple[int, ...] = (2,) + positional_encoding_temperature: int = 10000 + encoder_activation_function: str = "gelu" + activation_function: str = "silu" + eval_size: int | None = None + normalize_before: bool = False + hidden_expansion: float = 1.0 + d_model: int = 256 + num_queries: int = 300 + decoder_in_channels: list[int] | tuple[int, ...] 
= (256, 256, 256) + decoder_ffn_dim: int = 1024 + num_feature_levels: int = 3 + decoder_n_points: int = 4 + decoder_layers: int = 6 + decoder_attention_heads: int = 8 + decoder_activation_function: str = "relu" + attention_dropout: float | int = 0.0 + num_denoising: int = 100 + label_noise_ratio: float = 0.5 + box_noise_scale: float = 1.0 + learn_initial_query: bool = False + anchor_image_size: int | list[int] | None = None + disable_custom_kernels: bool = True + with_box_refine: bool = True + is_encoder_decoder: bool = True + matcher_alpha: float = 0.25 + matcher_gamma: float = 2.0 + matcher_class_cost: float = 2.0 + matcher_bbox_cost: float = 5.0 + matcher_giou_cost: float = 2.0 + use_focal_loss: bool = True + auxiliary_loss: bool = True + focal_loss_alpha: float = 0.75 + focal_loss_gamma: float = 2.0 + weight_loss_vfl: float = 1.0 + weight_loss_bbox: float = 5.0 + weight_loss_giou: float = 2.0 + eos_coefficient: float = 1e-4 + + def __post_init__(self, **kwargs): + self.backbone_config, kwargs = consolidate_backbone_kwargs_to_config( + backbone_config=self.backbone_config, default_config_type="rt_detr_resnet", - default_config_kwargs={ - "out_indices": [2, 3, 4], - }, + default_config_kwargs={"out_indices": [2, 3, 4]}, **kwargs, ) - - self.backbone_config = backbone_config - self.freeze_backbone_batch_norms = freeze_backbone_batch_norms - # encoder - self.encoder_hidden_dim = encoder_hidden_dim - self.encoder_in_channels = encoder_in_channels - self.feat_strides = feat_strides - self.encoder_attention_heads = encoder_attention_heads - self.encoder_ffn_dim = encoder_ffn_dim - self.dropout = dropout - self.activation_dropout = activation_dropout - self.encode_proj_layers = encode_proj_layers - self.encoder_layers = encoder_layers - self.positional_encoding_temperature = positional_encoding_temperature - self.eval_size = eval_size - self.normalize_before = normalize_before - self.encoder_activation_function = encoder_activation_function - self.activation_function = activation_function - self.hidden_expansion = hidden_expansion - # decoder - self.d_model = d_model - self.num_queries = num_queries - self.decoder_ffn_dim = decoder_ffn_dim - self.decoder_in_channels = decoder_in_channels - self.num_feature_levels = num_feature_levels - self.decoder_n_points = decoder_n_points - self.decoder_layers = decoder_layers - self.decoder_attention_heads = decoder_attention_heads - self.decoder_activation_function = decoder_activation_function - self.attention_dropout = attention_dropout - self.num_denoising = num_denoising - self.label_noise_ratio = label_noise_ratio - self.box_noise_scale = box_noise_scale - self.learn_initial_query = learn_initial_query - self.anchor_image_size = anchor_image_size - self.auxiliary_loss = auxiliary_loss - self.disable_custom_kernels = disable_custom_kernels - self.with_box_refine = with_box_refine - # Loss - self.matcher_alpha = matcher_alpha - self.matcher_gamma = matcher_gamma - self.matcher_class_cost = matcher_class_cost - self.matcher_bbox_cost = matcher_bbox_cost - self.matcher_giou_cost = matcher_giou_cost - self.use_focal_loss = use_focal_loss - self.focal_loss_alpha = focal_loss_alpha - self.focal_loss_gamma = focal_loss_gamma - self.weight_loss_vfl = weight_loss_vfl - self.weight_loss_bbox = weight_loss_bbox - self.weight_loss_giou = weight_loss_giou - self.eos_coefficient = eos_coefficient - super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + super().__post_init__(**kwargs) __all__ = ["RTDetrConfig"] diff --git 
a/src/transformers/models/rt_detr/configuration_rt_detr_resnet.py b/src/transformers/models/rt_detr/configuration_rt_detr_resnet.py index 3456302458c7..6c943f284556 100644 --- a/src/transformers/models/rt_detr/configuration_rt_detr_resnet.py +++ b/src/transformers/models/rt_detr/configuration_rt_detr_resnet.py @@ -13,15 +13,15 @@ # limitations under the License. """RT-DETR ResNet model configuration""" +from huggingface_hub.dataclasses import strict + from ...backbone_utils import BackboneConfigMixin from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - - -logger = logging.get_logger(__name__) +from ...utils import auto_docstring @auto_docstring(checkpoint="microsoft/resnet-50") +@strict(accept_kwargs=True) class RTDetrResNetConfig(BackboneConfigMixin, PreTrainedConfig): r""" depths (`list[int]`, *optional*, defaults to `[3, 4, 6, 3]`): @@ -55,33 +55,29 @@ class RTDetrResNetConfig(BackboneConfigMixin, PreTrainedConfig): model_type = "rt_detr_resnet" layer_types = ["basic", "bottleneck"] - def __init__( - self, - num_channels=3, - embedding_size=64, - hidden_sizes=[256, 512, 1024, 2048], - depths=[3, 4, 6, 3], - layer_type="bottleneck", - hidden_act="relu", - downsample_in_first_stage=False, - downsample_in_bottleneck=False, - out_features=None, - out_indices=None, - **kwargs, - ): - super().__init__(**kwargs) - if layer_type not in self.layer_types: - raise ValueError(f"layer_type={layer_type} is not one of {','.join(self.layer_types)}") - self.num_channels = num_channels - self.embedding_size = embedding_size - self.hidden_sizes = hidden_sizes - self.depths = depths - self.layer_type = layer_type - self.hidden_act = hidden_act - self.downsample_in_first_stage = downsample_in_first_stage - self.downsample_in_bottleneck = downsample_in_bottleneck - self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)] - self.set_output_features_output_indices(out_indices=out_indices, out_features=out_features) + num_channels: int = 3 + embedding_size: int = 64 + hidden_sizes: list[int] | tuple[int, ...] = (256, 512, 1024, 2048) + depths: list[int] | tuple[int, ...] = (3, 4, 6, 3) + layer_type: str = "bottleneck" + hidden_act: str = "relu" + downsample_in_first_stage: bool = False + downsample_in_bottleneck: bool = False + _out_features: list[str] | None = None + _out_indices: list[int] | None = None + + def __post_init__(self, **kwargs): + self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(self.depths) + 1)] + self.set_output_features_output_indices( + out_indices=kwargs.pop("out_indices", None), out_features=kwargs.pop("out_features", None) + ) + self.hidden_sizes = list(self.hidden_sizes) + super().__post_init__(**kwargs) + + def validate_architecture(self): + """Part of `@strict`-powered validation. 
Validates the architecture of the config.""" + if self.layer_type not in self.layer_types: + raise ValueError(f"layer_type={self.layer_type} is not one of {','.join(self.layer_types)}") __all__ = ["RTDetrResNetConfig"] diff --git a/src/transformers/models/rt_detr/modeling_rt_detr_resnet.py b/src/transformers/models/rt_detr/modeling_rt_detr_resnet.py index 5446381ee577..e3b0ade23f8f 100644 --- a/src/transformers/models/rt_detr/modeling_rt_detr_resnet.py +++ b/src/transformers/models/rt_detr/modeling_rt_detr_resnet.py @@ -377,7 +377,7 @@ def forward( >>> list(feature_maps[-1].shape) [1, 2048, 7, 7] ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) diff --git a/src/transformers/models/rt_detr_v2/configuration_rt_detr_v2.py b/src/transformers/models/rt_detr_v2/configuration_rt_detr_v2.py index 590d91b1aed0..2e02e87e73c2 100644 --- a/src/transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +++ b/src/transformers/models/rt_detr_v2/configuration_rt_detr_v2.py @@ -17,6 +17,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from huggingface_hub.dataclasses import strict + from ...backbone_utils import consolidate_backbone_kwargs_to_config from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring @@ -24,6 +26,7 @@ @auto_docstring(checkpoint="PekingU/rtdetr_r18vd") +@strict(accept_kwargs=True) class RTDetrV2Config(PreTrainedConfig): r""" initializer_bias_prior_prob (`float`, *optional*): @@ -134,142 +137,70 @@ class RTDetrV2Config(PreTrainedConfig): "num_attention_heads": "encoder_attention_heads", } - def __init__( - self, - initializer_range=0.01, - initializer_bias_prior_prob=None, - layer_norm_eps=1e-5, - batch_norm_eps=1e-5, - # backbone - backbone_config=None, - freeze_backbone_batch_norms=True, - # encoder HybridEncoder - encoder_hidden_dim=256, - encoder_in_channels=[512, 1024, 2048], - feat_strides=[8, 16, 32], - encoder_layers=1, - encoder_ffn_dim=1024, - encoder_attention_heads=8, - dropout=0.0, - activation_dropout=0.0, - encode_proj_layers=[2], - positional_encoding_temperature=10000, - encoder_activation_function="gelu", - activation_function="silu", - eval_size=None, - normalize_before=False, - hidden_expansion=1.0, - # decoder RTDetrV2Transformer - d_model=256, - num_queries=300, - decoder_in_channels=[256, 256, 256], - decoder_ffn_dim=1024, - num_feature_levels=3, - decoder_n_points=4, - decoder_layers=6, - decoder_attention_heads=8, - decoder_activation_function="relu", - attention_dropout=0.0, - num_denoising=100, - label_noise_ratio=0.5, - box_noise_scale=1.0, - learn_initial_query=False, - anchor_image_size=None, - with_box_refine=True, - is_encoder_decoder=True, - # Loss - matcher_alpha=0.25, - matcher_gamma=2.0, - matcher_class_cost=2.0, - matcher_bbox_cost=5.0, - matcher_giou_cost=2.0, - use_focal_loss=True, - auxiliary_loss=True, - focal_loss_alpha=0.75, - focal_loss_gamma=2.0, - weight_loss_vfl=1.0, - weight_loss_bbox=5.0, - weight_loss_giou=2.0, - eos_coefficient=1e-4, - decoder_n_levels=3, # default value - decoder_offset_scale=0.5, # default value - decoder_method="default", - tie_word_embeddings=True, - **kwargs, - ): - self.initializer_range = 
initializer_range - self.initializer_bias_prior_prob = initializer_bias_prior_prob - self.layer_norm_eps = layer_norm_eps - self.batch_norm_eps = batch_norm_eps - - backbone_config, kwargs = consolidate_backbone_kwargs_to_config( - backbone_config=backbone_config, + initializer_range: float = 0.01 + initializer_bias_prior_prob: float | None = None + layer_norm_eps: float = 1e-5 + batch_norm_eps: float = 1e-5 + backbone_config: dict | PreTrainedConfig | None = None + freeze_backbone_batch_norms: bool = True + encoder_hidden_dim: int = 256 + encoder_in_channels: list[int] | tuple[int, ...] = (512, 1024, 2048) + feat_strides: list[int] | tuple[int, ...] = (8, 16, 32) + encoder_layers: int = 1 + encoder_ffn_dim: int = 1024 + encoder_attention_heads: int = 8 + dropout: float | int = 0.0 + activation_dropout: float | int = 0.0 + encode_proj_layers: list[int] | tuple[int, ...] = (2,) + positional_encoding_temperature: int = 10000 + encoder_activation_function: str = "gelu" + activation_function: str = "silu" + eval_size: int | None = None + normalize_before: bool = False + hidden_expansion: float = 1.0 + d_model: int = 256 + num_queries: int = 300 + decoder_in_channels: list[int] | tuple[int, ...] = (256, 256, 256) + decoder_ffn_dim: int = 1024 + num_feature_levels: int = 3 + decoder_n_points: int = 4 + decoder_layers: int = 6 + decoder_attention_heads: int = 8 + decoder_activation_function: str = "relu" + attention_dropout: float | int = 0.0 + num_denoising: int = 100 + label_noise_ratio: float = 0.5 + box_noise_scale: float = 1.0 + learn_initial_query: bool = False + anchor_image_size: int | list[int] | None = None + with_box_refine: bool = True + is_encoder_decoder: bool = True + matcher_alpha: float = 0.25 + matcher_gamma: float = 2.0 + matcher_class_cost: float = 2.0 + matcher_bbox_cost: float = 5.0 + matcher_giou_cost: float = 2.0 + use_focal_loss: bool = True + auxiliary_loss: bool = True + focal_loss_alpha: float = 0.75 + focal_loss_gamma: float = 2.0 + weight_loss_vfl: float = 1.0 + weight_loss_bbox: float = 5.0 + weight_loss_giou: float = 2.0 + eos_coefficient: float = 1e-4 + decoder_n_levels: int = 3 + decoder_offset_scale: float = 0.5 + decoder_method: str = "default" + tie_word_embeddings: bool = True + + def __post_init__(self, **kwargs): + self.backbone_config, kwargs = consolidate_backbone_kwargs_to_config( + backbone_config=self.backbone_config, default_config_type="rt_detr_resnet", - default_config_kwargs={ - "out_indices": [2, 3, 4], - }, + default_config_kwargs={"out_indices": [2, 3, 4]}, **kwargs, ) - - self.backbone_config = backbone_config - self.freeze_backbone_batch_norms = freeze_backbone_batch_norms - # encoder - self.encoder_hidden_dim = encoder_hidden_dim - self.encoder_in_channels = encoder_in_channels - self.feat_strides = feat_strides - self.encoder_ffn_dim = encoder_ffn_dim - self.dropout = dropout - self.activation_dropout = activation_dropout - self.encode_proj_layers = encode_proj_layers - self.encoder_layers = encoder_layers - self.positional_encoding_temperature = positional_encoding_temperature - self.eval_size = eval_size - self.normalize_before = normalize_before - self.encoder_activation_function = encoder_activation_function - self.activation_function = activation_function - self.hidden_expansion = hidden_expansion - self.num_queries = num_queries - self.decoder_ffn_dim = decoder_ffn_dim - self.decoder_in_channels = decoder_in_channels - self.num_feature_levels = num_feature_levels - self.decoder_n_points = decoder_n_points - self.decoder_layers = 
decoder_layers - self.decoder_attention_heads = decoder_attention_heads - self.decoder_activation_function = decoder_activation_function - self.attention_dropout = attention_dropout - self.num_denoising = num_denoising - self.label_noise_ratio = label_noise_ratio - self.box_noise_scale = box_noise_scale - self.learn_initial_query = learn_initial_query - self.anchor_image_size = anchor_image_size - self.auxiliary_loss = auxiliary_loss - self.with_box_refine = with_box_refine - # Loss - self.matcher_alpha = matcher_alpha - self.matcher_gamma = matcher_gamma - self.matcher_class_cost = matcher_class_cost - self.matcher_bbox_cost = matcher_bbox_cost - self.matcher_giou_cost = matcher_giou_cost - self.use_focal_loss = use_focal_loss - self.focal_loss_alpha = focal_loss_alpha - self.focal_loss_gamma = focal_loss_gamma - self.weight_loss_vfl = weight_loss_vfl - self.weight_loss_bbox = weight_loss_bbox - self.weight_loss_giou = weight_loss_giou - self.eos_coefficient = eos_coefficient - - if not hasattr(self, "d_model"): - self.d_model = d_model - - if not hasattr(self, "encoder_attention_heads"): - self.encoder_attention_heads = encoder_attention_heads - # add the new attributes with the given values or defaults - self.decoder_n_levels = decoder_n_levels - self.decoder_offset_scale = decoder_offset_scale - self.decoder_method = decoder_method - self.tie_word_embeddings = tie_word_embeddings - - super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + super().__post_init__(**kwargs) __all__ = ["RTDetrV2Config"] diff --git a/src/transformers/models/rt_detr_v2/modular_rt_detr_v2.py b/src/transformers/models/rt_detr_v2/modular_rt_detr_v2.py index a09e5c0ff0b1..1bd1490d8359 100644 --- a/src/transformers/models/rt_detr_v2/modular_rt_detr_v2.py +++ b/src/transformers/models/rt_detr_v2/modular_rt_detr_v2.py @@ -16,6 +16,7 @@ import torch import torch.nn as nn import torch.nn.functional as F +from huggingface_hub.dataclasses import strict from torch import Tensor from ... 
import initialization as init @@ -38,6 +39,7 @@ @auto_docstring(checkpoint="PekingU/rtdetr_r18vd") +@strict(accept_kwargs=True) class RTDetrV2Config(PreTrainedConfig): r""" initializer_bias_prior_prob (`float`, *optional*): @@ -148,142 +150,70 @@ class RTDetrV2Config(PreTrainedConfig): "num_attention_heads": "encoder_attention_heads", } - def __init__( - self, - initializer_range=0.01, - initializer_bias_prior_prob=None, - layer_norm_eps=1e-5, - batch_norm_eps=1e-5, - # backbone - backbone_config=None, - freeze_backbone_batch_norms=True, - # encoder HybridEncoder - encoder_hidden_dim=256, - encoder_in_channels=[512, 1024, 2048], - feat_strides=[8, 16, 32], - encoder_layers=1, - encoder_ffn_dim=1024, - encoder_attention_heads=8, - dropout=0.0, - activation_dropout=0.0, - encode_proj_layers=[2], - positional_encoding_temperature=10000, - encoder_activation_function="gelu", - activation_function="silu", - eval_size=None, - normalize_before=False, - hidden_expansion=1.0, - # decoder RTDetrV2Transformer - d_model=256, - num_queries=300, - decoder_in_channels=[256, 256, 256], - decoder_ffn_dim=1024, - num_feature_levels=3, - decoder_n_points=4, - decoder_layers=6, - decoder_attention_heads=8, - decoder_activation_function="relu", - attention_dropout=0.0, - num_denoising=100, - label_noise_ratio=0.5, - box_noise_scale=1.0, - learn_initial_query=False, - anchor_image_size=None, - with_box_refine=True, - is_encoder_decoder=True, - # Loss - matcher_alpha=0.25, - matcher_gamma=2.0, - matcher_class_cost=2.0, - matcher_bbox_cost=5.0, - matcher_giou_cost=2.0, - use_focal_loss=True, - auxiliary_loss=True, - focal_loss_alpha=0.75, - focal_loss_gamma=2.0, - weight_loss_vfl=1.0, - weight_loss_bbox=5.0, - weight_loss_giou=2.0, - eos_coefficient=1e-4, - decoder_n_levels=3, # default value - decoder_offset_scale=0.5, # default value - decoder_method="default", - tie_word_embeddings=True, - **kwargs, - ): - self.initializer_range = initializer_range - self.initializer_bias_prior_prob = initializer_bias_prior_prob - self.layer_norm_eps = layer_norm_eps - self.batch_norm_eps = batch_norm_eps - - backbone_config, kwargs = consolidate_backbone_kwargs_to_config( - backbone_config=backbone_config, + initializer_range: float = 0.01 + initializer_bias_prior_prob: float | None = None + layer_norm_eps: float = 1e-5 + batch_norm_eps: float = 1e-5 + backbone_config: dict | PreTrainedConfig | None = None + freeze_backbone_batch_norms: bool = True + encoder_hidden_dim: int = 256 + encoder_in_channels: list[int] | tuple[int, ...] = (512, 1024, 2048) + feat_strides: list[int] | tuple[int, ...] = (8, 16, 32) + encoder_layers: int = 1 + encoder_ffn_dim: int = 1024 + encoder_attention_heads: int = 8 + dropout: float | int = 0.0 + activation_dropout: float | int = 0.0 + encode_proj_layers: list[int] | tuple[int, ...] = (2,) + positional_encoding_temperature: int = 10000 + encoder_activation_function: str = "gelu" + activation_function: str = "silu" + eval_size: int | None = None + normalize_before: bool = False + hidden_expansion: float = 1.0 + d_model: int = 256 + num_queries: int = 300 + decoder_in_channels: list[int] | tuple[int, ...] 
= (256, 256, 256) + decoder_ffn_dim: int = 1024 + num_feature_levels: int = 3 + decoder_n_points: int = 4 + decoder_layers: int = 6 + decoder_attention_heads: int = 8 + decoder_activation_function: str = "relu" + attention_dropout: float | int = 0.0 + num_denoising: int = 100 + label_noise_ratio: float = 0.5 + box_noise_scale: float = 1.0 + learn_initial_query: bool = False + anchor_image_size: int | list[int] | None = None + with_box_refine: bool = True + is_encoder_decoder: bool = True + matcher_alpha: float = 0.25 + matcher_gamma: float = 2.0 + matcher_class_cost: float = 2.0 + matcher_bbox_cost: float = 5.0 + matcher_giou_cost: float = 2.0 + use_focal_loss: bool = True + auxiliary_loss: bool = True + focal_loss_alpha: float = 0.75 + focal_loss_gamma: float = 2.0 + weight_loss_vfl: float = 1.0 + weight_loss_bbox: float = 5.0 + weight_loss_giou: float = 2.0 + eos_coefficient: float = 1e-4 + decoder_n_levels: int = 3 + decoder_offset_scale: float = 0.5 + decoder_method: str = "default" + tie_word_embeddings: bool = True + + def __post_init__(self, **kwargs): + self.backbone_config, kwargs = consolidate_backbone_kwargs_to_config( + backbone_config=self.backbone_config, default_config_type="rt_detr_resnet", - default_config_kwargs={ - "out_indices": [2, 3, 4], - }, + default_config_kwargs={"out_indices": [2, 3, 4]}, **kwargs, ) - - self.backbone_config = backbone_config - self.freeze_backbone_batch_norms = freeze_backbone_batch_norms - # encoder - self.encoder_hidden_dim = encoder_hidden_dim - self.encoder_in_channels = encoder_in_channels - self.feat_strides = feat_strides - self.encoder_ffn_dim = encoder_ffn_dim - self.dropout = dropout - self.activation_dropout = activation_dropout - self.encode_proj_layers = encode_proj_layers - self.encoder_layers = encoder_layers - self.positional_encoding_temperature = positional_encoding_temperature - self.eval_size = eval_size - self.normalize_before = normalize_before - self.encoder_activation_function = encoder_activation_function - self.activation_function = activation_function - self.hidden_expansion = hidden_expansion - self.num_queries = num_queries - self.decoder_ffn_dim = decoder_ffn_dim - self.decoder_in_channels = decoder_in_channels - self.num_feature_levels = num_feature_levels - self.decoder_n_points = decoder_n_points - self.decoder_layers = decoder_layers - self.decoder_attention_heads = decoder_attention_heads - self.decoder_activation_function = decoder_activation_function - self.attention_dropout = attention_dropout - self.num_denoising = num_denoising - self.label_noise_ratio = label_noise_ratio - self.box_noise_scale = box_noise_scale - self.learn_initial_query = learn_initial_query - self.anchor_image_size = anchor_image_size - self.auxiliary_loss = auxiliary_loss - self.with_box_refine = with_box_refine - # Loss - self.matcher_alpha = matcher_alpha - self.matcher_gamma = matcher_gamma - self.matcher_class_cost = matcher_class_cost - self.matcher_bbox_cost = matcher_bbox_cost - self.matcher_giou_cost = matcher_giou_cost - self.use_focal_loss = use_focal_loss - self.focal_loss_alpha = focal_loss_alpha - self.focal_loss_gamma = focal_loss_gamma - self.weight_loss_vfl = weight_loss_vfl - self.weight_loss_bbox = weight_loss_bbox - self.weight_loss_giou = weight_loss_giou - self.eos_coefficient = eos_coefficient - - if not hasattr(self, "d_model"): - self.d_model = d_model - - if not hasattr(self, "encoder_attention_heads"): - self.encoder_attention_heads = encoder_attention_heads - # add the new attributes with the given values or 
defaults - self.decoder_n_levels = decoder_n_levels - self.decoder_offset_scale = decoder_offset_scale - self.decoder_method = decoder_method - self.tie_word_embeddings = tie_word_embeddings - - super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + super().__post_init__(**kwargs) def multi_scale_deformable_attention_v2( diff --git a/src/transformers/models/rwkv/configuration_rwkv.py b/src/transformers/models/rwkv/configuration_rwkv.py index 87de411188ac..56e325be105e 100644 --- a/src/transformers/models/rwkv/configuration_rwkv.py +++ b/src/transformers/models/rwkv/configuration_rwkv.py @@ -14,14 +14,14 @@ # limitations under the License. """RWKV configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="RWKV/rwkv-4-169m-pile") +@strict(accept_kwargs=True) class RwkvConfig(PreTrainedConfig): """ context_length (`int`, *optional*, defaults to 1024): @@ -51,37 +51,26 @@ class RwkvConfig(PreTrainedConfig): model_type = "rwkv" attribute_map = {"max_position_embeddings": "context_length"} - def __init__( - self, - vocab_size=50277, - context_length=1024, - hidden_size=4096, - num_hidden_layers=32, - attention_hidden_size=None, - intermediate_size=None, - layer_norm_epsilon=1e-5, - bos_token_id=0, - eos_token_id=0, - rescale_every=6, - tie_word_embeddings=False, - use_cache=True, - **kwargs, - ): - self.vocab_size = vocab_size - self.context_length = context_length - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.attention_hidden_size = attention_hidden_size if attention_hidden_size is not None else hidden_size - self.intermediate_size = intermediate_size if intermediate_size is not None else 4 * hidden_size - self.layer_norm_epsilon = layer_norm_epsilon - self.rescale_every = rescale_every - self.use_cache = use_cache - - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - - super().__init__(**kwargs) + vocab_size: int = 50277 + context_length: int = 1024 + hidden_size: int = 4096 + num_hidden_layers: int = 32 + attention_hidden_size: int | None = None + intermediate_size: int | None = None + layer_norm_epsilon: float = 1e-5 + bos_token_id: int | None = 0 + eos_token_id: int | None = 0 + rescale_every: int = 6 + tie_word_embeddings: bool = False + use_cache: bool = True + + def __post_init__(self, **kwargs): + self.attention_hidden_size = ( + self.attention_hidden_size if self.attention_hidden_size is not None else self.hidden_size + ) + self.intermediate_size = self.intermediate_size if self.intermediate_size is not None else 4 * self.hidden_size + + super().__post_init__(**kwargs) __all__ = ["RwkvConfig"] diff --git a/src/transformers/models/rwkv/modeling_rwkv.py b/src/transformers/models/rwkv/modeling_rwkv.py index 1234757a9102..5c7f008fc28e 100644 --- a/src/transformers/models/rwkv/modeling_rwkv.py +++ b/src/transformers/models/rwkv/modeling_rwkv.py @@ -544,7 +544,7 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) use_cache = use_cache if use_cache is not None else (self.config.use_cache if not self.training else False) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not 
None else self.config.return_dict if attention_mask is not None: logger.warning_once("`attention_mask` was passed, but it is unused in this model.") @@ -721,7 +721,7 @@ def forward( use_cache (`bool`, *optional*): If set to `True`, the last state is returned and can be used to quickly generate the next logits. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict rwkv_outputs = self.rwkv( input_ids, diff --git a/src/transformers/models/sam/configuration_sam.py b/src/transformers/models/sam/configuration_sam.py index cbe1f3842e67..8fd3d02c8afe 100644 --- a/src/transformers/models/sam/configuration_sam.py +++ b/src/transformers/models/sam/configuration_sam.py @@ -13,14 +13,14 @@ # limitations under the License. """SAM model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="facebook/sam-vit-huge") +@strict(accept_kwargs=True) class SamPromptEncoderConfig(PreTrainedConfig): r""" mask_input_channels (`int`, *optional*, defaults to 16): @@ -31,29 +31,21 @@ class SamPromptEncoderConfig(PreTrainedConfig): base_config_key = "prompt_encoder_config" - def __init__( - self, - hidden_size=256, - image_size=1024, - patch_size=16, - mask_input_channels=16, - num_point_embeddings=4, - hidden_act="gelu", - layer_norm_eps=1e-6, - **kwargs, - ): - super().__init__(**kwargs) - self.hidden_size = hidden_size - self.image_size = image_size - self.patch_size = patch_size - self.image_embedding_size = image_size // patch_size - self.mask_input_channels = mask_input_channels - self.num_point_embeddings = num_point_embeddings - self.hidden_act = hidden_act - self.layer_norm_eps = layer_norm_eps + hidden_size: int = 256 + image_size: int | list[int] | tuple[int, int] = 1024 + patch_size: int | list[int] | tuple[int, int] = 16 + mask_input_channels: int = 16 + num_point_embeddings: int = 4 + hidden_act: str = "gelu" + layer_norm_eps: float = 1e-6 + + def __post_init__(self, **kwargs): + self.image_embedding_size = self.image_size // self.patch_size + super().__post_init__(**kwargs) @auto_docstring(checkpoint="facebook/sam-vit-huge") +@strict(accept_kwargs=True) class SamMaskDecoderConfig(PreTrainedConfig): r""" mlp_dim (`int`, *optional*, defaults to 2048): @@ -70,34 +62,20 @@ class SamMaskDecoderConfig(PreTrainedConfig): base_config_key = "mask_decoder_config" - def __init__( - self, - hidden_size=256, - hidden_act="relu", - mlp_dim=2048, - num_hidden_layers=2, - num_attention_heads=8, - attention_downsample_rate=2, - num_multimask_outputs=3, - iou_head_depth=3, - iou_head_hidden_dim=256, - layer_norm_eps=1e-6, - **kwargs, - ): - super().__init__(**kwargs) - self.hidden_size = hidden_size - self.hidden_act = hidden_act - self.mlp_dim = mlp_dim - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.attention_downsample_rate = attention_downsample_rate - self.num_multimask_outputs = num_multimask_outputs - self.iou_head_depth = iou_head_depth - self.iou_head_hidden_dim = iou_head_hidden_dim - self.layer_norm_eps = layer_norm_eps + hidden_size: int = 256 + hidden_act: str = "relu" + mlp_dim: int = 2048 + num_hidden_layers: int = 2 + num_attention_heads: int = 8 + 
attention_downsample_rate: int = 2 + num_multimask_outputs: int = 3 + iou_head_depth: int = 3 + iou_head_hidden_dim: int = 256 + layer_norm_eps: float = 1e-6 @auto_docstring(checkpoint="facebook/sam-vit-huge") +@strict(accept_kwargs=True) class SamVisionConfig(PreTrainedConfig): r""" output_channels (`int`, *optional*, defaults to 256): @@ -135,54 +113,34 @@ class SamVisionConfig(PreTrainedConfig): base_config_key = "vision_config" model_type = "sam_vision_model" - def __init__( - self, - hidden_size=768, - output_channels=256, - num_hidden_layers=12, - num_attention_heads=12, - num_channels=3, - image_size=1024, - patch_size=16, - hidden_act="gelu", - layer_norm_eps=1e-06, - attention_dropout=0.0, - initializer_range=1e-10, - qkv_bias=True, - mlp_ratio=4.0, - use_abs_pos=True, - use_rel_pos=True, - window_size=14, - global_attn_indexes=[2, 5, 8, 11], - num_pos_feats=128, - mlp_dim=None, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.output_channels = output_channels - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_channels = num_channels - self.image_size = image_size - self.patch_size = patch_size - self.hidden_act = hidden_act - self.layer_norm_eps = layer_norm_eps - self.attention_dropout = attention_dropout - self.initializer_range = initializer_range - self.qkv_bias = qkv_bias - self.mlp_ratio = mlp_ratio - self.use_abs_pos = use_abs_pos - self.use_rel_pos = use_rel_pos - self.window_size = window_size - self.global_attn_indexes = global_attn_indexes - self.num_pos_feats = num_pos_feats - self.mlp_dim = int(hidden_size * mlp_ratio) if mlp_dim is None else mlp_dim + hidden_size: int = 768 + output_channels: int = 256 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + num_channels: int = 3 + image_size: int | list[int] | tuple[int, int] = 1024 + patch_size: int | list[int] | tuple[int, int] = 16 + hidden_act: str = "gelu" + layer_norm_eps: float = 1e-06 + attention_dropout: float | int = 0.0 + initializer_range: float = 1e-10 + qkv_bias: bool = True + mlp_ratio: float = 4.0 + use_abs_pos: bool = True + use_rel_pos: bool = True + window_size: int = 14 + global_attn_indexes: list[int] | tuple[int, ...] 
= (2, 5, 8, 11) + num_pos_feats: int = 128 + mlp_dim: int | None = None + + def __post_init__(self, **kwargs): + self.mlp_dim = int(self.hidden_size * self.mlp_ratio) if self.mlp_dim is None else self.mlp_dim self.scale = self.hidden_size // 2 + super().__post_init__(**kwargs) @auto_docstring(checkpoint="facebook/sam-vit-huge") +@strict(accept_kwargs=True) class SamConfig(PreTrainedConfig): r""" prompt_encoder_config (Union[`dict`, `SamPromptEncoderConfig`], *optional*): @@ -226,32 +184,29 @@ class SamConfig(PreTrainedConfig): "vision_config": SamVisionConfig, } - def __init__( - self, - vision_config=None, - prompt_encoder_config=None, - mask_decoder_config=None, - initializer_range=0.02, - tie_word_embeddings=True, - **kwargs, - ): - vision_config = vision_config if vision_config is not None else {} - prompt_encoder_config = prompt_encoder_config if prompt_encoder_config is not None else {} - mask_decoder_config = mask_decoder_config if mask_decoder_config is not None else {} - - if isinstance(vision_config, SamVisionConfig): - vision_config = vision_config.to_dict() - if isinstance(prompt_encoder_config, SamPromptEncoderConfig): - prompt_encoder_config = prompt_encoder_config.to_dict() - if isinstance(mask_decoder_config, SamMaskDecoderConfig): - mask_decoder_config = mask_decoder_config.to_dict() - - self.vision_config = SamVisionConfig(**vision_config) - self.prompt_encoder_config = SamPromptEncoderConfig(**prompt_encoder_config) - self.mask_decoder_config = SamMaskDecoderConfig(**mask_decoder_config) - self.initializer_range = initializer_range - self.tie_word_embeddings = tie_word_embeddings - super().__init__(**kwargs) + vision_config: dict | PreTrainedConfig | None = None + prompt_encoder_config: dict | PreTrainedConfig | None = None + mask_decoder_config: dict | PreTrainedConfig | None = None + initializer_range: float = 0.02 + tie_word_embeddings: bool = True + + def __post_init__(self, **kwargs): + if isinstance(self.vision_config, dict): + self.vision_config = SamVisionConfig(**self.vision_config) + elif self.vision_config is None: + self.vision_config = SamVisionConfig() + + if isinstance(self.prompt_encoder_config, dict): + self.prompt_encoder_config = SamPromptEncoderConfig(**self.prompt_encoder_config) + elif self.prompt_encoder_config is None: + self.prompt_encoder_config = SamPromptEncoderConfig() + + if isinstance(self.mask_decoder_config, dict): + self.mask_decoder_config = SamMaskDecoderConfig(**self.mask_decoder_config) + elif self.mask_decoder_config is None: + self.mask_decoder_config = SamMaskDecoderConfig() + + super().__post_init__(**kwargs) __all__ = ["SamConfig", "SamMaskDecoderConfig", "SamPromptEncoderConfig", "SamVisionConfig"] diff --git a/src/transformers/models/sam2/configuration_sam2.py b/src/transformers/models/sam2/configuration_sam2.py index 16d1961b1e83..54f0be9d3f9b 100644 --- a/src/transformers/models/sam2/configuration_sam2.py +++ b/src/transformers/models/sam2/configuration_sam2.py @@ -13,15 +13,15 @@ # limitations under the License. 
"""SAM2 model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging +from ...utils import auto_docstring from ..auto import CONFIG_MAPPING, AutoConfig -logger = logging.get_logger(__name__) - - @auto_docstring(checkpoint="facebook/sam2.1-hiera-tiny") +@strict(accept_kwargs=True) class Sam2HieraDetConfig(PreTrainedConfig): r""" patch_kernel_size (`list[int]`, *optional*, defaults to `[7, 7]`): @@ -51,71 +51,55 @@ class Sam2HieraDetConfig(PreTrainedConfig): base_config_key = "backbone_config" model_type = "sam2_hiera_det_model" - def __init__( - self, - hidden_size=96, - num_attention_heads=1, - num_channels=3, - image_size=None, - patch_kernel_size=None, - patch_stride=None, - patch_padding=None, - query_stride=None, - window_positional_embedding_background_size=None, - num_query_pool_stages=3, - blocks_per_stage=None, - embed_dim_per_stage=None, - num_attention_heads_per_stage=None, - window_size_per_stage=None, - global_attention_blocks=None, - mlp_ratio=4.0, - hidden_act="gelu", - layer_norm_eps=1e-6, - initializer_range=0.02, - **kwargs, - ): - super().__init__(**kwargs) - - image_size = image_size if image_size is not None else [1024, 1024] - patch_kernel_size = patch_kernel_size if patch_kernel_size is not None else [7, 7] - patch_stride = patch_stride if patch_stride is not None else [4, 4] - patch_padding = patch_padding if patch_padding is not None else [3, 3] - query_stride = query_stride if query_stride is not None else [2, 2] - window_positional_embedding_background_size = ( - window_positional_embedding_background_size - if window_positional_embedding_background_size is not None + hidden_size: int = 96 + num_attention_heads: int = 1 + num_channels: int = 3 + image_size: int | list[int] | None = None + patch_kernel_size: int | list[int] | None = None + patch_stride: int | list[int] | None = None + patch_padding: int | list[int] | None = None + query_stride: int | list[int] | None = None + window_positional_embedding_background_size: list[int] | None = None + num_query_pool_stages: int = 3 + blocks_per_stage: list[int] | None = None + embed_dim_per_stage: list[int] | None = None + num_attention_heads_per_stage: list[int] | None = None + window_size_per_stage: list[int] | None = None + global_attention_blocks: list[int] | None = None + mlp_ratio: float = 4.0 + hidden_act: str = "gelu" + layer_norm_eps: float = 1e-6 + initializer_range: float = 0.02 + + def __post_init__(self, **kwargs): + self.image_size = self.image_size if self.image_size is not None else [1024, 1024] + self.patch_kernel_size = self.patch_kernel_size if self.patch_kernel_size is not None else [7, 7] + self.patch_stride = self.patch_stride if self.patch_stride is not None else [4, 4] + self.patch_padding = self.patch_padding if self.patch_padding is not None else [3, 3] + self.query_stride = self.query_stride if self.query_stride is not None else [2, 2] + self.window_positional_embedding_background_size = ( + self.window_positional_embedding_background_size + if self.window_positional_embedding_background_size is not None else [7, 7] ) - blocks_per_stage = blocks_per_stage if blocks_per_stage is not None else [1, 2, 7, 2] - embed_dim_per_stage = embed_dim_per_stage if embed_dim_per_stage is not None else [96, 192, 384, 768] - num_attention_heads_per_stage = ( - num_attention_heads_per_stage if num_attention_heads_per_stage is not None else [1, 2, 4, 8] + self.blocks_per_stage = self.blocks_per_stage if 
self.blocks_per_stage is not None else [1, 2, 7, 2] + self.embed_dim_per_stage = ( + self.embed_dim_per_stage if self.embed_dim_per_stage is not None else [96, 192, 384, 768] + ) + self.num_attention_heads_per_stage = ( + self.num_attention_heads_per_stage if self.num_attention_heads_per_stage is not None else [1, 2, 4, 8] + ) + self.window_size_per_stage = ( + self.window_size_per_stage if self.window_size_per_stage is not None else [8, 4, 14, 7] ) - window_size_per_stage = window_size_per_stage if window_size_per_stage is not None else [8, 4, 14, 7] - global_attention_blocks = global_attention_blocks if global_attention_blocks is not None else [5, 7, 9] - - self.hidden_size = hidden_size - self.num_attention_heads = num_attention_heads - self.num_channels = num_channels - self.image_size = image_size - self.patch_kernel_size = patch_kernel_size - self.patch_stride = patch_stride - self.patch_padding = patch_padding - self.query_stride = query_stride - self.window_positional_embedding_background_size = window_positional_embedding_background_size - self.num_query_pool_stages = num_query_pool_stages - self.blocks_per_stage = blocks_per_stage - self.embed_dim_per_stage = embed_dim_per_stage - self.num_attention_heads_per_stage = num_attention_heads_per_stage - self.window_size_per_stage = window_size_per_stage - self.global_attention_blocks = global_attention_blocks - self.mlp_ratio = mlp_ratio - self.hidden_act = hidden_act - self.layer_norm_eps = layer_norm_eps - self.initializer_range = initializer_range + self.global_attention_blocks = ( + self.global_attention_blocks if self.global_attention_blocks is not None else [5, 7, 9] + ) + super().__post_init__(**kwargs) @auto_docstring(checkpoint="facebook/sam2.1-hiera-tiny") +@strict(accept_kwargs=True) class Sam2VisionConfig(PreTrainedConfig): r""" backbone_channel_list (`List[int]`, *optional*, defaults to `[768, 384, 192, 96]`): @@ -142,55 +126,39 @@ class Sam2VisionConfig(PreTrainedConfig): "backbone_config": AutoConfig, } - def __init__( - self, - backbone_config=None, - backbone_channel_list=None, - backbone_feature_sizes=None, - fpn_hidden_size=256, - fpn_kernel_size=1, - fpn_stride=1, - fpn_padding=0, - fpn_top_down_levels=None, - num_feature_levels=3, - hidden_act="gelu", - layer_norm_eps=1e-6, - initializer_range=0.02, - **kwargs, - ): - backbone_channel_list = [768, 384, 192, 96] if backbone_channel_list is None else backbone_channel_list - backbone_feature_sizes = ( - [[256, 256], [128, 128], [64, 64]] if backbone_feature_sizes is None else backbone_feature_sizes + backbone_config: dict | PreTrainedConfig | None = None + backbone_channel_list: list[int] | None = None + backbone_feature_sizes: list | None = None + fpn_hidden_size: int = 256 + fpn_kernel_size: int = 1 + fpn_stride: int = 1 + fpn_padding: int = 0 + fpn_top_down_levels: list[int] | None = None + num_feature_levels: int = 3 + hidden_act: str = "gelu" + layer_norm_eps: float = 1e-6 + initializer_range: float = 0.02 + + def __post_init__(self, **kwargs): + self.backbone_channel_list = ( + [768, 384, 192, 96] if self.backbone_channel_list is None else self.backbone_channel_list + ) + self.backbone_feature_sizes = ( + [[256, 256], [128, 128], [64, 64]] if self.backbone_feature_sizes is None else self.backbone_feature_sizes ) - fpn_top_down_levels = [2, 3] if fpn_top_down_levels is None else fpn_top_down_levels - - if isinstance(backbone_config, dict): - backbone_config["model_type"] = backbone_config.get("model_type", "sam2_hiera_det_model") - backbone_config = 
CONFIG_MAPPING[backbone_config["model_type"]](**backbone_config) - elif isinstance(backbone_config, Sam2HieraDetConfig): - pass - elif backbone_config is None: - backbone_config = Sam2HieraDetConfig() - - self.backbone_config = backbone_config - - # Neck - self.backbone_channel_list = backbone_channel_list - self.backbone_feature_sizes = backbone_feature_sizes - self.fpn_hidden_size = fpn_hidden_size - self.fpn_kernel_size = fpn_kernel_size - self.fpn_stride = fpn_stride - self.fpn_padding = fpn_padding - self.fpn_top_down_levels = fpn_top_down_levels - self.num_feature_levels = num_feature_levels - - self.hidden_act = hidden_act - self.layer_norm_eps = layer_norm_eps - self.initializer_range = initializer_range - super().__init__(**kwargs) + self.fpn_top_down_levels = [2, 3] if self.fpn_top_down_levels is None else self.fpn_top_down_levels + + if isinstance(self.backbone_config, dict): + self.backbone_config["model_type"] = self.backbone_config.get("model_type", "sam2_hiera_det_model") + self.backbone_config = CONFIG_MAPPING[self.backbone_config["model_type"]](**self.backbone_config) + elif self.backbone_config is None: + self.backbone_config = Sam2HieraDetConfig() + + super().__post_init__(**kwargs) @auto_docstring(checkpoint="facebook/sam2.1-hiera-tiny") +@strict(accept_kwargs=True) class Sam2PromptEncoderConfig(PreTrainedConfig): r""" mask_input_channels (`int`, *optional*, defaults to 16): @@ -203,30 +171,18 @@ class Sam2PromptEncoderConfig(PreTrainedConfig): base_config_key = "prompt_encoder_config" - def __init__( - self, - hidden_size=256, - image_size=1024, - patch_size=16, - mask_input_channels=16, - num_point_embeddings=4, - hidden_act="gelu", - layer_norm_eps=1e-6, - scale=1, - **kwargs, - ): - super().__init__(**kwargs) - self.hidden_size = hidden_size - self.image_size = image_size - self.patch_size = patch_size - self.mask_input_channels = mask_input_channels - self.num_point_embeddings = num_point_embeddings - self.hidden_act = hidden_act - self.layer_norm_eps = layer_norm_eps - self.scale = scale + hidden_size: int = 256 + image_size: int | list[int] | tuple[int, int] = 1024 + patch_size: int | list[int] | tuple[int, int] = 16 + mask_input_channels: int = 16 + num_point_embeddings: int = 4 + hidden_act: str = "gelu" + layer_norm_eps: float = 1e-6 + scale: int = 1 @auto_docstring(checkpoint="facebook/sam2.1-hiera-tiny") +@strict(accept_kwargs=True) class Sam2MaskDecoderConfig(PreTrainedConfig): r""" mlp_dim (`int`, *optional*, defaults to 2048): @@ -249,42 +205,22 @@ class Sam2MaskDecoderConfig(PreTrainedConfig): base_config_key = "mask_decoder_config" - def __init__( - self, - hidden_size=256, - hidden_act="gelu", - mlp_dim=2048, - num_hidden_layers=2, - num_attention_heads=8, - attention_downsample_rate=2, - num_multimask_outputs=3, - iou_head_depth=3, - iou_head_hidden_dim=256, - dynamic_multimask_via_stability=True, - dynamic_multimask_stability_delta=0.05, - dynamic_multimask_stability_thresh=0.98, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.num_multimask_outputs = num_multimask_outputs - self.hidden_act = hidden_act - self.iou_head_depth = iou_head_depth - self.iou_head_hidden_dim = iou_head_hidden_dim - self.dynamic_multimask_via_stability = dynamic_multimask_via_stability - self.dynamic_multimask_stability_delta = dynamic_multimask_stability_delta - self.dynamic_multimask_stability_thresh = dynamic_multimask_stability_thresh - - # TwoWayTransformer configuration - self.num_hidden_layers = num_hidden_layers - 
self.hidden_size = hidden_size - self.num_attention_heads = num_attention_heads - self.mlp_dim = mlp_dim - self.attention_downsample_rate = attention_downsample_rate + hidden_size: int = 256 + hidden_act: str = "gelu" + mlp_dim: int = 2048 + num_hidden_layers: int = 2 + num_attention_heads: int = 8 + attention_downsample_rate: int = 2 + num_multimask_outputs: int = 3 + iou_head_depth: int = 3 + iou_head_hidden_dim: int = 256 + dynamic_multimask_via_stability: bool = True + dynamic_multimask_stability_delta: float = 0.05 + dynamic_multimask_stability_thresh: float = 0.98 @auto_docstring(checkpoint="facebook/sam2.1-hiera-tiny") +@strict(accept_kwargs=True) class Sam2Config(PreTrainedConfig): r""" prompt_encoder_config (Union[`dict`, `Sam2PromptEncoderConfig`], *optional*): @@ -328,32 +264,29 @@ class Sam2Config(PreTrainedConfig): "mask_decoder_config": Sam2MaskDecoderConfig, } - def __init__( - self, - vision_config=None, - prompt_encoder_config=None, - mask_decoder_config=None, - initializer_range=0.02, - **kwargs, - ): - vision_config = vision_config if vision_config is not None else {} - prompt_encoder_config = prompt_encoder_config if prompt_encoder_config is not None else {} - mask_decoder_config = mask_decoder_config if mask_decoder_config is not None else {} - - if isinstance(vision_config, dict): - vision_config["model_type"] = vision_config.get("model_type", "sam2_vision_model") - vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) - if isinstance(prompt_encoder_config, Sam2PromptEncoderConfig): - prompt_encoder_config = prompt_encoder_config.to_dict() - if isinstance(mask_decoder_config, Sam2MaskDecoderConfig): - mask_decoder_config = mask_decoder_config.to_dict() - - self.vision_config = vision_config - self.prompt_encoder_config = Sam2PromptEncoderConfig(**prompt_encoder_config) - self.mask_decoder_config = Sam2MaskDecoderConfig(**mask_decoder_config) - - self.initializer_range = initializer_range - super().__init__(**kwargs) + vision_config: dict | PreTrainedConfig | None = None + prompt_encoder_config: dict | PreTrainedConfig | None = None + mask_decoder_config: dict | PreTrainedConfig | None = None + initializer_range: float = 0.02 + + def __post_init__(self, **kwargs): + if isinstance(self.vision_config, dict): + self.vision_config["model_type"] = self.vision_config.get("model_type", "sam2_vision_model") + self.vision_config = CONFIG_MAPPING[self.vision_config["model_type"]](**self.vision_config) + elif self.vision_config is None: + self.vision_config = CONFIG_MAPPING["sam2_vision_model"]() + + if isinstance(self.prompt_encoder_config, dict): + self.prompt_encoder_config = Sam2PromptEncoderConfig(**self.prompt_encoder_config) + elif self.prompt_encoder_config is None: + self.prompt_encoder_config = Sam2PromptEncoderConfig() + + if isinstance(self.mask_decoder_config, dict): + self.mask_decoder_config = Sam2MaskDecoderConfig(**self.mask_decoder_config) + elif self.mask_decoder_config is None: + self.mask_decoder_config = Sam2MaskDecoderConfig() + + super().__post_init__(**kwargs) __all__ = [ diff --git a/src/transformers/models/sam2_video/configuration_sam2_video.py b/src/transformers/models/sam2_video/configuration_sam2_video.py index 2fcca9f558d4..453af8d7cc27 100644 --- a/src/transformers/models/sam2_video/configuration_sam2_video.py +++ b/src/transformers/models/sam2_video/configuration_sam2_video.py @@ -17,12 +17,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring from ..auto import CONFIG_MAPPING, AutoConfig @auto_docstring(checkpoint="facebook/sam2_video.1-hiera-tiny") +@strict(accept_kwargs=True) class Sam2VideoPromptEncoderConfig(PreTrainedConfig): r""" mask_input_channels (`int`, *optional*, defaults to 16): @@ -35,30 +38,18 @@ class Sam2VideoPromptEncoderConfig(PreTrainedConfig): base_config_key = "prompt_encoder_config" - def __init__( - self, - hidden_size=256, - image_size=1024, - patch_size=16, - mask_input_channels=16, - num_point_embeddings=4, - hidden_act="gelu", - layer_norm_eps=1e-6, - scale=1, - **kwargs, - ): - super().__init__(**kwargs) - self.hidden_size = hidden_size - self.image_size = image_size - self.patch_size = patch_size - self.mask_input_channels = mask_input_channels - self.num_point_embeddings = num_point_embeddings - self.hidden_act = hidden_act - self.layer_norm_eps = layer_norm_eps - self.scale = scale + hidden_size: int = 256 + image_size: int | list[int] | tuple[int, int] = 1024 + patch_size: int | list[int] | tuple[int, int] = 16 + mask_input_channels: int = 16 + num_point_embeddings: int = 4 + hidden_act: str = "gelu" + layer_norm_eps: float = 1e-6 + scale: int = 1 @auto_docstring(checkpoint="facebook/sam2_video.1-hiera-tiny") +@strict(accept_kwargs=True) class Sam2VideoMaskDecoderConfig(PreTrainedConfig): r""" mlp_dim (`int`, *optional*, defaults to 2048): @@ -81,42 +72,22 @@ class Sam2VideoMaskDecoderConfig(PreTrainedConfig): base_config_key = "mask_decoder_config" - def __init__( - self, - hidden_size=256, - hidden_act="gelu", - mlp_dim=2048, - num_hidden_layers=2, - num_attention_heads=8, - attention_downsample_rate=2, - num_multimask_outputs=3, - iou_head_depth=3, - iou_head_hidden_dim=256, - dynamic_multimask_via_stability=True, - dynamic_multimask_stability_delta=0.05, - dynamic_multimask_stability_thresh=0.98, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.num_multimask_outputs = num_multimask_outputs - self.hidden_act = hidden_act - self.iou_head_depth = iou_head_depth - self.iou_head_hidden_dim = iou_head_hidden_dim - self.dynamic_multimask_via_stability = dynamic_multimask_via_stability - self.dynamic_multimask_stability_delta = dynamic_multimask_stability_delta - self.dynamic_multimask_stability_thresh = dynamic_multimask_stability_thresh - - # TwoWayTransformer configuration - self.num_hidden_layers = num_hidden_layers - self.hidden_size = hidden_size - self.num_attention_heads = num_attention_heads - self.mlp_dim = mlp_dim - self.attention_downsample_rate = attention_downsample_rate + hidden_size: int = 256 + hidden_act: str = "gelu" + mlp_dim: int = 2048 + num_hidden_layers: int = 2 + num_attention_heads: int = 8 + attention_downsample_rate: int = 2 + num_multimask_outputs: int = 3 + iou_head_depth: int = 3 + iou_head_hidden_dim: int = 256 + dynamic_multimask_via_stability: bool = True + dynamic_multimask_stability_delta: float = 0.05 + dynamic_multimask_stability_thresh: float = 0.98 @auto_docstring(checkpoint="facebook/sam2.1-hiera-tiny") +@strict(accept_kwargs=True) class Sam2VideoConfig(PreTrainedConfig): r""" prompt_encoder_config (Union[`dict`, `Sam2PromptEncoderConfig`], *optional*): @@ -234,117 +205,70 @@ class Sam2VideoConfig(PreTrainedConfig): "mask_decoder_config": Sam2VideoMaskDecoderConfig, } - def __init__( 
- self, - vision_config=None, - prompt_encoder_config=None, - mask_decoder_config=None, - initializer_range=0.02, - num_maskmem=7, - image_size=1024, - sigmoid_scale_for_mem_enc=20.0, - sigmoid_bias_for_mem_enc=-10.0, - enable_occlusion_spatial_embedding=True, - multimask_output_in_sam=True, - multimask_min_pt_num=0, - multimask_max_pt_num=1, - multimask_output_for_tracking=True, - max_object_pointers_in_encoder=16, - max_cond_frame_num=-1, - enable_temporal_pos_encoding_for_object_pointers=True, - # memory attention - memory_attention_hidden_size=256, - memory_attention_num_layers=4, - memory_attention_num_attention_heads=1, - memory_attention_downsample_rate=1, - memory_attention_feed_forward_hidden_size=2048, - memory_attention_feed_forward_hidden_act="relu", - memory_attention_dropout=0.1, - memory_attention_rope_theta=10000, - memory_attention_rope_feat_sizes=None, - memory_attention_rope_dropout=0.1, - # memory encoder - memory_encoder_hidden_size=256, - memory_encoder_output_channels=64, - mask_downsampler_embed_dim=256, - mask_downsampler_kernel_size=3, - mask_downsampler_stride=2, - mask_downsampler_padding=1, - mask_downsampler_total_stride=16, - mask_downsampler_hidden_act="gelu", - memory_fuser_num_layers=2, - memory_fuser_embed_dim=256, - memory_fuser_intermediate_dim=1024, - memory_fuser_kernel_size=7, - memory_fuser_padding=3, - memory_fuser_layer_scale_init_value=1e-6, - memory_fuser_hidden_act="gelu", - **kwargs, - ): - super().__init__(**kwargs) + vision_config: dict | PreTrainedConfig | None = None + prompt_encoder_config: dict | PreTrainedConfig | None = None + mask_decoder_config: dict | PreTrainedConfig | None = None + initializer_range: float = 0.02 + num_maskmem: int = 7 + image_size: int | list[int] | tuple[int, int] = 1024 + sigmoid_scale_for_mem_enc: float = 20.0 + sigmoid_bias_for_mem_enc: float = -10.0 + enable_occlusion_spatial_embedding: bool = True + multimask_output_in_sam: bool = True + multimask_min_pt_num: int = 0 + multimask_max_pt_num: int = 1 + multimask_output_for_tracking: bool = True + max_object_pointers_in_encoder: int = 16 + max_cond_frame_num: int = -1 + enable_temporal_pos_encoding_for_object_pointers: bool = True + memory_attention_hidden_size: int = 256 + memory_attention_num_layers: int = 4 + memory_attention_num_attention_heads: int = 1 + memory_attention_downsample_rate: int = 1 + memory_attention_feed_forward_hidden_size: int = 2048 + memory_attention_feed_forward_hidden_act: str = "relu" + memory_attention_dropout: float | int = 0.1 + memory_attention_rope_theta: int = 10000 + memory_attention_rope_feat_sizes: list[int] | None = None + memory_attention_rope_dropout: float | int = 0.1 + memory_encoder_hidden_size: int = 256 + memory_encoder_output_channels: int = 64 + mask_downsampler_embed_dim: int = 256 + mask_downsampler_kernel_size: int = 3 + mask_downsampler_stride: int = 2 + mask_downsampler_padding: int = 1 + mask_downsampler_total_stride: int = 16 + mask_downsampler_hidden_act: str = "gelu" + memory_fuser_num_layers: int = 2 + memory_fuser_embed_dim: int = 256 + memory_fuser_intermediate_dim: int = 1024 + memory_fuser_kernel_size: int = 7 + memory_fuser_padding: int = 3 + memory_fuser_layer_scale_init_value: float = 1e-6 + memory_fuser_hidden_act: str = "gelu" - vision_config = vision_config if vision_config is not None else {} - prompt_encoder_config = prompt_encoder_config if prompt_encoder_config is not None else {} - mask_decoder_config = mask_decoder_config if mask_decoder_config is not None else {} - 
memory_attention_rope_feat_sizes = ( - [64, 64] if memory_attention_rope_feat_sizes is None else memory_attention_rope_feat_sizes + def __post_init__(self, **kwargs): + self.memory_attention_rope_feat_sizes = ( + [64, 64] if self.memory_attention_rope_feat_sizes is None else self.memory_attention_rope_feat_sizes ) - if isinstance(vision_config, dict): - vision_config["model_type"] = vision_config.get("model_type", "sam2_vision_model") - vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) - if isinstance(prompt_encoder_config, Sam2VideoPromptEncoderConfig): - prompt_encoder_config = prompt_encoder_config.to_dict() - if isinstance(mask_decoder_config, Sam2VideoMaskDecoderConfig): - mask_decoder_config = mask_decoder_config.to_dict() - - self.vision_config = vision_config - self.prompt_encoder_config = Sam2VideoPromptEncoderConfig(**prompt_encoder_config) - self.mask_decoder_config = Sam2VideoMaskDecoderConfig(**mask_decoder_config) + if isinstance(self.vision_config, dict): + self.vision_config["model_type"] = self.vision_config.get("model_type", "sam2_vision_model") + self.vision_config = CONFIG_MAPPING[self.vision_config["model_type"]](**self.vision_config) + elif self.vision_config is None: + self.vision_config = CONFIG_MAPPING["sam2_vision_model"]() - self.initializer_range = initializer_range - self.num_maskmem = num_maskmem # default 1 input frame + 6 previous frames - self.image_size = image_size - self.sigmoid_scale_for_mem_enc = sigmoid_scale_for_mem_enc - self.sigmoid_bias_for_mem_enc = sigmoid_bias_for_mem_enc - self.multimask_output_in_sam = multimask_output_in_sam - self.multimask_min_pt_num = multimask_min_pt_num - self.multimask_max_pt_num = multimask_max_pt_num - self.multimask_output_for_tracking = multimask_output_for_tracking - self.max_object_pointers_in_encoder = max_object_pointers_in_encoder - self.max_cond_frame_num = max_cond_frame_num - # The next 4 are True for sam2.1 and False for sam2 - self.enable_occlusion_spatial_embedding = enable_occlusion_spatial_embedding - self.enable_temporal_pos_encoding_for_object_pointers = enable_temporal_pos_encoding_for_object_pointers + if isinstance(self.prompt_encoder_config, dict): + self.prompt_encoder_config = Sam2VideoPromptEncoderConfig(**self.prompt_encoder_config) + elif self.prompt_encoder_config is None: + self.prompt_encoder_config = Sam2VideoPromptEncoderConfig() - # memory attention - self.memory_attention_hidden_size = memory_attention_hidden_size - self.memory_attention_num_layers = memory_attention_num_layers - self.memory_attention_num_attention_heads = memory_attention_num_attention_heads - self.memory_attention_downsample_rate = memory_attention_downsample_rate - self.memory_attention_feed_forward_hidden_size = memory_attention_feed_forward_hidden_size - self.memory_attention_feed_forward_hidden_act = memory_attention_feed_forward_hidden_act - self.memory_attention_dropout = memory_attention_dropout - self.memory_attention_rope_theta = memory_attention_rope_theta - self.memory_attention_rope_feat_sizes = memory_attention_rope_feat_sizes - self.memory_attention_rope_dropout = memory_attention_rope_dropout + if isinstance(self.mask_decoder_config, dict): + self.mask_decoder_config = Sam2VideoMaskDecoderConfig(**self.mask_decoder_config) + elif self.mask_decoder_config is None: + self.mask_decoder_config = Sam2VideoMaskDecoderConfig() - # memory encoder - self.memory_encoder_hidden_size = memory_encoder_hidden_size - self.memory_encoder_output_channels = memory_encoder_output_channels - 
self.mask_downsampler_embed_dim = mask_downsampler_embed_dim - self.mask_downsampler_kernel_size = mask_downsampler_kernel_size - self.mask_downsampler_stride = mask_downsampler_stride - self.mask_downsampler_padding = mask_downsampler_padding - self.mask_downsampler_total_stride = mask_downsampler_total_stride - self.mask_downsampler_hidden_act = mask_downsampler_hidden_act - self.memory_fuser_num_layers = memory_fuser_num_layers - self.memory_fuser_embed_dim = memory_fuser_embed_dim - self.memory_fuser_intermediate_dim = memory_fuser_intermediate_dim - self.memory_fuser_kernel_size = memory_fuser_kernel_size - self.memory_fuser_padding = memory_fuser_padding - self.memory_fuser_layer_scale_init_value = memory_fuser_layer_scale_init_value - self.memory_fuser_hidden_act = memory_fuser_hidden_act + super().__post_init__(**kwargs) __all__ = ["Sam2VideoMaskDecoderConfig", "Sam2VideoPromptEncoderConfig", "Sam2VideoConfig"] diff --git a/src/transformers/models/sam2_video/modular_sam2_video.py b/src/transformers/models/sam2_video/modular_sam2_video.py index 68389853c49e..548e02f67435 100644 --- a/src/transformers/models/sam2_video/modular_sam2_video.py +++ b/src/transformers/models/sam2_video/modular_sam2_video.py @@ -23,6 +23,7 @@ import torch import torch.nn as nn import torch.nn.functional as F +from huggingface_hub.dataclasses import strict from torch import Tensor from tqdm import tqdm @@ -71,6 +72,7 @@ class Sam2VideoMaskDecoderConfig(Sam2MaskDecoderConfig): @auto_docstring(checkpoint="facebook/sam2.1-hiera-tiny") +@strict(accept_kwargs=True) class Sam2VideoConfig(PreTrainedConfig): r""" prompt_encoder_config (Union[`dict`, `Sam2PromptEncoderConfig`], *optional*): @@ -188,117 +190,70 @@ class Sam2VideoConfig(PreTrainedConfig): "mask_decoder_config": Sam2VideoMaskDecoderConfig, } - def __init__( - self, - vision_config=None, - prompt_encoder_config=None, - mask_decoder_config=None, - initializer_range=0.02, - num_maskmem=7, - image_size=1024, - sigmoid_scale_for_mem_enc=20.0, - sigmoid_bias_for_mem_enc=-10.0, - enable_occlusion_spatial_embedding=True, - multimask_output_in_sam=True, - multimask_min_pt_num=0, - multimask_max_pt_num=1, - multimask_output_for_tracking=True, - max_object_pointers_in_encoder=16, - max_cond_frame_num=-1, - enable_temporal_pos_encoding_for_object_pointers=True, - # memory attention - memory_attention_hidden_size=256, - memory_attention_num_layers=4, - memory_attention_num_attention_heads=1, - memory_attention_downsample_rate=1, - memory_attention_feed_forward_hidden_size=2048, - memory_attention_feed_forward_hidden_act="relu", - memory_attention_dropout=0.1, - memory_attention_rope_theta=10000, - memory_attention_rope_feat_sizes=None, - memory_attention_rope_dropout=0.1, - # memory encoder - memory_encoder_hidden_size=256, - memory_encoder_output_channels=64, - mask_downsampler_embed_dim=256, - mask_downsampler_kernel_size=3, - mask_downsampler_stride=2, - mask_downsampler_padding=1, - mask_downsampler_total_stride=16, - mask_downsampler_hidden_act="gelu", - memory_fuser_num_layers=2, - memory_fuser_embed_dim=256, - memory_fuser_intermediate_dim=1024, - memory_fuser_kernel_size=7, - memory_fuser_padding=3, - memory_fuser_layer_scale_init_value=1e-6, - memory_fuser_hidden_act="gelu", - **kwargs, - ): - super().__init__(**kwargs) - - vision_config = vision_config if vision_config is not None else {} - prompt_encoder_config = prompt_encoder_config if prompt_encoder_config is not None else {} - mask_decoder_config = mask_decoder_config if mask_decoder_config is not 
None else {} - memory_attention_rope_feat_sizes = ( - [64, 64] if memory_attention_rope_feat_sizes is None else memory_attention_rope_feat_sizes + vision_config: dict | PreTrainedConfig | None = None + prompt_encoder_config: dict | PreTrainedConfig | None = None + mask_decoder_config: dict | PreTrainedConfig | None = None + initializer_range: float = 0.02 + num_maskmem: int = 7 + image_size: int | list[int] | tuple[int, int] = 1024 + sigmoid_scale_for_mem_enc: float = 20.0 + sigmoid_bias_for_mem_enc: float = -10.0 + enable_occlusion_spatial_embedding: bool = True + multimask_output_in_sam: bool = True + multimask_min_pt_num: int = 0 + multimask_max_pt_num: int = 1 + multimask_output_for_tracking: bool = True + max_object_pointers_in_encoder: int = 16 + max_cond_frame_num: int = -1 + enable_temporal_pos_encoding_for_object_pointers: bool = True + memory_attention_hidden_size: int = 256 + memory_attention_num_layers: int = 4 + memory_attention_num_attention_heads: int = 1 + memory_attention_downsample_rate: int = 1 + memory_attention_feed_forward_hidden_size: int = 2048 + memory_attention_feed_forward_hidden_act: str = "relu" + memory_attention_dropout: float | int = 0.1 + memory_attention_rope_theta: int = 10000 + memory_attention_rope_feat_sizes: list[int] | None = None + memory_attention_rope_dropout: float | int = 0.1 + memory_encoder_hidden_size: int = 256 + memory_encoder_output_channels: int = 64 + mask_downsampler_embed_dim: int = 256 + mask_downsampler_kernel_size: int = 3 + mask_downsampler_stride: int = 2 + mask_downsampler_padding: int = 1 + mask_downsampler_total_stride: int = 16 + mask_downsampler_hidden_act: str = "gelu" + memory_fuser_num_layers: int = 2 + memory_fuser_embed_dim: int = 256 + memory_fuser_intermediate_dim: int = 1024 + memory_fuser_kernel_size: int = 7 + memory_fuser_padding: int = 3 + memory_fuser_layer_scale_init_value: float = 1e-6 + memory_fuser_hidden_act: str = "gelu" + + def __post_init__(self, **kwargs): + self.memory_attention_rope_feat_sizes = ( + [64, 64] if self.memory_attention_rope_feat_sizes is None else self.memory_attention_rope_feat_sizes ) - if isinstance(vision_config, dict): - vision_config["model_type"] = vision_config.get("model_type", "sam2_vision_model") - vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) - if isinstance(prompt_encoder_config, Sam2VideoPromptEncoderConfig): - prompt_encoder_config = prompt_encoder_config.to_dict() - if isinstance(mask_decoder_config, Sam2VideoMaskDecoderConfig): - mask_decoder_config = mask_decoder_config.to_dict() - - self.vision_config = vision_config - self.prompt_encoder_config = Sam2VideoPromptEncoderConfig(**prompt_encoder_config) - self.mask_decoder_config = Sam2VideoMaskDecoderConfig(**mask_decoder_config) - - self.initializer_range = initializer_range - self.num_maskmem = num_maskmem # default 1 input frame + 6 previous frames - self.image_size = image_size - self.sigmoid_scale_for_mem_enc = sigmoid_scale_for_mem_enc - self.sigmoid_bias_for_mem_enc = sigmoid_bias_for_mem_enc - self.multimask_output_in_sam = multimask_output_in_sam - self.multimask_min_pt_num = multimask_min_pt_num - self.multimask_max_pt_num = multimask_max_pt_num - self.multimask_output_for_tracking = multimask_output_for_tracking - self.max_object_pointers_in_encoder = max_object_pointers_in_encoder - self.max_cond_frame_num = max_cond_frame_num - # The next 4 are True for sam2.1 and False for sam2 - self.enable_occlusion_spatial_embedding = enable_occlusion_spatial_embedding - 
self.enable_temporal_pos_encoding_for_object_pointers = enable_temporal_pos_encoding_for_object_pointers - - # memory attention - self.memory_attention_hidden_size = memory_attention_hidden_size - self.memory_attention_num_layers = memory_attention_num_layers - self.memory_attention_num_attention_heads = memory_attention_num_attention_heads - self.memory_attention_downsample_rate = memory_attention_downsample_rate - self.memory_attention_feed_forward_hidden_size = memory_attention_feed_forward_hidden_size - self.memory_attention_feed_forward_hidden_act = memory_attention_feed_forward_hidden_act - self.memory_attention_dropout = memory_attention_dropout - self.memory_attention_rope_theta = memory_attention_rope_theta - self.memory_attention_rope_feat_sizes = memory_attention_rope_feat_sizes - self.memory_attention_rope_dropout = memory_attention_rope_dropout - - # memory encoder - self.memory_encoder_hidden_size = memory_encoder_hidden_size - self.memory_encoder_output_channels = memory_encoder_output_channels - self.mask_downsampler_embed_dim = mask_downsampler_embed_dim - self.mask_downsampler_kernel_size = mask_downsampler_kernel_size - self.mask_downsampler_stride = mask_downsampler_stride - self.mask_downsampler_padding = mask_downsampler_padding - self.mask_downsampler_total_stride = mask_downsampler_total_stride - self.mask_downsampler_hidden_act = mask_downsampler_hidden_act - self.memory_fuser_num_layers = memory_fuser_num_layers - self.memory_fuser_embed_dim = memory_fuser_embed_dim - self.memory_fuser_intermediate_dim = memory_fuser_intermediate_dim - self.memory_fuser_kernel_size = memory_fuser_kernel_size - self.memory_fuser_padding = memory_fuser_padding - self.memory_fuser_layer_scale_init_value = memory_fuser_layer_scale_init_value - self.memory_fuser_hidden_act = memory_fuser_hidden_act + if isinstance(self.vision_config, dict): + self.vision_config["model_type"] = self.vision_config.get("model_type", "sam2_vision_model") + self.vision_config = CONFIG_MAPPING[self.vision_config["model_type"]](**self.vision_config) + elif self.vision_config is None: + self.vision_config = CONFIG_MAPPING["sam2_vision_model"]() + + if isinstance(self.prompt_encoder_config, dict): + self.prompt_encoder_config = Sam2VideoPromptEncoderConfig(**self.prompt_encoder_config) + elif self.prompt_encoder_config is None: + self.prompt_encoder_config = Sam2VideoPromptEncoderConfig() + + if isinstance(self.mask_decoder_config, dict): + self.mask_decoder_config = Sam2VideoMaskDecoderConfig(**self.mask_decoder_config) + elif self.mask_decoder_config is None: + self.mask_decoder_config = Sam2VideoMaskDecoderConfig() + + super().__post_init__(**kwargs) class Sam2VideoInferenceCache: diff --git a/src/transformers/models/sam3/configuration_sam3.py b/src/transformers/models/sam3/configuration_sam3.py index 06984624a099..53b41eb7f4cf 100644 --- a/src/transformers/models/sam3/configuration_sam3.py +++ b/src/transformers/models/sam3/configuration_sam3.py @@ -13,6 +13,8 @@ # limitations under the License. 
"""SAM3 model configuration""" +from huggingface_hub.dataclasses import strict + from transformers import CLIPTextConfig from ...configuration_utils import PreTrainedConfig @@ -21,6 +23,7 @@ @auto_docstring(checkpoint="facebook/sam3") +@strict(accept_kwargs=True) class Sam3ViTConfig(PreTrainedConfig): r""" rope_theta (`float`, *optional*, defaults to 10000.0): @@ -38,51 +41,32 @@ class Sam3ViTConfig(PreTrainedConfig): base_config_key = "backbone_config" model_type = "sam3_vit_model" - def __init__( - self, - hidden_size=1024, - intermediate_size=4736, - num_hidden_layers=32, - num_attention_heads=16, - num_channels=3, - image_size=1008, - patch_size=14, - hidden_act="gelu", - layer_norm_eps=1e-6, - attention_dropout=0.0, - rope_theta=10000.0, - window_size=24, - global_attn_indexes=None, - layer_scale_init_value=None, - pretrain_image_size=336, - hidden_dropout=0.0, - initializer_range=0.02, - **kwargs, - ): - super().__init__(**kwargs) - if global_attn_indexes is None: - global_attn_indexes = [7, 15, 23, 31] - - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_channels = num_channels - self.image_size = image_size - self.patch_size = patch_size - self.hidden_act = hidden_act - self.layer_norm_eps = layer_norm_eps - self.attention_dropout = attention_dropout - self.rope_theta = rope_theta - self.window_size = window_size - self.global_attn_indexes = global_attn_indexes - self.layer_scale_init_value = layer_scale_init_value - self.pretrain_image_size = pretrain_image_size - self.hidden_dropout = hidden_dropout - self.initializer_range = initializer_range + hidden_size: int = 1024 + intermediate_size: int = 4736 + num_hidden_layers: int = 32 + num_attention_heads: int = 16 + num_channels: int = 3 + image_size: int | list[int] | tuple[int, int] = 1008 + patch_size: int | list[int] | tuple[int, int] = 14 + hidden_act: str = "gelu" + layer_norm_eps: float = 1e-6 + attention_dropout: float | int = 0.0 + rope_theta: float = 10000.0 + window_size: int = 24 + global_attn_indexes: list[int] | None = None + layer_scale_init_value: float | None = None + pretrain_image_size: int | list[int] | tuple[int, int] = 336 + hidden_dropout: float | int = 0.0 + initializer_range: float = 0.02 + + def __post_init__(self, **kwargs): + super().__post_init__(**kwargs) + if self.global_attn_indexes is None: + self.global_attn_indexes = [7, 15, 23, 31] @auto_docstring(checkpoint="facebook/sam3") +@strict(accept_kwargs=True) class Sam3VisionConfig(PreTrainedConfig): r""" fpn_hidden_size (`int`, *optional*, defaults to 256): @@ -99,38 +83,26 @@ class Sam3VisionConfig(PreTrainedConfig): "backbone_config": AutoConfig, } - def __init__( - self, - backbone_config=None, - fpn_hidden_size=256, - backbone_feature_sizes=None, - scale_factors=None, - hidden_act="gelu", - layer_norm_eps=1e-6, - initializer_range=0.02, - **kwargs, - ): - scale_factors = [4.0, 2.0, 1.0, 0.5] if scale_factors is None else scale_factors - if backbone_feature_sizes is None: - backbone_feature_sizes = [[288, 288], [144, 144], [72, 72]] - - if isinstance(backbone_config, dict): - backbone_config["model_type"] = backbone_config.get("model_type", "sam3_vit_model") - backbone_config = CONFIG_MAPPING[backbone_config["model_type"]](**backbone_config) - elif backbone_config is None: - backbone_config = CONFIG_MAPPING["sam3_vit_model"]() - - self.backbone_config = backbone_config - - # Neck - self.fpn_hidden_size = fpn_hidden_size 
- self.scale_factors = scale_factors - self.backbone_feature_sizes = backbone_feature_sizes - - self.hidden_act = hidden_act - self.layer_norm_eps = layer_norm_eps - self.initializer_range = initializer_range - super().__init__(**kwargs) + backbone_config: dict | PreTrainedConfig | None = None + fpn_hidden_size: int = 256 + backbone_feature_sizes: list | None = None + scale_factors: list[float] | None = None + hidden_act: str = "gelu" + layer_norm_eps: float = 1e-6 + initializer_range: float = 0.02 + + def __post_init__(self, **kwargs): + self.scale_factors = [4.0, 2.0, 1.0, 0.5] if self.scale_factors is None else self.scale_factors + if self.backbone_feature_sizes is None: + self.backbone_feature_sizes = [[288, 288], [144, 144], [72, 72]] + + if isinstance(self.backbone_config, dict): + self.backbone_config["model_type"] = self.backbone_config.get("model_type", "sam3_vit_model") + self.backbone_config = CONFIG_MAPPING[self.backbone_config["model_type"]](**self.backbone_config) + elif self.backbone_config is None: + self.backbone_config = CONFIG_MAPPING["sam3_vit_model"]() + + super().__post_init__(**kwargs) @property def image_size(self): @@ -144,6 +116,7 @@ def image_size(self, value): @auto_docstring(checkpoint="facebook/sam3") +@strict(accept_kwargs=True) class Sam3GeometryEncoderConfig(PreTrainedConfig): r""" roi_size (`int`, *optional*, defaults to 7): @@ -152,34 +125,20 @@ class Sam3GeometryEncoderConfig(PreTrainedConfig): model_type = "sam3_geometry_encoder" - def __init__( - self, - hidden_size=256, - num_layers=3, - num_attention_heads=8, - intermediate_size=2048, - dropout=0.1, - hidden_act="relu", - hidden_dropout=0.0, - layer_norm_eps=1e-6, - roi_size=7, - initializer_range=0.02, - **kwargs, - ): - super().__init__(**kwargs) - self.hidden_size = hidden_size - self.num_layers = num_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.dropout = dropout - self.hidden_act = hidden_act - self.hidden_dropout = hidden_dropout - self.layer_norm_eps = layer_norm_eps - self.roi_size = roi_size - self.initializer_range = initializer_range + hidden_size: int = 256 + num_layers: int = 3 + num_attention_heads: int = 8 + intermediate_size: int = 2048 + dropout: float | int = 0.1 + hidden_act: str = "relu" + hidden_dropout: float | int = 0.0 + layer_norm_eps: float = 1e-6 + roi_size: int = 7 + initializer_range: float = 0.02 @auto_docstring(checkpoint="facebook/sam3") +@strict(accept_kwargs=True) class Sam3DETREncoderConfig(PreTrainedConfig): r""" hidden_dropout (`float`, *optional*, defaults to 0.0): @@ -188,32 +147,19 @@ class Sam3DETREncoderConfig(PreTrainedConfig): model_type = "sam3_detr_encoder" - def __init__( - self, - hidden_size=256, - num_layers=6, - num_attention_heads=8, - intermediate_size=2048, - dropout=0.1, - hidden_act="relu", - hidden_dropout=0.0, - layer_norm_eps=1e-6, - initializer_range=0.02, - **kwargs, - ): - super().__init__(**kwargs) - self.hidden_size = hidden_size - self.num_layers = num_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.dropout = dropout - self.hidden_act = hidden_act - self.hidden_dropout = hidden_dropout - self.layer_norm_eps = layer_norm_eps - self.initializer_range = initializer_range + hidden_size: int = 256 + num_layers: int = 6 + num_attention_heads: int = 8 + intermediate_size: int = 2048 + dropout: float | int = 0.1 + hidden_act: str = "relu" + hidden_dropout: float | int = 0.0 + layer_norm_eps: float = 1e-6 + initializer_range: 
float = 0.02 @auto_docstring(checkpoint="facebook/sam3") +@strict(accept_kwargs=True) class Sam3DETRDecoderConfig(PreTrainedConfig): r""" num_queries (`int`, *optional*, defaults to 200): @@ -222,34 +168,20 @@ class Sam3DETRDecoderConfig(PreTrainedConfig): model_type = "sam3_detr_decoder" - def __init__( - self, - hidden_size=256, - num_layers=6, - num_queries=200, - num_attention_heads=8, - intermediate_size=2048, - dropout=0.1, - hidden_act="relu", - hidden_dropout=0.0, - layer_norm_eps=1e-6, - initializer_range=0.02, - **kwargs, - ): - super().__init__(**kwargs) - self.hidden_size = hidden_size - self.num_layers = num_layers - self.num_queries = num_queries - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.dropout = dropout - self.hidden_act = hidden_act - self.hidden_dropout = hidden_dropout - self.layer_norm_eps = layer_norm_eps - self.initializer_range = initializer_range + hidden_size: int = 256 + num_layers: int = 6 + num_queries: int = 200 + num_attention_heads: int = 8 + intermediate_size: int = 2048 + dropout: float | int = 0.1 + hidden_act: str = "relu" + hidden_dropout: float | int = 0.0 + layer_norm_eps: float = 1e-6 + initializer_range: float = 0.02 @auto_docstring(checkpoint="facebook/sam3") +@strict(accept_kwargs=True) class Sam3MaskDecoderConfig(PreTrainedConfig): r""" num_upsampling_stages (`int`, *optional*, defaults to 3): @@ -258,26 +190,16 @@ class Sam3MaskDecoderConfig(PreTrainedConfig): model_type = "sam3_mask_decoder" - def __init__( - self, - hidden_size=256, - num_upsampling_stages=3, - layer_norm_eps=1e-6, - dropout=0.0, - num_attention_heads=8, - initializer_range=0.02, - **kwargs, - ): - super().__init__(**kwargs) - self.hidden_size = hidden_size - self.num_upsampling_stages = num_upsampling_stages - self.layer_norm_eps = layer_norm_eps - self.dropout = dropout - self.num_attention_heads = num_attention_heads - self.initializer_range = initializer_range + hidden_size: int = 256 + num_upsampling_stages: int = 3 + layer_norm_eps: float = 1e-6 + dropout: float | int = 0.0 + num_attention_heads: int = 8 + initializer_range: float = 0.02 @auto_docstring(checkpoint="facebook/sam3") +@strict(accept_kwargs=True) class Sam3Config(PreTrainedConfig): r""" geometry_encoder_config (`dict` or `Sam3GeometryEncoderConfig`, *optional*): @@ -315,76 +237,57 @@ class Sam3Config(PreTrainedConfig): "mask_decoder_config": Sam3MaskDecoderConfig, } - def __init__( - self, - vision_config=None, - text_config=None, - geometry_encoder_config=None, - detr_encoder_config=None, - detr_decoder_config=None, - mask_decoder_config=None, - initializer_range=0.02, - **kwargs, - ): - # Vision config - if vision_config is None: - vision_config = {} - if isinstance(vision_config, dict): - self.vision_config = Sam3VisionConfig(**vision_config) - else: - self.vision_config = vision_config - - # Text config (CLIPTextModelWithProjection defaults) - if text_config is None: - text_config = { - "vocab_size": 49408, - "hidden_size": 1024, - "intermediate_size": 4096, # hidden_size * mlp_ratio (1024 * 4) - "projection_dim": 512, # CLIP's internal projection dimension - "num_hidden_layers": 24, - "num_attention_heads": 16, - "max_position_embeddings": 32, - "hidden_act": "gelu", - } - if isinstance(text_config, dict): - self.text_config = CLIPTextConfig(**text_config) - else: - self.text_config = text_config - - # Geometry encoder config - if geometry_encoder_config is None: - geometry_encoder_config = {} - if isinstance(geometry_encoder_config, dict): - 
self.geometry_encoder_config = Sam3GeometryEncoderConfig(**geometry_encoder_config) - else: - self.geometry_encoder_config = geometry_encoder_config - - # DETR encoder config - if detr_encoder_config is None: - detr_encoder_config = {} - if isinstance(detr_encoder_config, dict): - self.detr_encoder_config = Sam3DETREncoderConfig(**detr_encoder_config) - else: - self.detr_encoder_config = detr_encoder_config - - # DETR decoder config - if detr_decoder_config is None: - detr_decoder_config = {} - if isinstance(detr_decoder_config, dict): - self.detr_decoder_config = Sam3DETRDecoderConfig(**detr_decoder_config) - else: - self.detr_decoder_config = detr_decoder_config - - # Mask decoder config - if mask_decoder_config is None: - mask_decoder_config = {} - if isinstance(mask_decoder_config, dict): - self.mask_decoder_config = Sam3MaskDecoderConfig(**mask_decoder_config) - else: - self.mask_decoder_config = mask_decoder_config - - self.initializer_range = initializer_range - super().__init__(**kwargs) + vision_config: dict | PreTrainedConfig | None = None + text_config: dict | PreTrainedConfig | None = None + geometry_encoder_config: dict | PreTrainedConfig | None = None + detr_encoder_config: dict | PreTrainedConfig | None = None + detr_decoder_config: dict | PreTrainedConfig | None = None + mask_decoder_config: dict | PreTrainedConfig | None = None + initializer_range: float = 0.02 + + def __post_init__(self, **kwargs): + if self.vision_config is None: + self.vision_config = Sam3VisionConfig() + if isinstance(self.vision_config, dict): + self.vision_config = Sam3VisionConfig(**self.vision_config) + + if self.text_config is None: + self.text_config = CLIPTextConfig( + **{ + "vocab_size": 49408, + "hidden_size": 1024, + "intermediate_size": 4096, # hidden_size * mlp_ratio (1024 * 4) + "projection_dim": 512, # CLIP's internal projection dimension + "num_hidden_layers": 24, + "num_attention_heads": 16, + "max_position_embeddings": 32, + "hidden_act": "gelu", + } + ) + if isinstance(self.text_config, dict): + self.text_config = CLIPTextConfig(**self.text_config) + + if self.geometry_encoder_config is None: + self.geometry_encoder_config = Sam3GeometryEncoderConfig() + if isinstance(self.geometry_encoder_config, dict): + self.geometry_encoder_config = Sam3GeometryEncoderConfig(**self.geometry_encoder_config) + + if self.detr_encoder_config is None: + self.detr_encoder_config = Sam3DETREncoderConfig() + if isinstance(self.detr_encoder_config, dict): + self.detr_encoder_config = Sam3DETREncoderConfig(**self.detr_encoder_config) + + if self.detr_decoder_config is None: + self.detr_decoder_config = Sam3DETRDecoderConfig() + if isinstance(self.detr_decoder_config, dict): + self.detr_decoder_config = Sam3DETRDecoderConfig(**self.detr_decoder_config) + + if self.mask_decoder_config is None: + self.mask_decoder_config = Sam3MaskDecoderConfig() + if isinstance(self.mask_decoder_config, dict): + self.mask_decoder_config = Sam3MaskDecoderConfig(**self.mask_decoder_config) + + super().__post_init__(**kwargs) @property def image_size(self): diff --git a/src/transformers/models/sam3_tracker/configuration_sam3_tracker.py b/src/transformers/models/sam3_tracker/configuration_sam3_tracker.py index 553cae852c0f..2a0f09fa845f 100644 --- a/src/transformers/models/sam3_tracker/configuration_sam3_tracker.py +++ b/src/transformers/models/sam3_tracker/configuration_sam3_tracker.py @@ -18,12 +18,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+ +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring from ..auto import CONFIG_MAPPING, AutoConfig @auto_docstring(checkpoint="facebook/sam3") +@strict(accept_kwargs=True) class Sam3TrackerPromptEncoderConfig(PreTrainedConfig): r""" mask_input_channels (`int`, *optional*, defaults to 16): @@ -36,30 +40,19 @@ class Sam3TrackerPromptEncoderConfig(PreTrainedConfig): base_config_key = "prompt_encoder_config" - def __init__( - self, - hidden_size=256, - image_size=1008, - patch_size=14, - mask_input_channels=16, - num_point_embeddings=4, - hidden_act="gelu", - layer_norm_eps=1e-6, - scale=1, - **kwargs, - ): - super().__init__(**kwargs) - self.hidden_size = hidden_size - self.image_size = image_size - self.patch_size = patch_size - self.mask_input_channels = mask_input_channels - self.num_point_embeddings = num_point_embeddings - self.hidden_act = hidden_act - self.layer_norm_eps = layer_norm_eps - self.scale = scale + hidden_size: int = 256 + + image_size: int | list[int] | tuple[int, int] = 1008 + patch_size: int | list[int] | tuple[int, int] = 14 + mask_input_channels: int = 16 + num_point_embeddings: int = 4 + hidden_act: str = "gelu" + layer_norm_eps: float = 1e-6 + scale: int = 1 @auto_docstring(checkpoint="facebook/sam3") +@strict(accept_kwargs=True) class Sam3TrackerMaskDecoderConfig(PreTrainedConfig): r""" mlp_dim (`int`, *optional*, defaults to 2048): @@ -82,42 +75,22 @@ class Sam3TrackerMaskDecoderConfig(PreTrainedConfig): base_config_key = "mask_decoder_config" - def __init__( - self, - hidden_size=256, - hidden_act="gelu", - mlp_dim=2048, - num_hidden_layers=2, - num_attention_heads=8, - attention_downsample_rate=2, - num_multimask_outputs=3, - iou_head_depth=3, - iou_head_hidden_dim=256, - dynamic_multimask_via_stability=True, - dynamic_multimask_stability_delta=0.05, - dynamic_multimask_stability_thresh=0.98, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.num_multimask_outputs = num_multimask_outputs - self.hidden_act = hidden_act - self.iou_head_depth = iou_head_depth - self.iou_head_hidden_dim = iou_head_hidden_dim - self.dynamic_multimask_via_stability = dynamic_multimask_via_stability - self.dynamic_multimask_stability_delta = dynamic_multimask_stability_delta - self.dynamic_multimask_stability_thresh = dynamic_multimask_stability_thresh - - # TwoWayTransformer configuration - self.num_hidden_layers = num_hidden_layers - self.hidden_size = hidden_size - self.num_attention_heads = num_attention_heads - self.mlp_dim = mlp_dim - self.attention_downsample_rate = attention_downsample_rate + hidden_size: int = 256 + hidden_act: str = "gelu" + mlp_dim: int = 2048 + num_hidden_layers: int = 2 + num_attention_heads: int = 8 + attention_downsample_rate: int = 2 + num_multimask_outputs: int = 3 + iou_head_depth: int = 3 + iou_head_hidden_dim: int = 256 + dynamic_multimask_via_stability: bool = True + dynamic_multimask_stability_delta: float = 0.05 + dynamic_multimask_stability_thresh: float = 0.98 @auto_docstring(checkpoint="facebook/sam3") +@strict(accept_kwargs=True) class Sam3TrackerConfig(PreTrainedConfig): r""" prompt_encoder_config (Union[`dict`, `Sam3TrackerPromptEncoderConfig`], *optional*): @@ -161,36 +134,31 @@ class Sam3TrackerConfig(PreTrainedConfig): "mask_decoder_config": Sam3TrackerMaskDecoderConfig, } - def __init__( - self, - vision_config=None, - prompt_encoder_config=None, - mask_decoder_config=None, - initializer_range=0.02, - **kwargs, - ): 
- vision_config = ( - vision_config - if vision_config is not None - else {"backbone_feature_sizes": [[288, 288], [144, 144], [72, 72]]} - ) - prompt_encoder_config = prompt_encoder_config if prompt_encoder_config is not None else {} - mask_decoder_config = mask_decoder_config if mask_decoder_config is not None else {} - - if isinstance(vision_config, dict): - vision_config["model_type"] = vision_config.get("model_type", "sam3_vision_model") - vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) - if isinstance(prompt_encoder_config, Sam3TrackerPromptEncoderConfig): - prompt_encoder_config = prompt_encoder_config.to_dict() - if isinstance(mask_decoder_config, Sam3TrackerMaskDecoderConfig): - mask_decoder_config = mask_decoder_config.to_dict() - - self.vision_config = vision_config - self.prompt_encoder_config = Sam3TrackerPromptEncoderConfig(**prompt_encoder_config) - self.mask_decoder_config = Sam3TrackerMaskDecoderConfig(**mask_decoder_config) - - self.initializer_range = initializer_range - super().__init__(**kwargs) + vision_config: dict | PreTrainedConfig | None = None + prompt_encoder_config: dict | PreTrainedConfig | None = None + mask_decoder_config: dict | PreTrainedConfig | None = None + initializer_range: float = 0.02 + + def __post_init__(self, **kwargs): + if isinstance(self.vision_config, dict): + self.vision_config["model_type"] = self.vision_config.get("model_type", "sam3_vision_model") + self.vision_config = CONFIG_MAPPING[self.vision_config["model_type"]](**self.vision_config) + elif self.vision_config is None: + self.vision_config = CONFIG_MAPPING["sam3_vision_model"]( + backbone_feature_sizes=[[288, 288], [144, 144], [72, 72]] + ) + + if isinstance(self.prompt_encoder_config, dict): + self.prompt_encoder_config = Sam3TrackerPromptEncoderConfig(**self.prompt_encoder_config) + elif self.prompt_encoder_config is None: + self.prompt_encoder_config = Sam3TrackerPromptEncoderConfig() + + if isinstance(self.mask_decoder_config, dict): + self.mask_decoder_config = Sam3TrackerMaskDecoderConfig(**self.mask_decoder_config) + elif self.mask_decoder_config is None: + self.mask_decoder_config = Sam3TrackerMaskDecoderConfig() + + super().__post_init__(**kwargs) __all__ = ["Sam3TrackerConfig", "Sam3TrackerPromptEncoderConfig", "Sam3TrackerMaskDecoderConfig"] diff --git a/src/transformers/models/sam3_tracker/modeling_sam3_tracker.py b/src/transformers/models/sam3_tracker/modeling_sam3_tracker.py index 651a8551bb92..ec6f9d072382 100644 --- a/src/transformers/models/sam3_tracker/modeling_sam3_tracker.py +++ b/src/transformers/models/sam3_tracker/modeling_sam3_tracker.py @@ -18,6 +18,7 @@ # See the License for the specific language governing permissions and # limitations under the License. + from collections.abc import Callable from dataclasses import dataclass diff --git a/src/transformers/models/sam3_tracker/modular_sam3_tracker.py b/src/transformers/models/sam3_tracker/modular_sam3_tracker.py index bab158d08a89..3de069e635c5 100644 --- a/src/transformers/models/sam3_tracker/modular_sam3_tracker.py +++ b/src/transformers/models/sam3_tracker/modular_sam3_tracker.py @@ -12,7 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. + import torch +from huggingface_hub.dataclasses import strict from ... 
import initialization as init from ...configuration_utils import PreTrainedConfig @@ -42,6 +44,7 @@ @auto_docstring(checkpoint="facebook/sam3") +@strict(accept_kwargs=True) class Sam3TrackerPromptEncoderConfig(Sam2PromptEncoderConfig): r""" mask_input_channels (`int`, *optional*, defaults to 16): @@ -54,32 +57,22 @@ class Sam3TrackerPromptEncoderConfig(Sam2PromptEncoderConfig): base_config_key = "prompt_encoder_config" - def __init__( - self, - hidden_size=256, - image_size=1008, - patch_size=14, - mask_input_channels=16, - num_point_embeddings=4, - hidden_act="gelu", - layer_norm_eps=1e-6, - scale=1, - **kwargs, - ): - super().__init__(**kwargs) + image_size: int | list[int] | tuple[int, int] = 1008 + patch_size: int | list[int] | tuple[int, int] = 14 -@auto_docstring(checkpoint="facebook/sam3") class Sam3TrackerProcessor(Sam2Processor): pass @auto_docstring(checkpoint="facebook/sam3") +@strict(accept_kwargs=True) class Sam3TrackerMaskDecoderConfig(Sam2MaskDecoderConfig): pass @auto_docstring(checkpoint="facebook/sam3") +@strict(accept_kwargs=True) class Sam3TrackerConfig(Sam2Config): r""" prompt_encoder_config (Union[`dict`, `Sam3TrackerPromptEncoderConfig`], *optional*): @@ -116,36 +109,26 @@ class Sam3TrackerConfig(Sam2Config): ``` """ - def __init__( - self, - vision_config=None, - prompt_encoder_config=None, - mask_decoder_config=None, - initializer_range=0.02, - **kwargs, - ): - vision_config = ( - vision_config - if vision_config is not None - else {"backbone_feature_sizes": [[288, 288], [144, 144], [72, 72]]} - ) - prompt_encoder_config = prompt_encoder_config if prompt_encoder_config is not None else {} - mask_decoder_config = mask_decoder_config if mask_decoder_config is not None else {} - - if isinstance(vision_config, dict): - vision_config["model_type"] = vision_config.get("model_type", "sam3_vision_model") - vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) - if isinstance(prompt_encoder_config, Sam3TrackerPromptEncoderConfig): - prompt_encoder_config = prompt_encoder_config.to_dict() - if isinstance(mask_decoder_config, Sam3TrackerMaskDecoderConfig): - mask_decoder_config = mask_decoder_config.to_dict() - - self.vision_config = vision_config - self.prompt_encoder_config = Sam3TrackerPromptEncoderConfig(**prompt_encoder_config) - self.mask_decoder_config = Sam3TrackerMaskDecoderConfig(**mask_decoder_config) - - self.initializer_range = initializer_range - PreTrainedConfig.__init__(**kwargs) + def __post_init__(self, **kwargs): + if isinstance(self.vision_config, dict): + self.vision_config["model_type"] = self.vision_config.get("model_type", "sam3_vision_model") + self.vision_config = CONFIG_MAPPING[self.vision_config["model_type"]](**self.vision_config) + elif self.vision_config is None: + self.vision_config = CONFIG_MAPPING["sam3_vision_model"]( + backbone_feature_sizes=[[288, 288], [144, 144], [72, 72]] + ) + + if isinstance(self.prompt_encoder_config, dict): + self.prompt_encoder_config = Sam3TrackerPromptEncoderConfig(**self.prompt_encoder_config) + elif self.prompt_encoder_config is None: + self.prompt_encoder_config = Sam3TrackerPromptEncoderConfig() + + if isinstance(self.mask_decoder_config, dict): + self.mask_decoder_config = Sam3TrackerMaskDecoderConfig(**self.mask_decoder_config) + elif self.mask_decoder_config is None: + self.mask_decoder_config = Sam3TrackerMaskDecoderConfig() + + PreTrainedConfig.__post_init__(**kwargs) class Sam3TrackerImageSegmentationOutput(Sam2ImageSegmentationOutput): diff --git 
a/src/transformers/models/sam3_tracker/processing_sam3_tracker.py b/src/transformers/models/sam3_tracker/processing_sam3_tracker.py index b48728a805d0..d9beb6ee65da 100644 --- a/src/transformers/models/sam3_tracker/processing_sam3_tracker.py +++ b/src/transformers/models/sam3_tracker/processing_sam3_tracker.py @@ -18,6 +18,7 @@ # See the License for the specific language governing permissions and # limitations under the License. + from copy import deepcopy import numpy as np @@ -27,9 +28,11 @@ from ...processing_utils import ProcessorMixin from ...tokenization_utils_base import BatchEncoding from ...utils import TensorType, auto_docstring +from ...utils.import_utils import requires -@auto_docstring(checkpoint="facebook/sam3") +@requires(backends=("torch",)) +@auto_docstring class Sam3TrackerProcessor(ProcessorMixin): def __init__(self, image_processor, target_size: int | None = None, point_pad_value: int = -10, **kwargs): r""" diff --git a/src/transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py b/src/transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py index 4b4fbcb81842..799ce486b25c 100644 --- a/src/transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +++ b/src/transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py @@ -19,12 +19,15 @@ # limitations under the License. +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring from ..auto import CONFIG_MAPPING, AutoConfig @auto_docstring(checkpoint="facebook/sam3") +@strict(accept_kwargs=True) class Sam3TrackerVideoPromptEncoderConfig(PreTrainedConfig): r""" mask_input_channels (`int`, *optional*, defaults to 16): @@ -37,30 +40,19 @@ class Sam3TrackerVideoPromptEncoderConfig(PreTrainedConfig): base_config_key = "prompt_encoder_config" - def __init__( - self, - hidden_size=256, - image_size=1008, - patch_size=14, - mask_input_channels=16, - num_point_embeddings=4, - hidden_act="gelu", - layer_norm_eps=1e-6, - scale=1, - **kwargs, - ): - super().__init__(**kwargs) - self.hidden_size = hidden_size - self.image_size = image_size - self.patch_size = patch_size - self.mask_input_channels = mask_input_channels - self.num_point_embeddings = num_point_embeddings - self.hidden_act = hidden_act - self.layer_norm_eps = layer_norm_eps - self.scale = scale + hidden_size: int = 256 + + image_size: int | list[int] | tuple[int, int] = 1008 + patch_size: int | list[int] | tuple[int, int] = 14 + mask_input_channels: int = 16 + num_point_embeddings: int = 4 + hidden_act: str = "gelu" + layer_norm_eps: float = 1e-6 + scale: int = 1 @auto_docstring(checkpoint="facebook/sam3") +@strict(accept_kwargs=True) class Sam3TrackerVideoMaskDecoderConfig(PreTrainedConfig): r""" mlp_dim (`int`, *optional*, defaults to 2048): @@ -83,42 +75,22 @@ class Sam3TrackerVideoMaskDecoderConfig(PreTrainedConfig): base_config_key = "mask_decoder_config" - def __init__( - self, - hidden_size=256, - hidden_act="gelu", - mlp_dim=2048, - num_hidden_layers=2, - num_attention_heads=8, - attention_downsample_rate=2, - num_multimask_outputs=3, - iou_head_depth=3, - iou_head_hidden_dim=256, - dynamic_multimask_via_stability=True, - dynamic_multimask_stability_delta=0.05, - dynamic_multimask_stability_thresh=0.98, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.num_multimask_outputs = num_multimask_outputs - self.hidden_act = hidden_act - self.iou_head_depth = iou_head_depth - 
self.iou_head_hidden_dim = iou_head_hidden_dim
-        self.dynamic_multimask_via_stability = dynamic_multimask_via_stability
-        self.dynamic_multimask_stability_delta = dynamic_multimask_stability_delta
-        self.dynamic_multimask_stability_thresh = dynamic_multimask_stability_thresh
-
-        # TwoWayTransformer configuration
-        self.num_hidden_layers = num_hidden_layers
-        self.hidden_size = hidden_size
-        self.num_attention_heads = num_attention_heads
-        self.mlp_dim = mlp_dim
-        self.attention_downsample_rate = attention_downsample_rate
+    hidden_size: int = 256
+    hidden_act: str = "gelu"
+    mlp_dim: int = 2048
+    num_hidden_layers: int = 2
+    num_attention_heads: int = 8
+    attention_downsample_rate: int = 2
+    num_multimask_outputs: int = 3
+    iou_head_depth: int = 3
+    iou_head_hidden_dim: int = 256
+    dynamic_multimask_via_stability: bool = True
+    dynamic_multimask_stability_delta: float = 0.05
+    dynamic_multimask_stability_thresh: float = 0.98

 @auto_docstring(checkpoint="facebook/sam3")
+@strict(accept_kwargs=True)
 class Sam3TrackerVideoConfig(PreTrainedConfig):
     r"""
     prompt_encoder_config (Union[`dict`, `Sam3TrackerVideoPromptEncoderConfig`], *optional*):
@@ -236,121 +208,72 @@ class Sam3TrackerVideoConfig(PreTrainedConfig):
         "mask_decoder_config": Sam3TrackerVideoMaskDecoderConfig,
     }

-    def __init__(
-        self,
-        vision_config=None,
-        prompt_encoder_config=None,
-        mask_decoder_config=None,
-        initializer_range=0.02,
-        num_maskmem=7,
-        image_size=1008,
-        sigmoid_scale_for_mem_enc=20.0,
-        sigmoid_bias_for_mem_enc=-10.0,
-        enable_occlusion_spatial_embedding=True,
-        multimask_output_in_sam=True,
-        multimask_min_pt_num=0,
-        multimask_max_pt_num=1,
-        multimask_output_for_tracking=True,
-        max_object_pointers_in_encoder=16,
-        max_cond_frame_num=4,
-        enable_temporal_pos_encoding_for_object_pointers=True,
-        # memory attention
-        memory_attention_hidden_size=256,
-        memory_attention_num_layers=4,
-        memory_attention_num_attention_heads=1,
-        memory_attention_downsample_rate=1,
-        memory_attention_feed_forward_hidden_size=2048,
-        memory_attention_feed_forward_hidden_act="relu",
-        memory_attention_dropout=0.1,
-        memory_attention_rope_theta=10000,
-        memory_attention_rope_feat_sizes=None,
-        memory_attention_rope_dropout=0.1,
-        # memory encoder
-        memory_encoder_hidden_size=256,
-        memory_encoder_output_channels=64,
-        mask_downsampler_embed_dim=256,
-        mask_downsampler_kernel_size=3,
-        mask_downsampler_stride=2,
-        mask_downsampler_padding=1,
-        mask_downsampler_total_stride=16,
-        mask_downsampler_hidden_act="gelu",
-        memory_fuser_num_layers=2,
-        memory_fuser_embed_dim=256,
-        memory_fuser_intermediate_dim=1024,
-        memory_fuser_kernel_size=7,
-        memory_fuser_padding=3,
-        memory_fuser_layer_scale_init_value=1e-6,
-        memory_fuser_hidden_act="gelu",
-        **kwargs,
-    ):
-        vision_config = (
-            vision_config
-            if vision_config is not None
-            else {"backbone_feature_sizes": [[288, 288], [144, 144], [72, 72]]}
-        )
-        prompt_encoder_config = prompt_encoder_config if prompt_encoder_config is not None else {}
-        mask_decoder_config = mask_decoder_config if mask_decoder_config is not None else {}
-        memory_attention_rope_feat_sizes = (
-            [72, 72] if memory_attention_rope_feat_sizes is None else memory_attention_rope_feat_sizes
+    vision_config: dict | PreTrainedConfig | None = None
+    prompt_encoder_config: dict | PreTrainedConfig | None = None
+    mask_decoder_config: dict | PreTrainedConfig | None = None
+    initializer_range: float = 0.02
+    num_maskmem: int = 7
+    sigmoid_scale_for_mem_enc: float = 20.0
+    sigmoid_bias_for_mem_enc: float = -10.0
+
enable_occlusion_spatial_embedding: bool = True + multimask_output_in_sam: bool = True + multimask_min_pt_num: int = 0 + multimask_max_pt_num: int = 1 + multimask_output_for_tracking: bool = True + max_object_pointers_in_encoder: int = 16 + max_cond_frame_num: int = 4 + enable_temporal_pos_encoding_for_object_pointers: bool = True + memory_attention_hidden_size: int = 256 + memory_attention_num_layers: int = 4 + memory_attention_num_attention_heads: int = 1 + memory_attention_downsample_rate: int = 1 + memory_attention_feed_forward_hidden_size: int = 2048 + memory_attention_feed_forward_hidden_act: str = "relu" + memory_attention_dropout: float | int = 0.1 + memory_attention_rope_theta: int = 10000 + memory_attention_rope_feat_sizes: list | None = None + memory_attention_rope_dropout: float | int = 0.1 + memory_encoder_hidden_size: int = 256 + memory_encoder_output_channels: int = 64 + mask_downsampler_embed_dim: int = 256 + mask_downsampler_kernel_size: int = 3 + mask_downsampler_stride: int = 2 + mask_downsampler_padding: int = 1 + mask_downsampler_total_stride: int = 16 + mask_downsampler_hidden_act: str = "gelu" + memory_fuser_num_layers: int = 2 + memory_fuser_embed_dim: int = 256 + memory_fuser_intermediate_dim: int = 1024 + memory_fuser_kernel_size: int = 7 + memory_fuser_padding: int = 3 + memory_fuser_layer_scale_init_value: float = 1e-6 + memory_fuser_hidden_act: str = "gelu" + + def __post_init__(self, **kwargs): + self.memory_attention_rope_feat_sizes = ( + [72, 72] if self.memory_attention_rope_feat_sizes is None else self.memory_attention_rope_feat_sizes ) - if isinstance(vision_config, dict): - vision_config["model_type"] = vision_config.get("model_type", "sam3_vision_model") - vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) - if isinstance(prompt_encoder_config, Sam3TrackerVideoPromptEncoderConfig): - prompt_encoder_config = prompt_encoder_config.to_dict() - if isinstance(mask_decoder_config, Sam3TrackerVideoMaskDecoderConfig): - mask_decoder_config = mask_decoder_config.to_dict() - - self.vision_config = vision_config - self.prompt_encoder_config = Sam3TrackerVideoPromptEncoderConfig(**prompt_encoder_config) - self.mask_decoder_config = Sam3TrackerVideoMaskDecoderConfig(**mask_decoder_config) - - self.initializer_range = initializer_range - self.num_maskmem = num_maskmem # default 1 input frame + 6 previous frames - self.image_size = image_size - self.sigmoid_scale_for_mem_enc = sigmoid_scale_for_mem_enc - self.sigmoid_bias_for_mem_enc = sigmoid_bias_for_mem_enc - self.multimask_output_in_sam = multimask_output_in_sam - self.multimask_min_pt_num = multimask_min_pt_num - self.multimask_max_pt_num = multimask_max_pt_num - self.multimask_output_for_tracking = multimask_output_for_tracking - self.max_object_pointers_in_encoder = max_object_pointers_in_encoder - self.max_cond_frame_num = max_cond_frame_num - # The next 4 are True for sam2.1 and False for sam2 - self.enable_occlusion_spatial_embedding = enable_occlusion_spatial_embedding - self.enable_temporal_pos_encoding_for_object_pointers = enable_temporal_pos_encoding_for_object_pointers - - # memory attention - self.memory_attention_hidden_size = memory_attention_hidden_size - self.memory_attention_num_layers = memory_attention_num_layers - self.memory_attention_num_attention_heads = memory_attention_num_attention_heads - self.memory_attention_downsample_rate = memory_attention_downsample_rate - self.memory_attention_feed_forward_hidden_size = memory_attention_feed_forward_hidden_size - 
self.memory_attention_feed_forward_hidden_act = memory_attention_feed_forward_hidden_act - self.memory_attention_dropout = memory_attention_dropout - self.memory_attention_rope_theta = memory_attention_rope_theta - self.memory_attention_rope_feat_sizes = memory_attention_rope_feat_sizes - self.memory_attention_rope_dropout = memory_attention_rope_dropout - - # memory encoder - self.memory_encoder_hidden_size = memory_encoder_hidden_size - self.memory_encoder_output_channels = memory_encoder_output_channels - self.mask_downsampler_embed_dim = mask_downsampler_embed_dim - self.mask_downsampler_kernel_size = mask_downsampler_kernel_size - self.mask_downsampler_stride = mask_downsampler_stride - self.mask_downsampler_padding = mask_downsampler_padding - self.mask_downsampler_total_stride = mask_downsampler_total_stride - self.mask_downsampler_hidden_act = mask_downsampler_hidden_act - self.memory_fuser_num_layers = memory_fuser_num_layers - self.memory_fuser_embed_dim = memory_fuser_embed_dim - self.memory_fuser_intermediate_dim = memory_fuser_intermediate_dim - self.memory_fuser_kernel_size = memory_fuser_kernel_size - self.memory_fuser_padding = memory_fuser_padding - self.memory_fuser_layer_scale_init_value = memory_fuser_layer_scale_init_value - self.memory_fuser_hidden_act = memory_fuser_hidden_act - - super().__init__(**kwargs) + if isinstance(self.vision_config, dict): + self.vision_config["model_type"] = self.vision_config.get("model_type", "sam3_vision_model") + self.vision_config = CONFIG_MAPPING[self.vision_config["model_type"]](**self.vision_config) + elif self.vision_config is None: + self.vision_config = CONFIG_MAPPING["sam3_vision_model"]( + backbone_feature_sizes=[[288, 288], [144, 144], [72, 72]] + ) + + if isinstance(self.prompt_encoder_config, dict): + self.prompt_encoder_config = Sam3TrackerVideoPromptEncoderConfig(**self.prompt_encoder_config) + elif self.prompt_encoder_config is None: + self.prompt_encoder_config = Sam3TrackerVideoPromptEncoderConfig() + + if isinstance(self.mask_decoder_config, dict): + self.mask_decoder_config = Sam3TrackerVideoMaskDecoderConfig(**self.mask_decoder_config) + elif self.mask_decoder_config is None: + self.mask_decoder_config = Sam3TrackerVideoMaskDecoderConfig() + + self.image_size = kwargs.pop("image_size", 1008) + super().__post_init__(**kwargs) @property def image_size(self): diff --git a/src/transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py b/src/transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py index ed72163413df..f9aef0a35e03 100644 --- a/src/transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +++ b/src/transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py @@ -14,6 +14,7 @@ import torch +from huggingface_hub.dataclasses import strict from ...configuration_utils import PreTrainedConfig from ...processing_utils import Unpack @@ -52,6 +53,7 @@ @auto_docstring(checkpoint="facebook/sam3") +@strict(accept_kwargs=True) class Sam3TrackerVideoPromptEncoderConfig(Sam2VideoPromptEncoderConfig): r""" mask_input_channels (`int`, *optional*, defaults to 16): @@ -64,32 +66,22 @@ class Sam3TrackerVideoPromptEncoderConfig(Sam2VideoPromptEncoderConfig): base_config_key = "prompt_encoder_config" - def __init__( - self, - hidden_size=256, - image_size=1008, - patch_size=14, - mask_input_channels=16, - num_point_embeddings=4, - hidden_act="gelu", - layer_norm_eps=1e-6, - scale=1, - **kwargs, - ): - super().__init__(**kwargs) + image_size: int | list[int] | tuple[int, int] = 1008 + 
patch_size: int | list[int] | tuple[int, int] = 14

-@auto_docstring(checkpoint="facebook/sam3")
 class Sam3TrackerVideoProcessor(Sam2VideoProcessor):
     pass

 @auto_docstring(checkpoint="facebook/sam3")
+@strict(accept_kwargs=True)
 class Sam3TrackerVideoMaskDecoderConfig(Sam2VideoMaskDecoderConfig):
     pass

 @auto_docstring(checkpoint="facebook/sam3")
+@strict(accept_kwargs=True)
 class Sam3TrackerVideoConfig(PreTrainedConfig):
     r"""
     prompt_encoder_config (Union[`dict`, `Sam3TrackerVideoPromptEncoderConfig`], *optional*):
@@ -207,121 +199,72 @@ class Sam3TrackerVideoConfig(PreTrainedConfig):
         "mask_decoder_config": Sam3TrackerVideoMaskDecoderConfig,
     }

-    def __init__(
-        self,
-        vision_config=None,
-        prompt_encoder_config=None,
-        mask_decoder_config=None,
-        initializer_range=0.02,
-        num_maskmem=7,
-        image_size=1008,
-        sigmoid_scale_for_mem_enc=20.0,
-        sigmoid_bias_for_mem_enc=-10.0,
-        enable_occlusion_spatial_embedding=True,
-        multimask_output_in_sam=True,
-        multimask_min_pt_num=0,
-        multimask_max_pt_num=1,
-        multimask_output_for_tracking=True,
-        max_object_pointers_in_encoder=16,
-        max_cond_frame_num=4,
-        enable_temporal_pos_encoding_for_object_pointers=True,
-        # memory attention
-        memory_attention_hidden_size=256,
-        memory_attention_num_layers=4,
-        memory_attention_num_attention_heads=1,
-        memory_attention_downsample_rate=1,
-        memory_attention_feed_forward_hidden_size=2048,
-        memory_attention_feed_forward_hidden_act="relu",
-        memory_attention_dropout=0.1,
-        memory_attention_rope_theta=10000,
-        memory_attention_rope_feat_sizes=None,
-        memory_attention_rope_dropout=0.1,
-        # memory encoder
-        memory_encoder_hidden_size=256,
-        memory_encoder_output_channels=64,
-        mask_downsampler_embed_dim=256,
-        mask_downsampler_kernel_size=3,
-        mask_downsampler_stride=2,
-        mask_downsampler_padding=1,
-        mask_downsampler_total_stride=16,
-        mask_downsampler_hidden_act="gelu",
-        memory_fuser_num_layers=2,
-        memory_fuser_embed_dim=256,
-        memory_fuser_intermediate_dim=1024,
-        memory_fuser_kernel_size=7,
-        memory_fuser_padding=3,
-        memory_fuser_layer_scale_init_value=1e-6,
-        memory_fuser_hidden_act="gelu",
-        **kwargs,
-    ):
-        vision_config = (
-            vision_config
-            if vision_config is not None
-            else {"backbone_feature_sizes": [[288, 288], [144, 144], [72, 72]]}
-        )
-        prompt_encoder_config = prompt_encoder_config if prompt_encoder_config is not None else {}
-        mask_decoder_config = mask_decoder_config if mask_decoder_config is not None else {}
-        memory_attention_rope_feat_sizes = (
-            [72, 72] if memory_attention_rope_feat_sizes is None else memory_attention_rope_feat_sizes
+    vision_config: dict | PreTrainedConfig | None = None
+    prompt_encoder_config: dict | PreTrainedConfig | None = None
+    mask_decoder_config: dict | PreTrainedConfig | None = None
+    initializer_range: float = 0.02
+    num_maskmem: int = 7
+    sigmoid_scale_for_mem_enc: float = 20.0
+    sigmoid_bias_for_mem_enc: float = -10.0
+    enable_occlusion_spatial_embedding: bool = True
+    multimask_output_in_sam: bool = True
+    multimask_min_pt_num: int = 0
+    multimask_max_pt_num: int = 1
+    multimask_output_for_tracking: bool = True
+    max_object_pointers_in_encoder: int = 16
+    max_cond_frame_num: int = 4
+    enable_temporal_pos_encoding_for_object_pointers: bool = True
+    memory_attention_hidden_size: int = 256
+    memory_attention_num_layers: int = 4
+    memory_attention_num_attention_heads: int = 1
+    memory_attention_downsample_rate: int = 1
+    memory_attention_feed_forward_hidden_size: int = 2048
+    memory_attention_feed_forward_hidden_act: str = "relu"
+    memory_attention_dropout: float | int
= 0.1 + memory_attention_rope_theta: int = 10000 + memory_attention_rope_feat_sizes: list | None = None + memory_attention_rope_dropout: float | int = 0.1 + memory_encoder_hidden_size: int = 256 + memory_encoder_output_channels: int = 64 + mask_downsampler_embed_dim: int = 256 + mask_downsampler_kernel_size: int = 3 + mask_downsampler_stride: int = 2 + mask_downsampler_padding: int = 1 + mask_downsampler_total_stride: int = 16 + mask_downsampler_hidden_act: str = "gelu" + memory_fuser_num_layers: int = 2 + memory_fuser_embed_dim: int = 256 + memory_fuser_intermediate_dim: int = 1024 + memory_fuser_kernel_size: int = 7 + memory_fuser_padding: int = 3 + memory_fuser_layer_scale_init_value: float = 1e-6 + memory_fuser_hidden_act: str = "gelu" + + def __post_init__(self, **kwargs): + self.memory_attention_rope_feat_sizes = ( + [72, 72] if self.memory_attention_rope_feat_sizes is None else self.memory_attention_rope_feat_sizes ) - if isinstance(vision_config, dict): - vision_config["model_type"] = vision_config.get("model_type", "sam3_vision_model") - vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) - if isinstance(prompt_encoder_config, Sam3TrackerVideoPromptEncoderConfig): - prompt_encoder_config = prompt_encoder_config.to_dict() - if isinstance(mask_decoder_config, Sam3TrackerVideoMaskDecoderConfig): - mask_decoder_config = mask_decoder_config.to_dict() - - self.vision_config = vision_config - self.prompt_encoder_config = Sam3TrackerVideoPromptEncoderConfig(**prompt_encoder_config) - self.mask_decoder_config = Sam3TrackerVideoMaskDecoderConfig(**mask_decoder_config) - - self.initializer_range = initializer_range - self.num_maskmem = num_maskmem # default 1 input frame + 6 previous frames - self.image_size = image_size - self.sigmoid_scale_for_mem_enc = sigmoid_scale_for_mem_enc - self.sigmoid_bias_for_mem_enc = sigmoid_bias_for_mem_enc - self.multimask_output_in_sam = multimask_output_in_sam - self.multimask_min_pt_num = multimask_min_pt_num - self.multimask_max_pt_num = multimask_max_pt_num - self.multimask_output_for_tracking = multimask_output_for_tracking - self.max_object_pointers_in_encoder = max_object_pointers_in_encoder - self.max_cond_frame_num = max_cond_frame_num - # The next 4 are True for sam2.1 and False for sam2 - self.enable_occlusion_spatial_embedding = enable_occlusion_spatial_embedding - self.enable_temporal_pos_encoding_for_object_pointers = enable_temporal_pos_encoding_for_object_pointers - - # memory attention - self.memory_attention_hidden_size = memory_attention_hidden_size - self.memory_attention_num_layers = memory_attention_num_layers - self.memory_attention_num_attention_heads = memory_attention_num_attention_heads - self.memory_attention_downsample_rate = memory_attention_downsample_rate - self.memory_attention_feed_forward_hidden_size = memory_attention_feed_forward_hidden_size - self.memory_attention_feed_forward_hidden_act = memory_attention_feed_forward_hidden_act - self.memory_attention_dropout = memory_attention_dropout - self.memory_attention_rope_theta = memory_attention_rope_theta - self.memory_attention_rope_feat_sizes = memory_attention_rope_feat_sizes - self.memory_attention_rope_dropout = memory_attention_rope_dropout - - # memory encoder - self.memory_encoder_hidden_size = memory_encoder_hidden_size - self.memory_encoder_output_channels = memory_encoder_output_channels - self.mask_downsampler_embed_dim = mask_downsampler_embed_dim - self.mask_downsampler_kernel_size = mask_downsampler_kernel_size - 
self.mask_downsampler_stride = mask_downsampler_stride - self.mask_downsampler_padding = mask_downsampler_padding - self.mask_downsampler_total_stride = mask_downsampler_total_stride - self.mask_downsampler_hidden_act = mask_downsampler_hidden_act - self.memory_fuser_num_layers = memory_fuser_num_layers - self.memory_fuser_embed_dim = memory_fuser_embed_dim - self.memory_fuser_intermediate_dim = memory_fuser_intermediate_dim - self.memory_fuser_kernel_size = memory_fuser_kernel_size - self.memory_fuser_padding = memory_fuser_padding - self.memory_fuser_layer_scale_init_value = memory_fuser_layer_scale_init_value - self.memory_fuser_hidden_act = memory_fuser_hidden_act - - super().__init__(**kwargs) + if isinstance(self.vision_config, dict): + self.vision_config["model_type"] = self.vision_config.get("model_type", "sam3_vision_model") + self.vision_config = CONFIG_MAPPING[self.vision_config["model_type"]](**self.vision_config) + elif self.vision_config is None: + self.vision_config = CONFIG_MAPPING["sam3_vision_model"]( + backbone_feature_sizes=[[288, 288], [144, 144], [72, 72]] + ) + + if isinstance(self.prompt_encoder_config, dict): + self.prompt_encoder_config = Sam3TrackerVideoPromptEncoderConfig(**self.prompt_encoder_config) + elif self.prompt_encoder_config is None: + self.prompt_encoder_config = Sam3TrackerVideoPromptEncoderConfig() + + if isinstance(self.mask_decoder_config, dict): + self.mask_decoder_config = Sam3TrackerVideoMaskDecoderConfig(**self.mask_decoder_config) + elif self.mask_decoder_config is None: + self.mask_decoder_config = Sam3TrackerVideoMaskDecoderConfig() + + self.image_size = kwargs.pop("image_size", 1008) + super().__post_init__(**kwargs) @property def image_size(self): diff --git a/src/transformers/models/sam3_tracker_video/processing_sam3_tracker_video.py b/src/transformers/models/sam3_tracker_video/processing_sam3_tracker_video.py index 168da3d9bfa3..708be51bd5e3 100644 --- a/src/transformers/models/sam3_tracker_video/processing_sam3_tracker_video.py +++ b/src/transformers/models/sam3_tracker_video/processing_sam3_tracker_video.py @@ -29,11 +29,13 @@ from ...processing_utils import ProcessorMixin from ...tokenization_utils_base import BatchEncoding from ...utils import TensorType, auto_docstring +from ...utils.import_utils import requires from ...video_utils import VideoInput from .modeling_sam3_tracker_video import Sam3TrackerVideoInferenceSession -@auto_docstring(checkpoint="facebook/sam3") +@requires(backends=("torch",)) +@auto_docstring class Sam3TrackerVideoProcessor(ProcessorMixin): def __init__( self, image_processor, video_processor, target_size: int | None = None, point_pad_value: int = -10, **kwargs diff --git a/src/transformers/models/sam3_video/configuration_sam3_video.py b/src/transformers/models/sam3_video/configuration_sam3_video.py index e2f74164a226..b2ecf518f31d 100644 --- a/src/transformers/models/sam3_video/configuration_sam3_video.py +++ b/src/transformers/models/sam3_video/configuration_sam3_video.py @@ -13,6 +13,8 @@ # limitations under the License. 
"""SAM3 Video model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring, logging from ..auto import CONFIG_MAPPING, AutoConfig @@ -22,6 +24,7 @@ @auto_docstring(checkpoint="facebook/sam3") +@strict(accept_kwargs=True) class Sam3VideoConfig(PreTrainedConfig): r""" detector_config (`dict` or `Sam3Config`, *optional*): @@ -106,117 +109,58 @@ class Sam3VideoConfig(PreTrainedConfig): "tracker_config": AutoConfig, } - def __init__( - self, - detector_config=None, - tracker_config=None, - initializer_range=0.02, - low_res_mask_size=288, - # Detection-tracking fusion hyperparameters - score_threshold_detection=0.5, - det_nms_thresh=0.1, - assoc_iou_thresh=0.1, - trk_assoc_iou_thresh=0.5, - new_det_thresh=0.7, - recondition_on_trk_masks=True, - # Hotstart parameters - hotstart_delay=15, - hotstart_unmatch_thresh=8, - hotstart_dup_thresh=8, - suppress_unmatched_only_within_hotstart=True, - # Keep-alive parameters - init_trk_keep_alive=30, - max_trk_keep_alive=30, - min_trk_keep_alive=-1, - # Occlusion and overlap handling - suppress_overlapping_based_on_recent_occlusion_threshold=0.7, - decrease_trk_keep_alive_for_empty_masklets=False, - # Mask post-processing - fill_hole_area=16, - # Object tracking limits - max_num_objects=10000, - # Reconditioning parameters - recondition_every_nth_frame=16, - high_conf_thresh=0.8, - high_iou_thresh=0.8, - **kwargs, - ): - super().__init__(**kwargs) - - # Initialize detector config (Sam3) - if detector_config is None: - detector_config = {} + detector_config: dict | PreTrainedConfig | None = None + tracker_config: dict | PreTrainedConfig | None = None + initializer_range: float = 0.02 + low_res_mask_size: int = 288 + score_threshold_detection: float = 0.5 + det_nms_thresh: float = 0.1 + assoc_iou_thresh: float = 0.1 + trk_assoc_iou_thresh: float = 0.5 + new_det_thresh: float = 0.7 + recondition_on_trk_masks: bool = True + hotstart_delay: int = 15 + hotstart_unmatch_thresh: int = 8 + hotstart_dup_thresh: int = 8 + suppress_unmatched_only_within_hotstart: bool = True + init_trk_keep_alive: int = 30 + max_trk_keep_alive: int = 30 + min_trk_keep_alive: int = -1 + suppress_overlapping_based_on_recent_occlusion_threshold: float = 0.7 + decrease_trk_keep_alive_for_empty_masklets: bool = False + fill_hole_area: int = 16 + max_num_objects: int = 10000 + recondition_every_nth_frame: int = 16 + high_conf_thresh: float = 0.8 + high_iou_thresh: float = 0.8 + + def __post_init__(self, **kwargs): + if self.detector_config is None: + self.detector_config = CONFIG_MAPPING["sam3"]() logger.info("detector_config is None. 
Initializing the Sam3Config with default values.") - if isinstance(detector_config, dict): - detector_config["model_type"] = detector_config.get("model_type", "sam3") - self.detector_config = CONFIG_MAPPING[detector_config["model_type"]](**detector_config) - elif isinstance(detector_config, PreTrainedConfig): - self.detector_config = detector_config - else: - raise ValueError(f"detector_config must be a dict or Sam3Config, got {type(detector_config)}") - - # Initialize tracker config (Sam2Video) - if tracker_config is None: - tracker_config = {} + if isinstance(self.detector_config, dict): + self.detector_config["model_type"] = self.detector_config.get("model_type", "sam3") + self.detector_config = CONFIG_MAPPING[self.detector_config["model_type"]](**self.detector_config) + + if self.tracker_config is None: + self.tracker_config = CONFIG_MAPPING["sam3_tracker_video"]() logger.info("tracker_config is None. Initializing the Sam3TrackerVideoConfig with default values.") - if isinstance(tracker_config, dict): - tracker_config["model_type"] = tracker_config.get("model_type", "sam3_tracker_video") - self.tracker_config = CONFIG_MAPPING[tracker_config["model_type"]](**tracker_config) - elif isinstance(tracker_config, PreTrainedConfig): - self.tracker_config = tracker_config - else: - raise ValueError(f"tracker_config must be a dict or Sam3TrackerVideoConfig, got {type(tracker_config)}") - - # Model initialization - self.initializer_range = initializer_range - - self.low_res_mask_size = low_res_mask_size - - # Detection-tracking fusion hyperparameters - self.score_threshold_detection = score_threshold_detection - self.det_nms_thresh = det_nms_thresh - self.assoc_iou_thresh = assoc_iou_thresh - self.trk_assoc_iou_thresh = trk_assoc_iou_thresh - self.new_det_thresh = new_det_thresh - - self.recondition_on_trk_masks = recondition_on_trk_masks - - # Hotstart parameters - if hotstart_delay > 0: - if hotstart_unmatch_thresh > hotstart_delay: + if isinstance(self.tracker_config, dict): + self.tracker_config["model_type"] = self.tracker_config.get("model_type", "sam3_tracker_video") + self.tracker_config = CONFIG_MAPPING[self.tracker_config["model_type"]](**self.tracker_config) + super().__post_init__(**kwargs) + + def validate_architecture(self): + """Part of `@strict`-powered validation. 
Validates the architecture of the config.""" + if self.hotstart_delay > 0: + if self.hotstart_unmatch_thresh > self.hotstart_delay: raise ValueError( - f"hotstart_unmatch_thresh ({hotstart_unmatch_thresh}) must be <= hotstart_delay ({hotstart_delay})" + f"hotstart_unmatch_thresh ({self.hotstart_unmatch_thresh}) must be <= hotstart_delay ({self.hotstart_delay})" ) - if hotstart_dup_thresh > hotstart_delay: + if self.hotstart_dup_thresh > self.hotstart_delay: raise ValueError( - f"hotstart_dup_thresh ({hotstart_dup_thresh}) must be <= hotstart_delay ({hotstart_delay})" + f"hotstart_dup_thresh ({self.hotstart_dup_thresh}) must be <= hotstart_delay ({self.hotstart_delay})" ) - self.hotstart_delay = hotstart_delay - self.hotstart_unmatch_thresh = hotstart_unmatch_thresh - self.hotstart_dup_thresh = hotstart_dup_thresh - self.suppress_unmatched_only_within_hotstart = suppress_unmatched_only_within_hotstart - - # Keep-alive parameters - self.init_trk_keep_alive = init_trk_keep_alive - self.max_trk_keep_alive = max_trk_keep_alive - self.min_trk_keep_alive = min_trk_keep_alive - - # Occlusion and overlap handling - self.suppress_overlapping_based_on_recent_occlusion_threshold = ( - suppress_overlapping_based_on_recent_occlusion_threshold - ) - self.decrease_trk_keep_alive_for_empty_masklets = decrease_trk_keep_alive_for_empty_masklets - - # Mask post-processing - self.fill_hole_area = fill_hole_area - - # Object tracking limits - self.max_num_objects = max_num_objects - - # Reconditioning parameters - self.recondition_every_nth_frame = recondition_every_nth_frame - self.high_conf_thresh = high_conf_thresh - self.high_iou_thresh = high_iou_thresh @property def image_size(self): diff --git a/src/transformers/models/sam_hq/configuration_sam_hq.py b/src/transformers/models/sam_hq/configuration_sam_hq.py index ce4af8f16ae2..9ac173d58ec0 100644 --- a/src/transformers/models/sam_hq/configuration_sam_hq.py +++ b/src/transformers/models/sam_hq/configuration_sam_hq.py @@ -18,11 +18,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring @auto_docstring(checkpoint="syscv-community/sam-hq-vit-base") +@strict(accept_kwargs=True) class SamHQPromptEncoderConfig(PreTrainedConfig): r""" mask_input_channels (`int`, *optional*, defaults to 16): @@ -33,29 +36,21 @@ class SamHQPromptEncoderConfig(PreTrainedConfig): base_config_key = "prompt_encoder_config" - def __init__( - self, - hidden_size=256, - image_size=1024, - patch_size=16, - mask_input_channels=16, - num_point_embeddings=4, - hidden_act="gelu", - layer_norm_eps=1e-6, - **kwargs, - ): - super().__init__(**kwargs) - self.hidden_size = hidden_size - self.image_size = image_size - self.patch_size = patch_size - self.image_embedding_size = image_size // patch_size - self.mask_input_channels = mask_input_channels - self.num_point_embeddings = num_point_embeddings - self.hidden_act = hidden_act - self.layer_norm_eps = layer_norm_eps + hidden_size: int = 256 + image_size: int | list[int] | tuple[int, int] = 1024 + patch_size: int | list[int] | tuple[int, int] = 16 + mask_input_channels: int = 16 + num_point_embeddings: int = 4 + hidden_act: str = "gelu" + layer_norm_eps: float = 1e-6 + + def __post_init__(self, **kwargs): + self.image_embedding_size = self.image_size // self.patch_size + super().__post_init__(**kwargs) @auto_docstring(checkpoint="syscv-community/sam-hq-vit-base") +@strict(accept_kwargs=True) class SamHQVisionConfig(PreTrainedConfig): r""" output_channels (`int`, *optional*, defaults to 256): @@ -93,54 +88,34 @@ class SamHQVisionConfig(PreTrainedConfig): base_config_key = "vision_config" model_type = "sam_hq_vision_model" - def __init__( - self, - hidden_size=768, - output_channels=256, - num_hidden_layers=12, - num_attention_heads=12, - num_channels=3, - image_size=1024, - patch_size=16, - hidden_act="gelu", - layer_norm_eps=1e-06, - attention_dropout=0.0, - initializer_range=1e-10, - qkv_bias=True, - mlp_ratio=4.0, - use_abs_pos=True, - use_rel_pos=True, - window_size=14, - global_attn_indexes=[2, 5, 8, 11], - num_pos_feats=128, - mlp_dim=None, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.output_channels = output_channels - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_channels = num_channels - self.image_size = image_size - self.patch_size = patch_size - self.hidden_act = hidden_act - self.layer_norm_eps = layer_norm_eps - self.attention_dropout = attention_dropout - self.initializer_range = initializer_range - self.qkv_bias = qkv_bias - self.mlp_ratio = mlp_ratio - self.use_abs_pos = use_abs_pos - self.use_rel_pos = use_rel_pos - self.window_size = window_size - self.global_attn_indexes = global_attn_indexes - self.num_pos_feats = num_pos_feats - self.mlp_dim = int(hidden_size * mlp_ratio) if mlp_dim is None else mlp_dim + hidden_size: int = 768 + output_channels: int = 256 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + num_channels: int = 3 + image_size: int | list[int] | tuple[int, int] = 1024 + patch_size: int | list[int] | tuple[int, int] = 16 + hidden_act: str = "gelu" + layer_norm_eps: float = 1e-06 + attention_dropout: float | int = 0.0 + initializer_range: float = 1e-10 + qkv_bias: bool = True + mlp_ratio: float = 4.0 + use_abs_pos: bool = True + use_rel_pos: bool = True + window_size: int = 14 + global_attn_indexes: list[int] | tuple[int, ...] 
= (2, 5, 8, 11) + num_pos_feats: int = 128 + mlp_dim: int | None = None + + def __post_init__(self, **kwargs): + self.mlp_dim = int(self.hidden_size * self.mlp_ratio) if self.mlp_dim is None else self.mlp_dim self.scale = self.hidden_size // 2 + super().__post_init__(**kwargs) @auto_docstring(checkpoint="syscv-community/sam-hq-vit-base") +@strict(accept_kwargs=True) class SamHQMaskDecoderConfig(PreTrainedConfig): r""" vit_dim (`int`, *optional*, defaults to 768): @@ -159,36 +134,22 @@ class SamHQMaskDecoderConfig(PreTrainedConfig): base_config_key = "mask_decoder_config" - def __init__( - self, - hidden_size=256, - hidden_act="relu", - mlp_dim=2048, - num_hidden_layers=2, - num_attention_heads=8, - attention_downsample_rate=2, - num_multimask_outputs=3, - iou_head_depth=3, - iou_head_hidden_dim=256, - layer_norm_eps=1e-6, - vit_dim=768, - **kwargs, - ): - super().__init__(**kwargs) - self.hidden_size = hidden_size - self.hidden_act = hidden_act - self.mlp_dim = mlp_dim - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.attention_downsample_rate = attention_downsample_rate - self.num_multimask_outputs = num_multimask_outputs - self.iou_head_depth = iou_head_depth - self.iou_head_hidden_dim = iou_head_hidden_dim - self.layer_norm_eps = layer_norm_eps - self.vit_dim = vit_dim + hidden_size: int = 256 + hidden_act: str = "relu" + mlp_dim: int = 2048 + num_hidden_layers: int = 2 + num_attention_heads: int = 8 + attention_downsample_rate: int = 2 + num_multimask_outputs: int = 3 + iou_head_depth: int = 3 + iou_head_hidden_dim: int = 256 + layer_norm_eps: float = 1e-6 + + vit_dim: int = 768 @auto_docstring(checkpoint="syscv-community/sam-hq-vit-base") +@strict(accept_kwargs=True) class SamHQConfig(PreTrainedConfig): r""" prompt_encoder_config (Union[`dict`, `SamHQPromptEncoderConfig`], *optional*): @@ -204,32 +165,29 @@ class SamHQConfig(PreTrainedConfig): "vision_config": SamHQVisionConfig, } - def __init__( - self, - vision_config=None, - prompt_encoder_config=None, - mask_decoder_config=None, - initializer_range=0.02, - tie_word_embeddings=True, - **kwargs, - ): - vision_config = vision_config if vision_config is not None else {} - prompt_encoder_config = prompt_encoder_config if prompt_encoder_config is not None else {} - mask_decoder_config = mask_decoder_config if mask_decoder_config is not None else {} - - if isinstance(vision_config, SamHQVisionConfig): - vision_config = vision_config.to_dict() - if isinstance(prompt_encoder_config, SamHQPromptEncoderConfig): - prompt_encoder_config = prompt_encoder_config.to_dict() - if isinstance(mask_decoder_config, SamHQMaskDecoderConfig): - mask_decoder_config = mask_decoder_config.to_dict() - - self.vision_config = SamHQVisionConfig(**vision_config) - self.prompt_encoder_config = SamHQPromptEncoderConfig(**prompt_encoder_config) - self.mask_decoder_config = SamHQMaskDecoderConfig(**mask_decoder_config) - self.initializer_range = initializer_range - self.tie_word_embeddings = tie_word_embeddings - super().__init__(**kwargs) + vision_config: dict | PreTrainedConfig | None = None + prompt_encoder_config: dict | PreTrainedConfig | None = None + mask_decoder_config: dict | PreTrainedConfig | None = None + initializer_range: float = 0.02 + tie_word_embeddings: bool = True + + def __post_init__(self, **kwargs): + if isinstance(self.vision_config, dict): + self.vision_config = SamHQVisionConfig(**self.vision_config) + elif self.vision_config is None: + self.vision_config = SamHQVisionConfig() + + if 
isinstance(self.prompt_encoder_config, dict): + self.prompt_encoder_config = SamHQPromptEncoderConfig(**self.prompt_encoder_config) + elif self.prompt_encoder_config is None: + self.prompt_encoder_config = SamHQPromptEncoderConfig() + + if isinstance(self.mask_decoder_config, dict): + self.mask_decoder_config = SamHQMaskDecoderConfig(**self.mask_decoder_config) + elif self.mask_decoder_config is None: + self.mask_decoder_config = SamHQMaskDecoderConfig() + + super().__post_init__(**kwargs) __all__ = ["SamHQVisionConfig", "SamHQMaskDecoderConfig", "SamHQPromptEncoderConfig", "SamHQConfig"] diff --git a/src/transformers/models/sam_hq/modular_sam_hq.py b/src/transformers/models/sam_hq/modular_sam_hq.py index 28d32662c7d5..2a2b08fe8957 100644 --- a/src/transformers/models/sam_hq/modular_sam_hq.py +++ b/src/transformers/models/sam_hq/modular_sam_hq.py @@ -15,6 +15,7 @@ from dataclasses import dataclass import torch +from huggingface_hub.dataclasses import strict from torch import nn from ...processing_utils import Unpack @@ -41,16 +42,19 @@ @auto_docstring(checkpoint="syscv-community/sam-hq-vit-base") +@strict(accept_kwargs=True) class SamHQPromptEncoderConfig(SamPromptEncoderConfig): pass @auto_docstring(checkpoint="syscv-community/sam-hq-vit-base") +@strict(accept_kwargs=True) class SamHQVisionConfig(SamVisionConfig): pass @auto_docstring(checkpoint="syscv-community/sam-hq-vit-base") +@strict(accept_kwargs=True) class SamHQMaskDecoderConfig(SamMaskDecoderConfig): r""" vit_dim (`int`, *optional*, defaults to 768): @@ -67,16 +71,11 @@ class SamHQMaskDecoderConfig(SamMaskDecoderConfig): The dimensionality of the hidden states in the IoU head module. """ - def __init__( - self, - vit_dim=768, - **super_kwargs, - ): - super().__init__(**super_kwargs) - self.vit_dim = vit_dim + vit_dim: int = 768 @auto_docstring(checkpoint="syscv-community/sam-hq-vit-base") +@strict(accept_kwargs=True) class SamHQConfig(SamConfig): r""" prompt_encoder_config (Union[`dict`, `SamHQPromptEncoderConfig`], *optional*): diff --git a/src/transformers/models/seamless_m4t/configuration_seamless_m4t.py b/src/transformers/models/seamless_m4t/configuration_seamless_m4t.py index 3c361fdc2fa7..cc230f549d3e 100644 --- a/src/transformers/models/seamless_m4t/configuration_seamless_m4t.py +++ b/src/transformers/models/seamless_m4t/configuration_seamless_m4t.py @@ -13,14 +13,14 @@ # limitations under the License. 
"""SeamlessM4T model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="facebook/hf-seamless-m4t-medium") +@strict(accept_kwargs=True) class SeamlessM4TConfig(PreTrainedConfig): r""" t2u_vocab_size (`int`, *optional*, defaults to 10082): @@ -154,180 +154,81 @@ class SeamlessM4TConfig(PreTrainedConfig): ```""" model_type = "seamless_m4t" - - def __init__( - self, - vocab_size=256102, - t2u_vocab_size=10082, - # shared config - hidden_size=1024, - initializer_range=0.02, - layer_norm_eps=1e-5, - use_cache=True, - max_position_embeddings=1024, - is_encoder_decoder=True, - encoder_layerdrop=0.05, - decoder_layerdrop=0.05, - activation_function="relu", - dropout=0.1, - attention_dropout=0.1, - activation_dropout=0.0, - scale_embedding=True, - # text encoder|decoder - encoder_layers=24, - encoder_ffn_dim=8192, - encoder_attention_heads=16, - decoder_layers=24, - decoder_ffn_dim=8192, - decoder_attention_heads=16, - decoder_start_token_id=3, - max_new_tokens=256, - pad_token_id=0, - bos_token_id=2, - eos_token_id=3, - # speech_encoder - speech_encoder_layers=24, - speech_encoder_attention_heads=16, - speech_encoder_intermediate_size=4096, - speech_encoder_hidden_act="swish", - speech_encoder_dropout=0.0, - add_adapter=True, - speech_encoder_layerdrop=0.1, - feature_projection_input_dim=160, - num_conv_pos_embeddings=128, - num_conv_pos_embedding_groups=16, - adaptor_kernel_size=8, - adaptor_stride=8, - adaptor_dropout=0.1, - num_adapter_layers=1, - position_embeddings_type="relative", - rotary_embedding_base=10000, - max_source_positions=4096, - conv_depthwise_kernel_size=31, - # t2u config - t2u_bos_token_id=0, - t2u_pad_token_id=1, - t2u_eos_token_id=2, - t2u_decoder_start_token_id=2, - t2u_max_new_tokens=1024, - t2u_encoder_layers=6, - t2u_encoder_ffn_dim=8192, - t2u_encoder_attention_heads=16, - t2u_decoder_layers=6, - t2u_decoder_ffn_dim=8192, - t2u_decoder_attention_heads=16, - t2u_max_position_embeddings=2048, - # hifi-gan vocoder config - sampling_rate=16000, - upsample_initial_channel=512, - upsample_rates=[5, 4, 4, 2, 2], - upsample_kernel_sizes=[11, 8, 8, 4, 4], - resblock_kernel_sizes=[3, 7, 11], - resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]], - leaky_relu_slope=0.1, - # specific to Code Hifi-Gan - unit_hifi_gan_vocab_size=10000, - unit_embed_dim=1280, - lang_embed_dim=256, - spkr_embed_dim=256, - vocoder_num_langs=36, - vocoder_num_spkrs=200, - variance_predictor_kernel_size=3, - var_pred_dropout=0.5, - vocoder_offset=4, - tie_word_embeddings=True, - **kwargs, - ): - # overall_config - self.vocab_size = vocab_size - self.t2u_vocab_size = t2u_vocab_size - self.hidden_size = hidden_size - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.max_position_embeddings = max_position_embeddings - self.use_cache = use_cache - self.max_new_tokens = max_new_tokens - self.encoder_layerdrop = encoder_layerdrop - self.decoder_layerdrop = decoder_layerdrop - self.activation_function = activation_function - self.dropout = dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.scale_embedding = scale_embedding - self.tie_word_embeddings = tie_word_embeddings - # for proper config init - self.num_attention_heads = 
decoder_attention_heads - self.num_hidden_layers = decoder_layers - - # text|unit encoder|decoder - self.encoder_layers = encoder_layers - self.encoder_ffn_dim = encoder_ffn_dim - self.encoder_attention_heads = encoder_attention_heads - self.decoder_layers = decoder_layers - self.decoder_ffn_dim = decoder_ffn_dim - self.decoder_attention_heads = decoder_attention_heads - - # speech_encoder - self.speech_encoder_layers = speech_encoder_layers - self.speech_encoder_hidden_act = speech_encoder_hidden_act - self.speech_encoder_dropout = speech_encoder_dropout - self.speech_encoder_attention_heads = speech_encoder_attention_heads - self.speech_encoder_layerdrop = speech_encoder_layerdrop - self.speech_encoder_intermediate_size = speech_encoder_intermediate_size - self.feature_projection_input_dim = feature_projection_input_dim - self.num_conv_pos_embeddings = num_conv_pos_embeddings - self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups - self.adaptor_kernel_size = adaptor_kernel_size - self.adaptor_stride = adaptor_stride - self.adaptor_dropout = adaptor_dropout - self.num_adapter_layers = num_adapter_layers - self.position_embeddings_type = position_embeddings_type - self.rotary_embedding_base = rotary_embedding_base - self.max_source_positions = max_source_positions - self.conv_depthwise_kernel_size = conv_depthwise_kernel_size - self.add_adapter = add_adapter - - # t2u config - self.t2u_bos_token_id = t2u_bos_token_id - self.t2u_pad_token_id = t2u_pad_token_id - self.t2u_eos_token_id = t2u_eos_token_id - self.t2u_decoder_start_token_id = t2u_decoder_start_token_id - self.t2u_max_new_tokens = t2u_max_new_tokens - self.t2u_encoder_layers = t2u_encoder_layers - self.t2u_encoder_ffn_dim = t2u_encoder_ffn_dim - self.t2u_encoder_attention_heads = t2u_encoder_attention_heads - self.t2u_decoder_layers = t2u_decoder_layers - self.t2u_decoder_ffn_dim = t2u_decoder_ffn_dim - self.t2u_decoder_attention_heads = t2u_decoder_attention_heads - self.t2u_max_position_embeddings = t2u_max_position_embeddings - - # hifi-gan vocoder config - # original parameters specific to Hifi-Gan - self.sampling_rate = sampling_rate - self.upsample_initial_channel = upsample_initial_channel - self.upsample_rates = upsample_rates - self.upsample_kernel_sizes = upsample_kernel_sizes - self.resblock_kernel_sizes = resblock_kernel_sizes - self.resblock_dilation_sizes = resblock_dilation_sizes - self.leaky_relu_slope = leaky_relu_slope - - # specific to Code Hifi-Gan - self.unit_hifi_gan_vocab_size = unit_hifi_gan_vocab_size - self.unit_embed_dim = unit_embed_dim - self.lang_embed_dim = lang_embed_dim - self.spkr_embed_dim = spkr_embed_dim - self.vocoder_num_langs = vocoder_num_langs - self.vocoder_num_spkrs = vocoder_num_spkrs - self.variance_predictor_kernel_size = variance_predictor_kernel_size - self.var_pred_dropout = var_pred_dropout - self.vocoder_offset = vocoder_offset - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.max_position_embeddings = max_position_embeddings - self.decoder_start_token_id = decoder_start_token_id - - super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + attribute_map = {"num_hidden_layers": "decoder_layers", "num_attention_heads": "decoder_attention_heads"} + + vocab_size: int = 256102 + t2u_vocab_size: int = 10082 + hidden_size: int = 1024 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-5 + use_cache: bool = True + max_position_embeddings: int = 1024 + is_encoder_decoder: bool = True + 
encoder_layerdrop: float | int = 0.05 + decoder_layerdrop: float | int = 0.05 + activation_function: str = "relu" + dropout: float | int = 0.1 + attention_dropout: float | int = 0.1 + activation_dropout: float | int = 0.0 + scale_embedding: bool = True + encoder_layers: int = 24 + encoder_ffn_dim: int = 8192 + encoder_attention_heads: int = 16 + decoder_layers: int = 24 + decoder_ffn_dim: int = 8192 + decoder_attention_heads: int = 16 + decoder_start_token_id: int = 3 + max_new_tokens: int | None = 256 + pad_token_id: int | None = 0 + bos_token_id: int | None = 2 + eos_token_id: int | None = 3 + speech_encoder_layers: int = 24 + speech_encoder_attention_heads: int = 16 + speech_encoder_intermediate_size: int = 4096 + speech_encoder_hidden_act: str = "swish" + speech_encoder_dropout: float | int = 0.0 + add_adapter: bool = True + speech_encoder_layerdrop: float | int = 0.1 + feature_projection_input_dim: int = 160 + num_conv_pos_embeddings: int = 128 + num_conv_pos_embedding_groups: int = 16 + adaptor_kernel_size: int = 8 + adaptor_stride: int = 8 + adaptor_dropout: float | int = 0.1 + num_adapter_layers: int = 1 + position_embeddings_type: str = "relative" + rotary_embedding_base: int = 10000 + max_source_positions: int = 4096 + conv_depthwise_kernel_size: int = 31 + t2u_bos_token_id: int | None = 0 + t2u_pad_token_id: int | None = 1 + t2u_eos_token_id: int | None = 2 + t2u_decoder_start_token_id: int = 2 + t2u_max_new_tokens: int = 1024 + t2u_encoder_layers: int = 6 + t2u_encoder_ffn_dim: int = 8192 + t2u_encoder_attention_heads: int = 16 + t2u_decoder_layers: int = 6 + t2u_decoder_ffn_dim: int = 8192 + t2u_decoder_attention_heads: int = 16 + t2u_max_position_embeddings: int = 2048 + sampling_rate: int = 16000 + upsample_initial_channel: int = 512 + upsample_rates: list[int] | tuple[int, ...] = (5, 4, 4, 2, 2) + upsample_kernel_sizes: list[int] | tuple[int, ...] = (11, 8, 8, 4, 4) + resblock_kernel_sizes: list[int] | tuple[int, ...] 
= (3, 7, 11) + resblock_dilation_sizes: list | tuple = ((1, 3, 5), (1, 3, 5), (1, 3, 5)) + leaky_relu_slope: float = 0.1 + unit_hifi_gan_vocab_size: int = 10000 + unit_embed_dim: int = 1280 + lang_embed_dim: int = 256 + spkr_embed_dim: int = 256 + vocoder_num_langs: int = 36 + vocoder_num_spkrs: int = 200 + variance_predictor_kernel_size: int = 3 + var_pred_dropout: float | int = 0.5 + vocoder_offset: int = 4 + tie_word_embeddings: bool = True __all__ = ["SeamlessM4TConfig"] diff --git a/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py b/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py index e37ee7e64548..fb65b7818f66 100755 --- a/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py +++ b/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py @@ -1494,7 +1494,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if input_features is None: raise ValueError( @@ -1639,7 +1639,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if input_ids is not None and self.is_t2u_encoder: raise ValueError( @@ -1795,7 +1795,7 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict # retrieve input_ids and inputs_embeds if input_ids is not None and inputs_embeds is not None: @@ -1951,7 +1951,7 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if encoder_outputs is None: encoder_outputs = self.encoder( @@ -2074,7 +2074,7 @@ def forward( config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if labels is not None: if use_cache: @@ -2538,7 +2538,7 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if encoder_outputs is None: encoder_outputs = self.text_encoder( @@ -2786,7 +2786,7 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) use_cache = use_cache if use_cache is not None else 
self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if encoder_outputs is None: encoder_outputs = self.speech_encoder( @@ -3051,7 +3051,7 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if encoder_outputs is None: # if encoder_outputs is not None, it's probably used within a .generate method so no need to warn @@ -3361,7 +3361,7 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if encoder_outputs is None: # if encoder_outputs is not None, it's probably used within a .generate method so no need to warn @@ -3702,7 +3702,7 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if labels is not None: if use_cache: diff --git a/src/transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py b/src/transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py index 79488f3b3da6..b0e1a7f8826c 100644 --- a/src/transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py +++ b/src/transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py @@ -13,14 +13,14 @@ # limitations under the License. 
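The SeamlessM4T rework above replaces the long `__init__` with typed dataclass fields and an `attribute_map` that keeps the legacy `num_hidden_layers`/`num_attention_heads` names pointing at `decoder_layers`/`decoder_attention_heads`. A minimal usage sketch, assuming a build of this branch (the override values are arbitrary):

```python
from transformers import SeamlessM4TConfig

# Typed dataclass fields keep the old defaults; overrides are plain kwargs.
config = SeamlessM4TConfig(decoder_layers=12, decoder_attention_heads=8)

# `attribute_map` redirects the legacy aliases onto the canonical fields,
# so both spellings stay in sync after the migration.
assert config.num_hidden_layers == config.decoder_layers == 12
assert config.num_attention_heads == config.decoder_attention_heads == 8
```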
"""SeamlessM4Tv2 model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="facebook/hf-seamless-m4t-medium") +@strict(accept_kwargs=True) class SeamlessM4Tv2Config(PreTrainedConfig): r""" t2u_vocab_size (`int`, *optional*, defaults to 10082): @@ -157,186 +157,84 @@ class SeamlessM4Tv2Config(PreTrainedConfig): ```""" model_type = "seamless_m4t_v2" - - def __init__( - self, - vocab_size=256102, - t2u_vocab_size=10082, - char_vocab_size=10943, - # shared config - hidden_size=1024, - initializer_range=0.02, - layer_norm_eps=1e-5, - use_cache=True, - max_position_embeddings=4096, - is_encoder_decoder=True, - encoder_layerdrop=0.05, - decoder_layerdrop=0.05, - activation_function="relu", - dropout=0.1, - attention_dropout=0.1, - activation_dropout=0.0, - scale_embedding=True, - # text encoder|decoder - encoder_layers=24, - encoder_ffn_dim=8192, - encoder_attention_heads=16, - decoder_layers=24, - decoder_ffn_dim=8192, - decoder_attention_heads=16, - decoder_start_token_id=3, - max_new_tokens=256, - pad_token_id=0, - bos_token_id=2, - eos_token_id=3, - # speech_encoder - speech_encoder_layers=24, - speech_encoder_attention_heads=16, - speech_encoder_intermediate_size=4096, - speech_encoder_hidden_act="swish", - speech_encoder_dropout=0.0, - add_adapter=True, - speech_encoder_layerdrop=0.1, - feature_projection_input_dim=160, - adaptor_kernel_size=8, - adaptor_stride=8, - adaptor_dropout=0.1, - num_adapter_layers=1, - position_embeddings_type="relative_key", - conv_depthwise_kernel_size=31, - left_max_position_embeddings=64, - right_max_position_embeddings=8, - speech_encoder_chunk_size=20000, - speech_encoder_left_chunk_num=128, - # t2u config - t2u_bos_token_id=0, - t2u_pad_token_id=1, - t2u_eos_token_id=2, - t2u_encoder_layers=6, - t2u_encoder_ffn_dim=8192, - t2u_encoder_attention_heads=16, - t2u_decoder_layers=6, - t2u_decoder_ffn_dim=8192, - t2u_decoder_attention_heads=16, - t2u_max_position_embeddings=4096, - t2u_variance_predictor_embed_dim=1024, - t2u_variance_predictor_hidden_dim=256, - t2u_variance_predictor_kernel_size=3, - t2u_variance_pred_dropout=0.5, - # hifi-gan vocoder config - sampling_rate=16000, - upsample_initial_channel=512, - upsample_rates=[5, 4, 4, 2, 2], - upsample_kernel_sizes=[11, 8, 8, 4, 4], - resblock_kernel_sizes=[3, 7, 11], - resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]], - leaky_relu_slope=0.1, - # specific to Code Hifi-Gan - unit_hifi_gan_vocab_size=10000, - unit_embed_dim=1280, - lang_embed_dim=256, - spkr_embed_dim=256, - vocoder_num_langs=36, - vocoder_num_spkrs=200, - variance_predictor_kernel_size=3, - var_pred_dropout=0.5, - vocoder_offset=4, - tie_word_embeddings=True, - **kwargs, - ): - # overall_config - self.vocab_size = vocab_size - self.t2u_vocab_size = t2u_vocab_size - self.char_vocab_size = char_vocab_size - self.hidden_size = hidden_size - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.max_position_embeddings = max_position_embeddings - self.use_cache = use_cache - self.max_new_tokens = max_new_tokens - self.encoder_layerdrop = encoder_layerdrop - self.decoder_layerdrop = decoder_layerdrop - self.activation_function = activation_function - self.dropout = dropout - self.attention_dropout = attention_dropout - 
self.activation_dropout = activation_dropout - self.scale_embedding = scale_embedding - self.tie_word_embeddings = tie_word_embeddings - # for proper config init - self.num_attention_heads = decoder_attention_heads - self.num_hidden_layers = decoder_layers - - # text|unit encoder|decoder - self.encoder_layers = encoder_layers - self.encoder_ffn_dim = encoder_ffn_dim - self.encoder_attention_heads = encoder_attention_heads - self.decoder_layers = decoder_layers - self.decoder_ffn_dim = decoder_ffn_dim - self.decoder_attention_heads = decoder_attention_heads - - # speech_encoder - self.speech_encoder_layers = speech_encoder_layers - self.speech_encoder_hidden_act = speech_encoder_hidden_act - self.speech_encoder_dropout = speech_encoder_dropout - self.speech_encoder_attention_heads = speech_encoder_attention_heads - self.speech_encoder_layerdrop = speech_encoder_layerdrop - self.speech_encoder_intermediate_size = speech_encoder_intermediate_size - self.feature_projection_input_dim = feature_projection_input_dim - self.adaptor_kernel_size = adaptor_kernel_size - self.adaptor_stride = adaptor_stride - self.adaptor_dropout = adaptor_dropout - self.num_adapter_layers = num_adapter_layers - self.position_embeddings_type = position_embeddings_type - self.conv_depthwise_kernel_size = conv_depthwise_kernel_size - self.add_adapter = add_adapter - self.left_max_position_embeddings = left_max_position_embeddings - self.right_max_position_embeddings = right_max_position_embeddings - self.speech_encoder_chunk_size = speech_encoder_chunk_size - self.speech_encoder_left_chunk_num = speech_encoder_left_chunk_num - - # t2u config - self.t2u_bos_token_id = t2u_bos_token_id - self.t2u_pad_token_id = t2u_pad_token_id - self.t2u_eos_token_id = t2u_eos_token_id - self.t2u_encoder_layers = t2u_encoder_layers - self.t2u_encoder_ffn_dim = t2u_encoder_ffn_dim - self.t2u_encoder_attention_heads = t2u_encoder_attention_heads - self.t2u_decoder_layers = t2u_decoder_layers - self.t2u_decoder_ffn_dim = t2u_decoder_ffn_dim - self.t2u_decoder_attention_heads = t2u_decoder_attention_heads - self.t2u_max_position_embeddings = t2u_max_position_embeddings - self.t2u_variance_predictor_embed_dim = t2u_variance_predictor_embed_dim # TODO: add to docstrings - self.t2u_variance_predictor_hidden_dim = t2u_variance_predictor_hidden_dim # TODO: add to docstrings - self.t2u_variance_predictor_kernel_size = t2u_variance_predictor_kernel_size # TODO: add to docstrings - self.t2u_variance_pred_dropout = t2u_variance_pred_dropout # TODO: add to docstrings - - # hifi-gan vocoder config - # original parameters specific to Hifi-Gan - self.sampling_rate = sampling_rate - self.upsample_initial_channel = upsample_initial_channel - self.upsample_rates = upsample_rates - self.upsample_kernel_sizes = upsample_kernel_sizes - self.resblock_kernel_sizes = resblock_kernel_sizes - self.resblock_dilation_sizes = resblock_dilation_sizes - self.leaky_relu_slope = leaky_relu_slope - - # specific to Code Hifi-Gan - self.unit_hifi_gan_vocab_size = unit_hifi_gan_vocab_size - self.unit_embed_dim = unit_embed_dim - self.lang_embed_dim = lang_embed_dim - self.spkr_embed_dim = spkr_embed_dim - self.vocoder_num_langs = vocoder_num_langs - self.vocoder_num_spkrs = vocoder_num_spkrs - self.variance_predictor_kernel_size = variance_predictor_kernel_size - self.var_pred_dropout = var_pred_dropout - self.vocoder_offset = vocoder_offset - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.decoder_start_token_id = 
decoder_start_token_id - self.max_position_embeddings = max_position_embeddings - - super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + attribute_map = {"num_hidden_layers": "decoder_layers", "num_attention_heads": "decoder_attention_heads"} + + vocab_size: int = 256102 + t2u_vocab_size: int = 10082 + char_vocab_size: int = 10943 + hidden_size: int = 1024 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-5 + use_cache: bool = True + max_position_embeddings: int = 4096 + is_encoder_decoder: bool = True + encoder_layerdrop: float | int = 0.05 + decoder_layerdrop: float | int = 0.05 + activation_function: str = "relu" + dropout: float | int = 0.1 + attention_dropout: float | int = 0.1 + activation_dropout: float | int = 0.0 + scale_embedding: bool = True + encoder_layers: int = 24 + encoder_ffn_dim: int = 8192 + encoder_attention_heads: int = 16 + decoder_layers: int = 24 + decoder_ffn_dim: int = 8192 + decoder_attention_heads: int = 16 + decoder_start_token_id: int = 3 + max_new_tokens: int | None = 256 + pad_token_id: int | None = 0 + bos_token_id: int | None = 2 + eos_token_id: int | None = 3 + speech_encoder_layers: int = 24 + speech_encoder_attention_heads: int = 16 + speech_encoder_intermediate_size: int = 4096 + speech_encoder_hidden_act: str = "swish" + speech_encoder_dropout: float | int = 0.0 + add_adapter: bool = True + speech_encoder_layerdrop: float | int = 0.1 + feature_projection_input_dim: int = 160 + adaptor_kernel_size: int = 8 + adaptor_stride: int = 8 + adaptor_dropout: float | int = 0.1 + num_adapter_layers: int = 1 + position_embeddings_type: str = "relative_key" + conv_depthwise_kernel_size: int = 31 + left_max_position_embeddings: int = 64 + right_max_position_embeddings: int = 8 + speech_encoder_chunk_size: int = 20000 + speech_encoder_left_chunk_num: int = 128 + t2u_bos_token_id: int | None = 0 + t2u_pad_token_id: int | None = 1 + t2u_eos_token_id: int | None = 2 + t2u_encoder_layers: int = 6 + t2u_encoder_ffn_dim: int = 8192 + t2u_encoder_attention_heads: int = 16 + t2u_decoder_layers: int = 6 + t2u_decoder_ffn_dim: int = 8192 + t2u_decoder_attention_heads: int = 16 + t2u_max_position_embeddings: int = 4096 + t2u_variance_predictor_embed_dim: int = 1024 + t2u_variance_predictor_hidden_dim: int = 256 + t2u_variance_predictor_kernel_size: int = 3 + t2u_variance_pred_dropout: float | int = 0.5 + sampling_rate: int = 16000 + upsample_initial_channel: int = 512 + upsample_rates: list[int] | tuple[int, ...] = (5, 4, 4, 2, 2) + upsample_kernel_sizes: list[int] | tuple[int, ...] = (11, 8, 8, 4, 4) + resblock_kernel_sizes: list[int] | tuple[int, ...] 
= (3, 7, 11) + resblock_dilation_sizes: list | tuple = ((1, 3, 5), (1, 3, 5), (1, 3, 5)) + leaky_relu_slope: float = 0.1 + unit_hifi_gan_vocab_size: int = 10000 + unit_embed_dim: int = 1280 + lang_embed_dim: int = 256 + spkr_embed_dim: int = 256 + vocoder_num_langs: int = 36 + vocoder_num_spkrs: int = 200 + variance_predictor_kernel_size: int = 3 + var_pred_dropout: float | int = 0.5 + vocoder_offset: int = 4 + tie_word_embeddings: bool = True __all__ = ["SeamlessM4Tv2Config"] diff --git a/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py b/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py index cb4113da7326..22100b2094e9 100644 --- a/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +++ b/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py @@ -1527,7 +1527,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if input_features is None: raise ValueError( @@ -1673,7 +1673,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if input_ids is not None and self.is_t2u_encoder: raise ValueError( @@ -1830,7 +1830,7 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict # retrieve input_ids and inputs_embeds if input_ids is not None and inputs_embeds is not None: @@ -2042,7 +2042,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict # create padding mask for character lengths char_padding_mask = _compute_new_attention_mask(char_input_ids, char_count_per_id.sum(1)) @@ -2155,7 +2155,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if encoder_outputs is None: encoder_outputs = self.encoder( @@ -2282,7 +2282,7 @@ def forward( config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.model( input_ids, @@ -2739,7 +2739,7 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) use_cache = use_cache if use_cache is not None else 
self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if encoder_outputs is None: encoder_outputs = self.text_encoder( @@ -2993,7 +2993,7 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if encoder_outputs is None: encoder_outputs = self.speech_encoder( @@ -3265,7 +3265,7 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if encoder_outputs is None: # if encoder_outputs is not None, it's probably used within a .generate method so no need to warn @@ -3612,7 +3612,7 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if encoder_outputs is None: # if encoder_outputs is not None, it's probably used within a .generate method so no need to warn @@ -3989,7 +3989,7 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if labels is not None: if use_cache: diff --git a/src/transformers/models/seed_oss/configuration_seed_oss.py b/src/transformers/models/seed_oss/configuration_seed_oss.py index f1d6771ac149..022e3755ea82 100644 --- a/src/transformers/models/seed_oss/configuration_seed_oss.py +++ b/src/transformers/models/seed_oss/configuration_seed_oss.py @@ -13,12 +13,15 @@ # limitations under the License. 
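As in the SeamlessM4T files, every touched `forward()` keeps the same fallback logic and only swaps the old `use_return_dict` property for the plain `return_dict` field. A sketch of that resolution pattern (the helper name `resolve_return_dict` is illustrative, not part of the diff):

```python
def resolve_return_dict(return_dict, config):
    """A caller-supplied value wins; otherwise fall back to the config default."""
    # Same expression as in the modeling hunks above, factored out for clarity.
    return return_dict if return_dict is not None else config.return_dict
```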
"""SeedOss model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring @auto_docstring(checkpoint="ByteDance-Seed/Seed-OSS-36B-Instruct") +@strict(accept_kwargs=True) class SeedOssConfig(PreTrainedConfig): r""" attention_out_bias (`bool`, *optional*, defaults to `False`): @@ -55,62 +58,36 @@ class SeedOssConfig(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size: int | None = 155136, - hidden_size: int | None = 4096, - intermediate_size: int | None = 27648, - num_hidden_layers: int | None = 64, - num_attention_heads: int | None = 80, - num_key_value_heads: int | None = 8, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 524288, - initializer_range: float | None = 0.02, - rms_norm_eps: float | None = 1e-6, - use_cache: bool | None = True, - pad_token_id: int | None = 1, - bos_token_id: int | None = 0, - eos_token_id: int | None = 2, - pretraining_tp: int | None = 1, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = True, - attention_out_bias: bool | None = False, - attention_dropout: float | None = 0.1, - residual_dropout: float | None = 0.1, - mlp_bias: bool | None = False, - head_dim: int | None = 128, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads + vocab_size: int = 155136 + hidden_size: int = 4096 + intermediate_size: int = 27648 + num_hidden_layers: int = 64 + num_attention_heads: int = 80 + num_key_value_heads: int | None = 8 + hidden_act: str = "silu" + max_position_embeddings: int = 524288 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-6 + use_cache: bool = True + pad_token_id: int | None = 1 + bos_token_id: int | None = 0 + eos_token_id: int | list[int] | None = 2 + pretraining_tp: int = 1 + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = True + attention_out_bias: bool = False + attention_dropout: float | int = 0.1 + residual_dropout: float | int = 0.1 + mlp_bias: bool = False + head_dim: int | None = 128 - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.pretraining_tp = pretraining_tp - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_out_bias = attention_out_bias - self.attention_dropout = attention_dropout - self.residual_dropout = residual_dropout - self.mlp_bias = mlp_bias - self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads - self.rope_parameters = rope_parameters + def __post_init__(self, **kwargs): + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - super().__init__(**kwargs) + self.head_dim = self.head_dim if self.head_dim is not None else 
self.hidden_size // self.num_attention_heads + super().__post_init__(**kwargs) __all__ = ["SeedOssConfig"] diff --git a/src/transformers/models/segformer/configuration_segformer.py b/src/transformers/models/segformer/configuration_segformer.py index fb921f59d365..d74d38c04a25 100644 --- a/src/transformers/models/segformer/configuration_segformer.py +++ b/src/transformers/models/segformer/configuration_segformer.py @@ -13,14 +13,14 @@ # limitations under the License. """SegFormer model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="ByteDance-Seed/Seed-OSS-36B-Instruct") +@strict(accept_kwargs=True) class SegformerConfig(PreTrainedConfig): r""" num_encoder_blocks (`int`, *optional*, defaults to 4): @@ -36,6 +36,8 @@ class SegformerConfig(PreTrainedConfig): mlp_ratios (`list[int]`, *optional*, defaults to `[4, 4, 4, 4]`): Ratio of the size of the hidden layer compared to the size of the input layer of the Mix FFNs in the encoder blocks. + reshape_last_stage (`bool`, *optional*, defaults to True): + Whether to reshape the last stage outputs Example: @@ -54,49 +56,25 @@ class SegformerConfig(PreTrainedConfig): model_type = "segformer" - def __init__( - self, - num_channels=3, - num_encoder_blocks=4, - depths=[2, 2, 2, 2], - sr_ratios=[8, 4, 2, 1], - hidden_sizes=[32, 64, 160, 256], - patch_sizes=[7, 3, 3, 3], - strides=[4, 2, 2, 2], - num_attention_heads=[1, 2, 5, 8], - mlp_ratios=[4, 4, 4, 4], - hidden_act="gelu", - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - classifier_dropout_prob=0.1, - initializer_range=0.02, - drop_path_rate=0.1, - layer_norm_eps=1e-6, - decoder_hidden_size=256, - semantic_loss_ignore_index=255, - **kwargs, - ): - super().__init__(**kwargs) - - self.num_channels = num_channels - self.num_encoder_blocks = num_encoder_blocks - self.depths = depths - self.sr_ratios = sr_ratios - self.hidden_sizes = hidden_sizes - self.patch_sizes = patch_sizes - self.strides = strides - self.mlp_ratios = mlp_ratios - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.classifier_dropout_prob = classifier_dropout_prob - self.initializer_range = initializer_range - self.drop_path_rate = drop_path_rate - self.layer_norm_eps = layer_norm_eps - self.decoder_hidden_size = decoder_hidden_size - self.reshape_last_stage = kwargs.get("reshape_last_stage", True) - self.semantic_loss_ignore_index = semantic_loss_ignore_index + num_channels: int = 3 + num_encoder_blocks: int = 4 + depths: list[int] | tuple[int, ...] = (2, 2, 2, 2) + sr_ratios: list[int] | tuple[int, ...] = (8, 4, 2, 1) + hidden_sizes: list[int] | tuple[int, ...] = (32, 64, 160, 256) + patch_sizes: list[int] | tuple[int, ...] = (7, 3, 3, 3) + strides: list[int] | tuple[int, ...] = (4, 2, 2, 2) + num_attention_heads: list[int] | tuple[int, ...] = (1, 2, 5, 8) + mlp_ratios: list[int] | tuple[int, ...] 
= (4, 4, 4, 4) + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.0 + attention_probs_dropout_prob: float = 0.0 + classifier_dropout_prob: float = 0.1 + initializer_range: float = 0.02 + drop_path_rate: float = 0.1 + layer_norm_eps: float = 1e-6 + decoder_hidden_size: int = 256 + semantic_loss_ignore_index: int = 255 + reshape_last_stage: bool = True __all__ = ["SegformerConfig"] diff --git a/src/transformers/models/segformer/modeling_segformer.py b/src/transformers/models/segformer/modeling_segformer.py index 0a5c2a13515d..10d82a587d06 100755 --- a/src/transformers/models/segformer/modeling_segformer.py +++ b/src/transformers/models/segformer/modeling_segformer.py @@ -438,7 +438,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict encoder_outputs = self.encoder( pixel_values, @@ -493,7 +493,7 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.segformer( pixel_values, @@ -655,7 +655,7 @@ def forward( >>> list(logits.shape) [1, 150, 128, 128] ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) diff --git a/src/transformers/models/seggpt/configuration_seggpt.py b/src/transformers/models/seggpt/configuration_seggpt.py index 409a6af32c51..58da9dcb8d08 100644 --- a/src/transformers/models/seggpt/configuration_seggpt.py +++ b/src/transformers/models/seggpt/configuration_seggpt.py @@ -13,14 +13,14 @@ # limitations under the License. """SegGpt model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="BAAI/seggpt-vit-large") +@strict(accept_kwargs=True) class SegGptConfig(PreTrainedConfig): r""" mlp_dim (`int`, *optional*): @@ -54,54 +54,36 @@ class SegGptConfig(PreTrainedConfig): model_type = "seggpt" - def __init__( - self, - hidden_size=1024, - num_hidden_layers=24, - num_attention_heads=16, - hidden_act="gelu", - hidden_dropout_prob=0.0, - initializer_range=0.02, - layer_norm_eps=1e-6, - image_size=[896, 448], - patch_size=16, - num_channels=3, - qkv_bias=True, - mlp_dim=None, - drop_path_rate=0.1, - pretrain_image_size=224, - decoder_hidden_size=64, - use_relative_position_embeddings=True, - merge_index=2, - intermediate_hidden_state_indices=[5, 11, 17, 23], - beta=0.01, - **kwargs, - ): - super().__init__(**kwargs) + hidden_size: int = 1024 + num_hidden_layers: int = 24 + num_attention_heads: int = 16 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.0 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-6 + image_size: int | list[int] | tuple[int, ...] 
= (896, 448) + patch_size: int | list[int] | tuple[int, int] = 16 + num_channels: int = 3 + qkv_bias: bool = True + mlp_dim: int | None = None + drop_path_rate: float = 0.1 + pretrain_image_size: int | list[int] | tuple[int, int] = 224 + decoder_hidden_size: int = 64 + use_relative_position_embeddings: bool = True + merge_index: int = 2 + intermediate_hidden_state_indices: list[int] | tuple[int, ...] = (5, 11, 17, 23) + beta: float = 0.01 + + def __post_init__(self, **kwargs): + self.mlp_dim = int(self.hidden_size * 4) if self.mlp_dim is None else self.mlp_dim + super().__post_init__(**kwargs) - if merge_index > min(intermediate_hidden_state_indices): + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + if self.merge_index > min(self.intermediate_hidden_state_indices): raise ValueError( - f"Merge index must be less than the minimum encoder output index, but got {merge_index=} and {intermediate_hidden_state_indices=}" + f"Merge index must be less than the minimum encoder output index, but got {self.merge_index=} and {self.intermediate_hidden_state_indices=}" ) - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.qkv_bias = qkv_bias - self.drop_path_rate = drop_path_rate - self.pretrain_image_size = pretrain_image_size - self.decoder_hidden_size = decoder_hidden_size - self.use_relative_position_embeddings = use_relative_position_embeddings - self.merge_index = merge_index - self.intermediate_hidden_state_indices = intermediate_hidden_state_indices - self.beta = beta - self.mlp_dim = int(hidden_size * 4) if mlp_dim is None else mlp_dim __all__ = ["SegGptConfig"] diff --git a/src/transformers/models/seggpt/modeling_seggpt.py b/src/transformers/models/seggpt/modeling_seggpt.py index c51383da8b63..0ba961989adc 100644 --- a/src/transformers/models/seggpt/modeling_seggpt.py +++ b/src/transformers/models/seggpt/modeling_seggpt.py @@ -702,7 +702,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict feature_ensemble = feature_ensemble if feature_ensemble is not None else False expected_dtype = self.embeddings.patch_embeddings.projection.weight.dtype @@ -905,7 +905,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if bool_masked_pos is None: num_patches = self.model.embeddings.patch_embeddings.num_patches diff --git a/src/transformers/models/sew/configuration_sew.py b/src/transformers/models/sew/configuration_sew.py index 657705553207..9c158e88d170 100644 --- a/src/transformers/models/sew/configuration_sew.py +++ b/src/transformers/models/sew/configuration_sew.py @@ -16,14 +16,14 @@ import functools import operator -from ...configuration_utils import PreTrainedConfig -from ...utils 
import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="BAAI/seggpt-vit-large") +@strict(accept_kwargs=True) class SEWConfig(PreTrainedConfig): r""" squeeze_factor (`int`, *optional*, defaults to 2): @@ -112,76 +112,50 @@ class SEWConfig(PreTrainedConfig): model_type = "sew" - def __init__( - self, - vocab_size=32, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - squeeze_factor=2, - hidden_act="gelu", - hidden_dropout=0.1, - activation_dropout=0.1, - attention_dropout=0.1, - feat_proj_dropout=0.0, - final_dropout=0.1, - layerdrop=0.1, - initializer_range=0.02, - layer_norm_eps=1e-5, - feat_extract_norm="group", - feat_extract_activation="gelu", - conv_dim=(64, 128, 128, 128, 128, 256, 256, 256, 256, 512, 512, 512, 512), - conv_stride=(5, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1), - conv_kernel=(10, 3, 1, 3, 1, 3, 1, 3, 1, 2, 1, 2, 1), - conv_bias=False, - num_conv_pos_embeddings=128, - num_conv_pos_embedding_groups=16, - apply_spec_augment=True, - mask_time_prob=0.05, - mask_time_length=10, - mask_time_min_masks=2, - mask_feature_prob=0.0, - mask_feature_length=10, - mask_feature_min_masks=0, - ctc_loss_reduction="mean", - ctc_zero_infinity=False, - use_weighted_layer_sum=False, - classifier_proj_size=256, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - **kwargs, - ): - super().__init__(**kwargs) - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.hidden_size = hidden_size - self.feat_extract_norm = feat_extract_norm - self.feat_extract_activation = feat_extract_activation - self.conv_dim = list(conv_dim) - self.conv_stride = list(conv_stride) - self.conv_kernel = list(conv_kernel) - self.conv_bias = conv_bias - self.num_conv_pos_embeddings = num_conv_pos_embeddings - self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups + vocab_size: int = 32 + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + squeeze_factor: int = 2 + hidden_act: str = "gelu" + hidden_dropout: float | int = 0.1 + activation_dropout: float | int = 0.1 + attention_dropout: float | int = 0.1 + feat_proj_dropout: float | int = 0.0 + final_dropout: float | int = 0.1 + layerdrop: float | int = 0.1 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-5 + feat_extract_norm: str = "group" + feat_extract_activation: str = "gelu" + conv_dim: list[int] | tuple[int, ...] = (64, 128, 128, 128, 128, 256, 256, 256, 256, 512, 512, 512, 512) + conv_stride: list[int] | tuple[int, ...] = (5, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1) + conv_kernel: list[int] | tuple[int, ...] 
= (10, 3, 1, 3, 1, 3, 1, 3, 1, 2, 1, 2, 1) + conv_bias: bool = False + num_conv_pos_embeddings: int = 128 + num_conv_pos_embedding_groups: int = 16 + apply_spec_augment: bool = True + mask_time_prob: float = 0.05 + mask_time_length: int = 10 + mask_time_min_masks: int = 2 + mask_feature_prob: float = 0.0 + mask_feature_length: int = 10 + mask_feature_min_masks: int = 0 + ctc_loss_reduction: str = "mean" + ctc_zero_infinity: bool = False + use_weighted_layer_sum: bool = False + classifier_proj_size: int = 256 + pad_token_id: int | None = 0 + bos_token_id: int | None = 1 + eos_token_id: int | None = 2 + + def __post_init__(self, **kwargs): self.num_feat_extract_layers = len(self.conv_dim) - self.num_hidden_layers = num_hidden_layers - self.intermediate_size = intermediate_size - self.squeeze_factor = squeeze_factor - self.hidden_act = hidden_act - self.num_attention_heads = num_attention_heads - self.hidden_dropout = hidden_dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.feat_proj_dropout = feat_proj_dropout - self.final_dropout = final_dropout - self.layerdrop = layerdrop - self.layer_norm_eps = layer_norm_eps - self.initializer_range = initializer_range - self.vocab_size = vocab_size + super().__post_init__(**kwargs) + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" if ( (len(self.conv_stride) != self.num_feat_extract_layers) or (len(self.conv_kernel) != self.num_feat_extract_layers) @@ -194,23 +168,6 @@ def __init__( f"= {len(self.conv_stride)}`, `len(config.conv_kernel) = {len(self.conv_kernel)}`." ) - # fine-tuning config parameters for SpecAugment: https://huggingface.co/papers/1904.08779 - self.apply_spec_augment = apply_spec_augment - self.mask_time_prob = mask_time_prob - self.mask_time_length = mask_time_length - self.mask_time_min_masks = mask_time_min_masks - self.mask_feature_prob = mask_feature_prob - self.mask_feature_length = mask_feature_length - self.mask_feature_min_masks = mask_feature_min_masks - - # ctc loss - self.ctc_loss_reduction = ctc_loss_reduction - self.ctc_zero_infinity = ctc_zero_infinity - - # sequence classification - self.use_weighted_layer_sum = use_weighted_layer_sum - self.classifier_proj_size = classifier_proj_size - @property def inputs_to_logits_ratio(self): return functools.reduce(operator.mul, self.conv_stride, 1) diff --git a/src/transformers/models/sew/modeling_sew.py b/src/transformers/models/sew/modeling_sew.py index 8f70f51e3909..79070d634608 100644 --- a/src/transformers/models/sew/modeling_sew.py +++ b/src/transformers/models/sew/modeling_sew.py @@ -782,7 +782,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict extract_features = self.feature_extractor(input_values) extract_features = extract_features.transpose(1, 2) @@ -913,7 +913,7 @@ def forward( All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size - 1]`. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if labels is not None and labels.max() >= self.config.vocab_size: raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") @@ -1031,7 +1031,7 @@ def forward( `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states outputs = self.sew( diff --git a/src/transformers/models/sew/modular_sew.py b/src/transformers/models/sew/modular_sew.py index fa958f4dfe24..312419793a34 100644 --- a/src/transformers/models/sew/modular_sew.py +++ b/src/transformers/models/sew/modular_sew.py @@ -402,7 +402,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict extract_features = self.feature_extractor(input_values) extract_features = extract_features.transpose(1, 2) diff --git a/src/transformers/models/sew_d/configuration_sew_d.py b/src/transformers/models/sew_d/configuration_sew_d.py index 15a8d3dc884a..a5907624c4a0 100644 --- a/src/transformers/models/sew_d/configuration_sew_d.py +++ b/src/transformers/models/sew_d/configuration_sew_d.py @@ -16,14 +16,14 @@ import functools import operator -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="BAAI/seggpt-vit-large") +@strict(accept_kwargs=True) class SEWDConfig(PreTrainedConfig): r""" squeeze_factor (`int`, *optional*, defaults to 2): @@ -126,89 +126,56 @@ class SEWDConfig(PreTrainedConfig): ```""" model_type = "sew-d" - - def __init__( - self, - vocab_size=32, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - squeeze_factor=2, - max_position_embeddings=512, - position_buckets=256, - share_att_key=True, - relative_attention=True, - pos_att_type=("p2c", "c2p"), - norm_rel_ebd="layer_norm", - hidden_act="gelu_python", - hidden_dropout=0.1, - activation_dropout=0.1, - attention_dropout=0.1, - feat_proj_dropout=0.0, - final_dropout=0.1, - initializer_range=0.02, - layer_norm_eps=1e-7, - feature_layer_norm_eps=1e-5, - feat_extract_norm="group", - feat_extract_activation="gelu", - conv_dim=(64, 128, 128, 128, 128, 256, 256, 256, 256, 512, 512, 512, 512), - conv_stride=(5, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1), - conv_kernel=(10, 3, 1, 3, 1, 3, 1, 3, 1, 2, 1, 2, 1), - conv_bias=False, - num_conv_pos_embeddings=128, - num_conv_pos_embedding_groups=16, - apply_spec_augment=True, - mask_time_prob=0.05, - mask_time_length=10, - mask_time_min_masks=2, - mask_feature_prob=0.0, - mask_feature_length=10, - mask_feature_min_masks=0, - ctc_loss_reduction="mean", - ctc_zero_infinity=False, - use_weighted_layer_sum=False, - classifier_proj_size=256, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - **kwargs, - ): - super().__init__(**kwargs) - 
self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.hidden_size = hidden_size - self.feat_extract_norm = feat_extract_norm - self.feat_extract_activation = feat_extract_activation - self.conv_dim = list(conv_dim) - self.conv_stride = list(conv_stride) - self.conv_kernel = list(conv_kernel) - self.conv_bias = conv_bias - self.num_conv_pos_embeddings = num_conv_pos_embeddings - self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups + vocab_size: int = 32 + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + squeeze_factor: int = 2 + max_position_embeddings: int = 512 + position_buckets: int = 256 + share_att_key: bool = True + relative_attention: bool = True + pos_att_type: list[str] | tuple[str, ...] = ("p2c", "c2p") + norm_rel_ebd: str = "layer_norm" + hidden_act: str = "gelu_python" + hidden_dropout: float | int = 0.1 + activation_dropout: float | int = 0.1 + attention_dropout: float | int = 0.1 + feat_proj_dropout: float | int = 0.0 + final_dropout: float | int = 0.1 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-7 + feature_layer_norm_eps: float = 1e-5 + feat_extract_norm: str = "group" + feat_extract_activation: str = "gelu" + conv_dim: list[int] | tuple[int, ...] = (64, 128, 128, 128, 128, 256, 256, 256, 256, 512, 512, 512, 512) + conv_stride: list[int] | tuple[int, ...] = (5, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1) + conv_kernel: list[int] | tuple[int, ...] = (10, 3, 1, 3, 1, 3, 1, 3, 1, 2, 1, 2, 1) + conv_bias: bool = False + num_conv_pos_embeddings: int = 128 + num_conv_pos_embedding_groups: int = 16 + apply_spec_augment: bool = True + mask_time_prob: float = 0.05 + mask_time_length: int = 10 + mask_time_min_masks: int = 2 + mask_feature_prob: float = 0.0 + mask_feature_length: int = 10 + mask_feature_min_masks: int = 0 + ctc_loss_reduction: str = "mean" + ctc_zero_infinity: bool = False + use_weighted_layer_sum: bool = False + classifier_proj_size: int = 256 + pad_token_id: int | None = 0 + bos_token_id: int | None = 1 + eos_token_id: int | None = 2 + + def __post_init__(self, **kwargs): self.num_feat_extract_layers = len(self.conv_dim) - self.num_hidden_layers = num_hidden_layers - self.intermediate_size = intermediate_size - self.squeeze_factor = squeeze_factor - self.max_position_embeddings = max_position_embeddings - self.position_buckets = position_buckets - self.share_att_key = share_att_key - self.relative_attention = relative_attention - self.norm_rel_ebd = norm_rel_ebd - self.pos_att_type = list(pos_att_type) - self.hidden_act = hidden_act - self.num_attention_heads = num_attention_heads - self._hidden_dropout = hidden_dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.feat_proj_dropout = feat_proj_dropout - self.final_dropout = final_dropout - self.layer_norm_eps = layer_norm_eps - self.feature_layer_norm_eps = feature_layer_norm_eps - self.initializer_range = initializer_range - self.vocab_size = vocab_size + super().__post_init__(**kwargs) + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" if ( (len(self.conv_stride) != self.num_feat_extract_layers) or (len(self.conv_kernel) != self.num_feat_extract_layers) @@ -221,34 +188,9 @@ def __init__( f"= {len(self.conv_stride)}`, `len(config.conv_kernel) = {len(self.conv_kernel)}`." 
) - # fine-tuning config parameters for SpecAugment: https://huggingface.co/papers/1904.08779 - self.apply_spec_augment = apply_spec_augment - self.mask_time_prob = mask_time_prob - self.mask_time_length = mask_time_length - self.mask_time_min_masks = mask_time_min_masks - self.mask_feature_prob = mask_feature_prob - self.mask_feature_length = mask_feature_length - self.mask_feature_min_masks = mask_feature_min_masks - - # ctc loss - self.ctc_loss_reduction = ctc_loss_reduction - self.ctc_zero_infinity = ctc_zero_infinity - - # sequence classification - self.use_weighted_layer_sum = use_weighted_layer_sum - self.classifier_proj_size = classifier_proj_size - @property def inputs_to_logits_ratio(self): return functools.reduce(operator.mul, self.conv_stride, 1) - def to_dict(self): - """ - Serializes this instance to a Python dictionary. - """ - output = super().to_dict() - output["hidden_dropout"] = output.pop("_hidden_dropout") - return output - __all__ = ["SEWDConfig"] diff --git a/src/transformers/models/sew_d/modeling_sew_d.py b/src/transformers/models/sew_d/modeling_sew_d.py index 7e3872b68978..bc573ee73c96 100644 --- a/src/transformers/models/sew_d/modeling_sew_d.py +++ b/src/transformers/models/sew_d/modeling_sew_d.py @@ -1326,7 +1326,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict extract_features = self.feature_extractor(input_values) extract_features = extract_features.transpose(1, 2) @@ -1455,7 +1455,7 @@ def forward( All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size - 1]`. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if labels is not None and labels.max() >= self.config.vocab_size: raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") @@ -1574,7 +1574,7 @@ def forward( `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states outputs = self.sew_d( diff --git a/src/transformers/models/shieldgemma2/configuration_shieldgemma2.py b/src/transformers/models/shieldgemma2/configuration_shieldgemma2.py index 8619f26aa144..9ad2f2822adf 100644 --- a/src/transformers/models/shieldgemma2/configuration_shieldgemma2.py +++ b/src/transformers/models/shieldgemma2/configuration_shieldgemma2.py @@ -13,15 +13,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
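The SEW and SEW-D rewrites show the general recipe for these migrations: derived values such as `num_feat_extract_layers` move into `__post_init__`, and consistency checks move into a `validate_*` method picked up by the `@strict` validation machinery. A self-contained sketch with a hypothetical `ToyConvConfig` (not a real model config; assumes this branch is installed):

```python
from huggingface_hub.dataclasses import strict

from transformers.configuration_utils import PreTrainedConfig


@strict(accept_kwargs=True)
class ToyConvConfig(PreTrainedConfig):
    model_type = "toy_conv"

    conv_dim: tuple[int, ...] = (64, 128, 256)
    conv_stride: tuple[int, ...] = (5, 2, 2)

    def __post_init__(self, **kwargs):
        # Derived attribute, mirroring `num_feat_extract_layers` above.
        self.num_feat_extract_layers = len(self.conv_dim)
        super().__post_init__(**kwargs)

    def validate_architecture(self):
        """Part of `@strict`-powered validation, as in the configs above."""
        if len(self.conv_stride) != self.num_feat_extract_layers:
            raise ValueError(
                "`conv_stride` must define one stride per entry in `conv_dim`, got "
                f"{len(self.conv_stride)} strides for {self.num_feat_extract_layers} layers."
            )
```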
-from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging -from ..auto import CONFIG_MAPPING, AutoConfig +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring +from ..auto import CONFIG_MAPPING, AutoConfig @auto_docstring(checkpoint="google/shieldgemma-2-4b-it") +@strict(accept_kwargs=True) class ShieldGemma2Config(PreTrainedConfig): r""" mm_tokens_per_image (`int`, *optional*, defaults to 256): @@ -61,43 +62,30 @@ class ShieldGemma2Config(PreTrainedConfig): } sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} - def __init__( - self, - text_config=None, - vision_config=None, - mm_tokens_per_image: int = 256, - boi_token_index: int = 255_999, - eoi_token_index: int = 256_000, - image_token_index: int = 262_144, - initializer_range: float = 0.02, - tie_word_embeddings=None, - **kwargs, - ): - if isinstance(vision_config, dict): - vision_config["model_type"] = vision_config.get("model_type", "siglip_vision_model") - vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) - elif vision_config is None: - vision_config = CONFIG_MAPPING["siglip_vision_model"]() - - self.vision_config = vision_config - - if isinstance(text_config, dict): - text_config["model_type"] = text_config.get("model_type", "gemma3_text") - text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) - elif text_config is None: - text_config = CONFIG_MAPPING["gemma3_text"]() - - self.text_config = text_config - self.vision_config = vision_config - self.mm_tokens_per_image = mm_tokens_per_image - self.boi_token_index = boi_token_index - self.eoi_token_index = eoi_token_index - self.image_token_index = image_token_index - self.initializer_range = initializer_range - if tie_word_embeddings is None: - tie_word_embeddings = getattr(self.text_config, "tie_word_embeddings", True) - self.tie_word_embeddings = tie_word_embeddings - super().__init__(**kwargs) + text_config: dict | PreTrainedConfig | None = None + vision_config: dict | PreTrainedConfig | None = None + mm_tokens_per_image: int = 256 + boi_token_index: int = 255_999 + eoi_token_index: int = 256_000 + image_token_index: int = 262_144 + initializer_range: float = 0.02 + + def __post_init__(self, **kwargs): + if isinstance(self.vision_config, dict): + self.vision_config["model_type"] = self.vision_config.get("model_type", "siglip_vision_model") + self.vision_config = CONFIG_MAPPING[self.vision_config["model_type"]](**self.vision_config) + elif self.vision_config is None: + self.vision_config = CONFIG_MAPPING["siglip_vision_model"]() + + if isinstance(self.text_config, dict): + self.text_config["model_type"] = self.text_config.get("model_type", "gemma3_text") + self.text_config = CONFIG_MAPPING[self.text_config["model_type"]](**self.text_config) + elif self.text_config is None: + self.text_config = CONFIG_MAPPING["gemma3_text"]() + if kwargs.get("tie_word_embeddings") is None: + self.tie_word_embeddings = getattr(self.text_config, "tie_word_embeddings", True) + + super().__post_init__(**kwargs) __all__ = ["ShieldGemma2Config"] diff --git a/src/transformers/models/siglip/configuration_siglip.py b/src/transformers/models/siglip/configuration_siglip.py index 8e75c2827c80..5b8f577edd66 100644 --- a/src/transformers/models/siglip/configuration_siglip.py +++ b/src/transformers/models/siglip/configuration_siglip.py @@ -13,6 +13,8 @@ # limitations under the License. 
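For composite models such as ShieldGemma2, `__post_init__` is also where nested configs are resolved: a dict is promoted through `CONFIG_MAPPING`, and `None` falls back to the default sub-model types (`gemma3_text`, `siglip_vision_model`). A short usage sketch, assuming this branch (the `hidden_size` override is arbitrary):

```python
from transformers import ShieldGemma2Config

config = ShieldGemma2Config(text_config={"model_type": "gemma3_text", "hidden_size": 256})

# Dict sub-configs become concrete config objects during `__post_init__`.
print(type(config.text_config).__name__)    # Gemma3TextConfig
print(type(config.vision_config).__name__)  # SiglipVisionConfig (default)
```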
"""Siglip model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring, logging @@ -21,6 +23,7 @@ @auto_docstring(checkpoint="google/siglip-base-patch16-224") +@strict(accept_kwargs=True) class SiglipTextConfig(PreTrainedConfig): r""" Example: @@ -41,43 +44,29 @@ class SiglipTextConfig(PreTrainedConfig): model_type = "siglip_text_model" base_config_key = "text_config" - def __init__( - self, - vocab_size=32000, - hidden_size=768, - intermediate_size=3072, - num_hidden_layers=12, - num_attention_heads=12, - max_position_embeddings=64, - hidden_act="gelu_pytorch_tanh", - layer_norm_eps=1e-6, - attention_dropout=0.0, - # This differs from `CLIPTokenizer`'s default and from openai/siglip - # See https://github.com/huggingface/transformers/pull/24773#issuecomment-1632287538 - pad_token_id=1, - bos_token_id=49406, - eos_token_id=49407, - projection_size=None, - **kwargs, - ): - super().__init__(**kwargs) - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.max_position_embeddings = max_position_embeddings - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act - self.attention_dropout = attention_dropout - self.projection_size = projection_size if projection_size is not None else hidden_size + vocab_size: int = 32000 + hidden_size: int = 768 + intermediate_size: int = 3072 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + max_position_embeddings: int = 64 + hidden_act: str = "gelu_pytorch_tanh" + layer_norm_eps: float = 1e-6 + attention_dropout: float | int = 0.0 + # This differs from `CLIPTokenizer`'s default and from openai/siglip + # See https://github.com/huggingface/transformers/pull/24773#issuecomment-1632287538 + pad_token_id: int | None = 1 + bos_token_id: int | None = 49406 + eos_token_id: int | list[int] | None = 49407 + projection_size: int | None = None + + def __post_init__(self, **kwargs): + self.projection_size = self.projection_size if self.projection_size is not None else self.hidden_size + super().__post_init__(**kwargs) @auto_docstring(checkpoint="google/siglip-base-patch16-224") +@strict(accept_kwargs=True) class SiglipVisionConfig(PreTrainedConfig): r""" Example: @@ -98,35 +87,20 @@ class SiglipVisionConfig(PreTrainedConfig): model_type = "siglip_vision_model" base_config_key = "vision_config" - def __init__( - self, - hidden_size=768, - intermediate_size=3072, - num_hidden_layers=12, - num_attention_heads=12, - num_channels=3, - image_size=224, - patch_size=16, - hidden_act="gelu_pytorch_tanh", - layer_norm_eps=1e-6, - attention_dropout=0.0, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_channels = num_channels - self.patch_size = patch_size - self.image_size = image_size - self.attention_dropout = attention_dropout - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act + hidden_size: int = 768 + intermediate_size: int = 3072 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + num_channels: int = 3 + image_size: int | list[int] | tuple[int, int] = 224 + patch_size: int | list[int] | 
tuple[int, int] = 16 + hidden_act: str = "gelu_pytorch_tanh" + layer_norm_eps: float = 1e-6 + attention_dropout: float | int = 0.0 @auto_docstring(checkpoint="google/siglip-base-patch16-224") +@strict(accept_kwargs=True) class SiglipConfig(PreTrainedConfig): r""" Example: @@ -156,24 +130,24 @@ class SiglipConfig(PreTrainedConfig): model_type = "siglip" sub_configs = {"text_config": SiglipTextConfig, "vision_config": SiglipVisionConfig} - def __init__(self, text_config=None, vision_config=None, **kwargs): - if text_config is None: - text_config = SiglipTextConfig() + text_config: dict | PreTrainedConfig | None = None + vision_config: dict | PreTrainedConfig | None = None + initializer_factor: float = 1.0 + + def __post_init__(self, **kwargs): + if self.text_config is None: + self.text_config = SiglipTextConfig() logger.info("`text_config` is `None`. Initializing the `SiglipTextConfig` with default values.") - elif isinstance(text_config, dict): - text_config = SiglipTextConfig(**text_config) + elif isinstance(self.text_config, dict): + self.text_config = SiglipTextConfig(**self.text_config) - if vision_config is None: - vision_config = SiglipVisionConfig() + if self.vision_config is None: + self.vision_config = SiglipVisionConfig() logger.info("`vision_config` is `None`. initializing the `SiglipVisionConfig` with default values.") - elif isinstance(vision_config, dict): - vision_config = SiglipVisionConfig(**vision_config) - - self.text_config = text_config - self.vision_config = vision_config - self.initializer_factor = 1.0 + elif isinstance(self.vision_config, dict): + self.vision_config = SiglipVisionConfig(**self.vision_config) - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["SiglipConfig", "SiglipTextConfig", "SiglipVisionConfig"] diff --git a/src/transformers/models/siglip2/configuration_siglip2.py b/src/transformers/models/siglip2/configuration_siglip2.py index 7e81430bca5c..8c2d683aace9 100644 --- a/src/transformers/models/siglip2/configuration_siglip2.py +++ b/src/transformers/models/siglip2/configuration_siglip2.py @@ -18,6 +18,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
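In the SiglipTextConfig hunk above, the only derived default (`projection_size` falling back to `hidden_size`) moves into `__post_init__`; everything else becomes a plain typed field. A small sketch of the resulting behaviour (illustration, not part of the patch):

from transformers import SiglipTextConfig

cfg = SiglipTextConfig(hidden_size=512)                      # projection_size left unset
print(cfg.projection_size)                                   # expected: 512, filled in by __post_init__

cfg = SiglipTextConfig(hidden_size=512, projection_size=256)
print(cfg.projection_size)                                   # expected: 256, explicit value is kept
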
+ +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring, logging @@ -26,6 +29,7 @@ @auto_docstring(checkpoint="google/siglip2-base-patch16-naflex") +@strict(accept_kwargs=True) class Siglip2TextConfig(PreTrainedConfig): r""" Example: @@ -46,43 +50,29 @@ class Siglip2TextConfig(PreTrainedConfig): model_type = "siglip2_text_model" base_config_key = "text_config" - def __init__( - self, - vocab_size=32000, - hidden_size=768, - intermediate_size=3072, - num_hidden_layers=12, - num_attention_heads=12, - max_position_embeddings=64, - hidden_act="gelu_pytorch_tanh", - layer_norm_eps=1e-6, - attention_dropout=0.0, - # This differs from `CLIPTokenizer`'s default and from openai/siglip2 - # See https://github.com/huggingface/transformers/pull/24773#issuecomment-1632287538 - pad_token_id=1, - bos_token_id=49406, - eos_token_id=49407, - projection_size=None, - **kwargs, - ): - super().__init__(**kwargs) - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.max_position_embeddings = max_position_embeddings - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act - self.attention_dropout = attention_dropout - self.projection_size = projection_size if projection_size is not None else hidden_size + vocab_size: int = 32000 + hidden_size: int = 768 + intermediate_size: int = 3072 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + max_position_embeddings: int = 64 + hidden_act: str = "gelu_pytorch_tanh" + layer_norm_eps: float = 1e-6 + attention_dropout: float | int = 0.0 + # This differs from `CLIPTokenizer`'s default and from openai/siglip2 + # See https://github.com/huggingface/transformers/pull/24773#issuecomment-1632287538 + pad_token_id: int | None = 1 + bos_token_id: int | None = 49406 + eos_token_id: int | list[int] | None = 49407 + projection_size: int | None = None + + def __post_init__(self, **kwargs): + self.projection_size = self.projection_size if self.projection_size is not None else self.hidden_size + super().__post_init__(**kwargs) @auto_docstring(checkpoint="google/siglip2-base-patch16-naflex") +@strict(accept_kwargs=True) class Siglip2VisionConfig(PreTrainedConfig): r""" num_patches (`int`, *optional*, defaults to 256): @@ -109,35 +99,21 @@ class Siglip2VisionConfig(PreTrainedConfig): model_type = "siglip2_vision_model" base_config_key = "vision_config" - def __init__( - self, - hidden_size=768, - intermediate_size=3072, - num_hidden_layers=12, - num_attention_heads=12, - num_channels=3, - num_patches=256, - patch_size=16, - hidden_act="gelu_pytorch_tanh", - layer_norm_eps=1e-6, - attention_dropout=0.0, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_channels = num_channels - self.patch_size = patch_size - self.attention_dropout = attention_dropout - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act - self.num_patches = num_patches + hidden_size: int = 768 + intermediate_size: int = 3072 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + num_channels: int = 3 + patch_size: int | list[int] | tuple[int, int] = 16 + 
hidden_act: str = "gelu_pytorch_tanh" + layer_norm_eps: float = 1e-6 + attention_dropout: float | int = 0.0 + + num_patches: int = 256 @auto_docstring(checkpoint="google/siglip2-base-patch16-naflex") +@strict(accept_kwargs=True) class Siglip2Config(PreTrainedConfig): r""" Example: @@ -167,24 +143,24 @@ class Siglip2Config(PreTrainedConfig): model_type = "siglip2" sub_configs = {"text_config": Siglip2TextConfig, "vision_config": Siglip2VisionConfig} - def __init__(self, text_config=None, vision_config=None, **kwargs): - if text_config is None: - text_config = Siglip2TextConfig() + text_config: dict | PreTrainedConfig | None = None + vision_config: dict | PreTrainedConfig | None = None + initializer_factor: float = 1.0 + + def __post_init__(self, **kwargs): + if self.text_config is None: + self.text_config = Siglip2TextConfig() logger.info("`text_config` is `None`. Initializing the `Siglip2TextConfig` with default values.") - elif isinstance(text_config, dict): - text_config = Siglip2TextConfig(**text_config) + elif isinstance(self.text_config, dict): + self.text_config = Siglip2TextConfig(**self.text_config) - if vision_config is None: - vision_config = Siglip2VisionConfig() + if self.vision_config is None: + self.vision_config = Siglip2VisionConfig() logger.info("`vision_config` is `None`. initializing the `Siglip2VisionConfig` with default values.") - elif isinstance(vision_config, dict): - vision_config = Siglip2VisionConfig(**vision_config) - - self.text_config = text_config - self.vision_config = vision_config - self.initializer_factor = 1.0 + elif isinstance(self.vision_config, dict): + self.vision_config = Siglip2VisionConfig(**self.vision_config) - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["Siglip2Config", "Siglip2TextConfig", "Siglip2VisionConfig"] diff --git a/src/transformers/models/siglip2/modeling_siglip2.py b/src/transformers/models/siglip2/modeling_siglip2.py index 1aa067aded9f..ba6f55333fe9 100644 --- a/src/transformers/models/siglip2/modeling_siglip2.py +++ b/src/transformers/models/siglip2/modeling_siglip2.py @@ -18,6 +18,7 @@ # See the License for the specific language governing permissions and # limitations under the License. + from collections.abc import Callable from dataclasses import dataclass from typing import Any diff --git a/src/transformers/models/siglip2/modular_siglip2.py b/src/transformers/models/siglip2/modular_siglip2.py index 2b4daba2ca66..f2b4d64796cc 100644 --- a/src/transformers/models/siglip2/modular_siglip2.py +++ b/src/transformers/models/siglip2/modular_siglip2.py @@ -12,9 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+ import torch import torch.nn as nn import torch.nn.functional as F +from huggingface_hub.dataclasses import strict from tokenizers import normalizers from transformers.models.gemma.tokenization_gemma import GemmaTokenizer @@ -83,11 +85,13 @@ def __init__( @auto_docstring(checkpoint="google/siglip2-base-patch16-naflex") +@strict(accept_kwargs=True) class Siglip2TextConfig(SiglipTextConfig): pass @auto_docstring(checkpoint="google/siglip2-base-patch16-naflex") +@strict(accept_kwargs=True) class Siglip2VisionConfig(SiglipVisionConfig): r""" num_patches (`int`, *optional*, defaults to 256): @@ -111,26 +115,12 @@ class Siglip2VisionConfig(SiglipVisionConfig): >>> configuration = model.config ```""" - def __init__( - self, - hidden_size=768, - intermediate_size=3072, - num_hidden_layers=12, - num_attention_heads=12, - num_channels=3, - num_patches=256, - patch_size=16, - hidden_act="gelu_pytorch_tanh", - layer_norm_eps=1e-6, - attention_dropout=0.0, - **kwargs, - ): - super().__init__(**kwargs) - self.num_patches = num_patches - del self.image_size + num_patches: int = 256 + image_size = AttributeError() @auto_docstring(checkpoint="google/siglip2-base-patch16-naflex") +@strict(accept_kwargs=True) class Siglip2Config(SiglipConfig): pass diff --git a/src/transformers/models/siglip2/tokenization_siglip2.py b/src/transformers/models/siglip2/tokenization_siglip2.py index 514838cbff2f..0a68d80965d8 100644 --- a/src/transformers/models/siglip2/tokenization_siglip2.py +++ b/src/transformers/models/siglip2/tokenization_siglip2.py @@ -18,6 +18,7 @@ # See the License for the specific language governing permissions and # limitations under the License. + from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers from tokenizers.models import BPE diff --git a/src/transformers/models/smollm3/configuration_smollm3.py b/src/transformers/models/smollm3/configuration_smollm3.py index 85c6daab52f2..2233b579c916 100644 --- a/src/transformers/models/smollm3/configuration_smollm3.py +++ b/src/transformers/models/smollm3/configuration_smollm3.py @@ -18,12 +18,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
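In the modular Siglip2VisionConfig above, the old `del self.image_size` workaround is replaced by assigning an `AttributeError()` sentinel, which the modular converter and the dataclass machinery are expected to read as "drop this inherited field". A hedged sketch of the intended result (assumption, not verified against the generated file):

from transformers import Siglip2VisionConfig

cfg = Siglip2VisionConfig()
print(cfg.num_patches)              # 256, the NaFlex-specific field kept by Siglip2
print(hasattr(cfg, "image_size"))   # expected: False, the inherited Siglip field is removed
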
-from ...configuration_utils import PreTrainedConfig, layer_type_validation +from huggingface_hub.dataclasses import strict + +from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring @auto_docstring(checkpoint="HuggingFaceTB/SmolLM3-3B") +@strict(accept_kwargs=True) class SmolLM3Config(PreTrainedConfig): r""" no_rope_layers (`List[int]`, *optional*): @@ -67,84 +70,50 @@ class SmolLM3Config(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size: int | None = 128256, - hidden_size: int | None = 2048, - intermediate_size: int | None = 11008, - num_hidden_layers: int | None = 36, - num_attention_heads: int | None = 16, - num_key_value_heads: int | None = 4, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 32768, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-6, - use_cache: bool | None = True, - pad_token_id: int | None = 128004, - bos_token_id: int | None = 128000, - eos_token_id: int | None = 128001, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - use_sliding_window: bool | None = False, - sliding_window: int | None = None, - no_rope_layers: int | None = None, - no_rope_layer_interval: int | None = 4, - layer_types: int | None = None, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - mlp_bias: bool | None = False, - tie_word_embeddings: bool | None = True, - **kwargs, - ): - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.mlp_bias = mlp_bias - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.use_sliding_window = use_sliding_window - self.sliding_window = sliding_window - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - - if no_rope_layers is None: + vocab_size: int = 128256 + hidden_size: int = 2048 + intermediate_size: int = 11008 + num_hidden_layers: int = 36 + num_attention_heads: int = 16 + num_key_value_heads: int | None = 4 + hidden_act: str = "silu" + max_position_embeddings: int = 32768 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-6 + use_cache: bool = True + pad_token_id: int | None = 128004 + bos_token_id: int | None = 128000 + eos_token_id: int | list[int] | None = 128001 + rope_parameters: RopeParameters | dict | None = None + use_sliding_window: bool = False + sliding_window: int | None = None + no_rope_layers: list[int] | None = None + no_rope_layer_interval: int = 4 + layer_types: list[str] | None = None + attention_bias: bool = False + attention_dropout: float | int = 0.0 + mlp_bias: bool = False + tie_word_embeddings: bool = True + + def __post_init__(self, **kwargs): + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads + + if self.no_rope_layers is None: self.no_rope_layers = [ - int((layer_idx + 1) % no_rope_layer_interval 
!= 0) for layer_idx in range(num_hidden_layers) + int((layer_idx + 1) % self.no_rope_layer_interval != 0) for layer_idx in range(self.num_hidden_layers) ] - else: - self.no_rope_layers = no_rope_layers - - self.no_rope_layer_interval = no_rope_layer_interval - # Update layer_types based on sliding window and NoPE pattern - if layer_types is None: - layer_types = [] - for layer_idx in range(num_hidden_layers): + if self.layer_types is None: + self.layer_types = [] + for layer_idx in range(self.num_hidden_layers): has_rope = self.no_rope_layers[layer_idx] - if use_sliding_window and sliding_window is not None and not has_rope: - layer_types.append("sliding_attention") + if self.use_sliding_window and self.sliding_window is not None and not has_rope: + self.layer_types.append("sliding_attention") else: - layer_types.append("full_attention") - - self.layer_types = layer_types - layer_type_validation(self.layer_types, self.num_hidden_layers) + self.layer_types.append("full_attention") - self.rope_parameters = rope_parameters - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["SmolLM3Config"] diff --git a/src/transformers/models/smollm3/modular_smollm3.py b/src/transformers/models/smollm3/modular_smollm3.py index 5dc614b347ea..d7ce821d7a95 100644 --- a/src/transformers/models/smollm3/modular_smollm3.py +++ b/src/transformers/models/smollm3/modular_smollm3.py @@ -15,9 +15,10 @@ from collections.abc import Callable import torch +from huggingface_hub.dataclasses import strict from ...cache_utils import Cache -from ...configuration_utils import PreTrainedConfig, layer_type_validation +from ...configuration_utils import PreTrainedConfig from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_rope_utils import RopeParameters from ...modeling_utils import ALL_ATTENTION_FUNCTIONS @@ -41,6 +42,7 @@ @auto_docstring(checkpoint="HuggingFaceTB/SmolLM3-3B") +@strict(accept_kwargs=True) class SmolLM3Config(PreTrainedConfig): r""" no_rope_layers (`List[int]`, *optional*): @@ -84,84 +86,50 @@ class SmolLM3Config(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size: int | None = 128256, - hidden_size: int | None = 2048, - intermediate_size: int | None = 11008, - num_hidden_layers: int | None = 36, - num_attention_heads: int | None = 16, - num_key_value_heads: int | None = 4, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 32768, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-6, - use_cache: bool | None = True, - pad_token_id: int | None = 128004, - bos_token_id: int | None = 128000, - eos_token_id: int | None = 128001, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - use_sliding_window: bool | None = False, - sliding_window: int | None = None, - no_rope_layers: int | None = None, - no_rope_layer_interval: int | None = 4, - layer_types: int | None = None, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - mlp_bias: bool | None = False, - tie_word_embeddings: bool | None = True, - **kwargs, - ): - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.mlp_bias = mlp_bias - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - 
self.num_attention_heads = num_attention_heads - self.use_sliding_window = use_sliding_window - self.sliding_window = sliding_window - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - - if no_rope_layers is None: + vocab_size: int = 128256 + hidden_size: int = 2048 + intermediate_size: int = 11008 + num_hidden_layers: int = 36 + num_attention_heads: int = 16 + num_key_value_heads: int | None = 4 + hidden_act: str = "silu" + max_position_embeddings: int = 32768 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-6 + use_cache: bool = True + pad_token_id: int | None = 128004 + bos_token_id: int | None = 128000 + eos_token_id: int | list[int] | None = 128001 + rope_parameters: RopeParameters | dict | None = None + use_sliding_window: bool = False + sliding_window: int | None = None + no_rope_layers: list[int] | None = None + no_rope_layer_interval: int = 4 + layer_types: list[str] | None = None + attention_bias: bool = False + attention_dropout: float | int = 0.0 + mlp_bias: bool = False + tie_word_embeddings: bool = True + + def __post_init__(self, **kwargs): + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads + + if self.no_rope_layers is None: self.no_rope_layers = [ - int((layer_idx + 1) % no_rope_layer_interval != 0) for layer_idx in range(num_hidden_layers) + int((layer_idx + 1) % self.no_rope_layer_interval != 0) for layer_idx in range(self.num_hidden_layers) ] - else: - self.no_rope_layers = no_rope_layers - - self.no_rope_layer_interval = no_rope_layer_interval - # Update layer_types based on sliding window and NoPE pattern - if layer_types is None: - layer_types = [] - for layer_idx in range(num_hidden_layers): + if self.layer_types is None: + self.layer_types = [] + for layer_idx in range(self.num_hidden_layers): has_rope = self.no_rope_layers[layer_idx] - if use_sliding_window and sliding_window is not None and not has_rope: - layer_types.append("sliding_attention") + if self.use_sliding_window and self.sliding_window is not None and not has_rope: + self.layer_types.append("sliding_attention") else: - layer_types.append("full_attention") - - self.layer_types = layer_types - layer_type_validation(self.layer_types, self.num_hidden_layers) + self.layer_types.append("full_attention") - self.rope_parameters = rope_parameters - super().__init__(**kwargs) + super().__post_init__(**kwargs) class SmolLM3RotaryEmbedding(Qwen2RotaryEmbedding): diff --git a/src/transformers/models/smolvlm/configuration_smolvlm.py b/src/transformers/models/smolvlm/configuration_smolvlm.py index 8da163ca7c69..720b5d260be5 100644 --- a/src/transformers/models/smolvlm/configuration_smolvlm.py +++ b/src/transformers/models/smolvlm/configuration_smolvlm.py @@ -19,6 +19,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
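The SmolLM3 hunks above keep the NoPE and layer-type derivation, now computed from `self.*` inside `__post_init__`. A worked example of that derivation with illustrative values (not taken from the patch):

# With num_hidden_layers=8 and no_rope_layer_interval=4:
no_rope_layers = [int((layer_idx + 1) % 4 != 0) for layer_idx in range(8)]
print(no_rope_layers)   # [1, 1, 1, 0, 1, 1, 1, 0] -> every 4th layer is a NoPE layer

# When use_sliding_window=True and sliding_window is set, layers whose entry is 0
# become "sliding_attention"; all others stay "full_attention".
layer_types = ["full_attention" if has_rope else "sliding_attention" for has_rope in no_rope_layers]
print(layer_types[:4])  # ['full_attention', 'full_attention', 'full_attention', 'sliding_attention']
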
+ +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring, logging from ..auto import CONFIG_MAPPING, AutoConfig @@ -28,6 +31,7 @@ @auto_docstring(checkpoint="HuggingFaceTB/SmolVLM2-2.2B-Instruct") +@strict(accept_kwargs=True) class SmolVLMVisionConfig(PreTrainedConfig): r""" Example: @@ -49,37 +53,21 @@ class SmolVLMVisionConfig(PreTrainedConfig): model_type = "smolvlm_vision" base_config_key = "vision_config" - def __init__( - self, - hidden_size=1152, - intermediate_size=3072, - num_hidden_layers=12, - num_attention_heads=16, - num_channels=3, - image_size=224, - patch_size=32, - hidden_act="gelu_pytorch_tanh", - layer_norm_eps=1e-6, - attention_dropout=0.0, - initializer_range=0.02, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_channels = num_channels - self.patch_size = patch_size - self.image_size = image_size - self.attention_dropout = attention_dropout - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act - self.initializer_range = initializer_range + hidden_size: int = 1152 + intermediate_size: int = 3072 + num_hidden_layers: int = 12 + num_attention_heads: int = 16 + num_channels: int = 3 + image_size: int | list[int] | tuple[int, int] = 224 + patch_size: int | list[int] | tuple[int, int] = 32 + hidden_act: str = "gelu_pytorch_tanh" + layer_norm_eps: float = 1e-6 + attention_dropout: float | int = 0.0 + initializer_range: float = 0.02 @auto_docstring(checkpoint="HuggingFaceTB/SmolVLM2-2.2B-Instruct") +@strict(accept_kwargs=True) class SmolVLMConfig(PreTrainedConfig): r""" scale_factor (`int`, *optional*, defaults to 2): @@ -99,42 +87,32 @@ class SmolVLMConfig(PreTrainedConfig): model_type = "smolvlm" sub_configs = {"text_config": AutoConfig, "vision_config": SmolVLMVisionConfig} - def __init__( - self, - use_cache=True, - image_token_id=128257, - tie_word_embeddings=False, - vision_config=None, - text_config=None, - scale_factor=2, - pad_token_id=128_002, - **kwargs, - ): - self.image_token_id = image_token_id - self.use_cache = use_cache - self.tie_word_embeddings = tie_word_embeddings - - if vision_config is None: + use_cache: bool = True + image_token_id: int = 128257 + tie_word_embeddings: bool = False + vision_config: dict | PreTrainedConfig | None = None + text_config: dict | PreTrainedConfig | None = None + scale_factor: int = 2 + pad_token_id: int | None = 128_002 + + def __post_init__(self, **kwargs): + if self.vision_config is None: self.vision_config = SmolVLMVisionConfig() logger.info("vision_config is None, using default vision config") - elif isinstance(vision_config, dict): - self.vision_config = SmolVLMVisionConfig(**vision_config) - elif isinstance(vision_config, SmolVLMVisionConfig): - self.vision_config = vision_config - - if isinstance(text_config, dict): - text_config["model_type"] = text_config.get("model_type", "llama") - text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) - elif text_config is None: - logger.info("text_config is None, using default text config") - text_config = CONFIG_MAPPING["llama"]( + elif isinstance(self.vision_config, dict): + self.vision_config = SmolVLMVisionConfig(**self.vision_config) + + if isinstance(self.text_config, dict): + self.text_config["model_type"] = self.text_config.get("model_type", "llama") + self.text_config = 
CONFIG_MAPPING[self.text_config["model_type"]](**self.text_config) + elif self.text_config is None: + logger.info("text_config is None, using default Llama text config") + self.text_config = CONFIG_MAPPING["llama"]( rms_norm_eps=1e-5, - pad_token_id=pad_token_id, + pad_token_id=self.pad_token_id, ) - self.text_config = text_config - self.scale_factor = scale_factor - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["SmolVLMVisionConfig", "SmolVLMConfig"] diff --git a/src/transformers/models/smolvlm/image_processing_smolvlm.py b/src/transformers/models/smolvlm/image_processing_smolvlm.py index c86beab858a2..2f60b44c29ee 100644 --- a/src/transformers/models/smolvlm/image_processing_smolvlm.py +++ b/src/transformers/models/smolvlm/image_processing_smolvlm.py @@ -19,6 +19,7 @@ # See the License for the specific language governing permissions and # limitations under the License. + import math from collections.abc import Iterable diff --git a/src/transformers/models/smolvlm/image_processing_smolvlm_fast.py b/src/transformers/models/smolvlm/image_processing_smolvlm_fast.py index 59ba2fc1f154..a7f9db595656 100644 --- a/src/transformers/models/smolvlm/image_processing_smolvlm_fast.py +++ b/src/transformers/models/smolvlm/image_processing_smolvlm_fast.py @@ -19,6 +19,7 @@ # See the License for the specific language governing permissions and # limitations under the License. + import math from typing import Optional diff --git a/src/transformers/models/smolvlm/modeling_smolvlm.py b/src/transformers/models/smolvlm/modeling_smolvlm.py index 1fe247de23af..493c7630587a 100644 --- a/src/transformers/models/smolvlm/modeling_smolvlm.py +++ b/src/transformers/models/smolvlm/modeling_smolvlm.py @@ -19,6 +19,7 @@ # See the License for the specific language governing permissions and # limitations under the License. + from collections.abc import Callable from dataclasses import dataclass diff --git a/src/transformers/models/smolvlm/modular_smolvlm.py b/src/transformers/models/smolvlm/modular_smolvlm.py index 95f8164c60a0..91d9f2038062 100644 --- a/src/transformers/models/smolvlm/modular_smolvlm.py +++ b/src/transformers/models/smolvlm/modular_smolvlm.py @@ -13,7 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. + import torch +from huggingface_hub.dataclasses import strict from torch import nn from ...cache_utils import Cache, DynamicCache @@ -38,6 +40,7 @@ @auto_docstring(checkpoint="HuggingFaceTB/SmolVLM2-2.2B-Instruct") +@strict(accept_kwargs=True) class SmolVLMVisionConfig(Idefics3VisionConfig): r""" Example: @@ -68,6 +71,7 @@ class SmolVLMVisionTransformer(Idefics3VisionTransformer): @auto_docstring(checkpoint="HuggingFaceTB/SmolVLM2-2.2B-Instruct") +@strict(accept_kwargs=True) class SmolVLMConfig(Idefics3Config): r""" scale_factor (`int`, *optional*, defaults to 2): diff --git a/src/transformers/models/solar_open/configuration_solar_open.py b/src/transformers/models/solar_open/configuration_solar_open.py index bb102656a21f..80f7d216dd3f 100644 --- a/src/transformers/models/solar_open/configuration_solar_open.py +++ b/src/transformers/models/solar_open/configuration_solar_open.py @@ -18,12 +18,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring @auto_docstring(checkpoint="upstage/Solar-Open-100B") +@strict(accept_kwargs=True) class SolarOpenConfig(PreTrainedConfig): r""" n_group (`int`, *optional*, defaults to 1): @@ -51,74 +54,39 @@ class SolarOpenConfig(PreTrainedConfig): attribute_map = { "num_local_experts": "n_routed_experts", } + + vocab_size: int = 196608 + hidden_size: int = 4096 + num_hidden_layers: int = 48 + num_attention_heads: int = 64 + num_key_value_heads: int = 8 + hidden_act: str = "silu" + max_position_embeddings: int = 131072 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-5 + use_cache: bool = True + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + attention_dropout: float | int = 0.0 + moe_intermediate_size: int = 1280 + num_experts_per_tok: int = 8 + n_shared_experts: int = 1 + n_routed_experts: int = 128 + routed_scaling_factor: float = 1.0 + n_group: int = 1 + topk_group: int = 1 + norm_topk_prob: bool = True + bos_token_id: int | None = None + eos_token_id: int | list[int] | None = None + pad_token_id: int | None = None default_theta = 1_000_000.0 + head_dim: int = 128 - def __init__( - self, - vocab_size: int = 196608, - hidden_size: int = 4096, - moe_intermediate_size: int = 1280, - num_hidden_layers: int = 48, - num_attention_heads: int = 64, - num_key_value_heads: int = 8, - n_shared_experts: int = 1, - n_routed_experts: int = 128, - head_dim: int = 128, - hidden_act: str = "silu", - max_position_embeddings: int = 131072, - initializer_range: float = 0.02, - rms_norm_eps: int = 1e-5, - use_cache: bool = True, - tie_word_embeddings: bool = False, - rope_parameters: RopeParameters | None = None, - attention_bias: bool = False, - attention_dropout: float = 0.0, - num_experts_per_tok: int = 8, - routed_scaling_factor: float = 1.0, - n_group: int = 1, - topk_group: int = 1, - norm_topk_prob: bool = True, - bos_token_id: int | None = None, - eos_token_id: int | None = None, - pad_token_id: int | None = None, - **kwargs, - ): - # Default partial_rotary_factor to 1.0 (instead of 0.5 in Glm4MoeConfig). - # `setdefault` ensures this value is not overridden by subsequent calls. - # This workaround is required due to modular inheritance limitations. 
+ def __post_init__(self, **kwargs): kwargs.setdefault("partial_rotary_factor", 1.0) - self.head_dim = head_dim - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.rope_parameters = rope_parameters kwargs.setdefault("partial_rotary_factor", 0.5) # assign default for BC - - # MoE arguments - self.moe_intermediate_size = moe_intermediate_size - self.num_experts_per_tok = num_experts_per_tok - self.n_group = n_group - self.topk_group = topk_group - self.n_shared_experts = n_shared_experts - self.n_routed_experts = n_routed_experts - self.routed_scaling_factor = routed_scaling_factor - self.norm_topk_prob = norm_topk_prob - self.tie_word_embeddings = tie_word_embeddings - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["SolarOpenConfig"] diff --git a/src/transformers/models/solar_open/modeling_solar_open.py b/src/transformers/models/solar_open/modeling_solar_open.py index 4be8c5c85288..dfa30292455f 100644 --- a/src/transformers/models/solar_open/modeling_solar_open.py +++ b/src/transformers/models/solar_open/modeling_solar_open.py @@ -17,7 +17,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from collections.abc import Callable from typing import Optional diff --git a/src/transformers/models/solar_open/modular_solar_open.py b/src/transformers/models/solar_open/modular_solar_open.py index e011b65d83cb..a478f64067b2 100644 --- a/src/transformers/models/solar_open/modular_solar_open.py +++ b/src/transformers/models/solar_open/modular_solar_open.py @@ -13,9 +13,9 @@ # limitations under the License. 
"""PyTorch SolarOpen model.""" +from huggingface_hub.dataclasses import strict from torch import nn -from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring, logging from ..glm4_moe.configuration_glm4_moe import Glm4MoeConfig from ..glm4_moe.modeling_glm4_moe import ( @@ -32,6 +32,7 @@ @auto_docstring(checkpoint="upstage/Solar-Open-100B") +@strict(accept_kwargs=True) class SolarOpenConfig(Glm4MoeConfig): r""" n_group (`int`, *optional*, defaults to 1): @@ -52,75 +53,19 @@ class SolarOpenConfig(Glm4MoeConfig): "layers.*.mlp.experts": "moe_tp_experts", } - def __init__( - self, - vocab_size: int = 196608, - hidden_size: int = 4096, - moe_intermediate_size: int = 1280, - num_hidden_layers: int = 48, - num_attention_heads: int = 64, - num_key_value_heads: int = 8, - n_shared_experts: int = 1, - n_routed_experts: int = 128, - head_dim: int = 128, - hidden_act: str = "silu", - max_position_embeddings: int = 131072, - initializer_range: float = 0.02, - rms_norm_eps: int = 1e-5, - use_cache: bool = True, - tie_word_embeddings: bool = False, - rope_parameters: RopeParameters | None = None, - attention_bias: bool = False, - attention_dropout: float = 0.0, - num_experts_per_tok: int = 8, - routed_scaling_factor: float = 1.0, - n_group: int = 1, - topk_group: int = 1, - norm_topk_prob: bool = True, - bos_token_id: int | None = None, - eos_token_id: int | None = None, - pad_token_id: int | None = None, - **kwargs, - ): - # Default partial_rotary_factor to 1.0 (instead of 0.5 in Glm4MoeConfig). - # `setdefault` ensures this value is not overridden by subsequent calls. - # This workaround is required due to modular inheritance limitations. + vocab_size: int = 196608 + moe_intermediate_size: int = 1280 + num_hidden_layers: int = 48 + num_attention_heads: int = 64 + head_dim: int = 128 + num_experts_per_tok: int = 8 + intermediate_size = AttributeError() + first_k_dense_replace = AttributeError() + use_qk_norm = AttributeError() + + def __post_init__(self, **kwargs): kwargs.setdefault("partial_rotary_factor", 1.0) - self.head_dim = head_dim - - super().__init__( - vocab_size=vocab_size, - hidden_size=hidden_size, - moe_hidden_size=moe_intermediate_size, - num_hidden_layers=num_hidden_layers, - num_attention_heads=num_attention_heads, - num_key_value_heads=num_key_value_heads, - n_shared_experts=n_shared_experts, - n_routed_experts=n_routed_experts, - head_dim=head_dim, - hidden_act=hidden_act, - max_position_embeddings=max_position_embeddings, - initializer_range=initializer_range, - rms_norm_eps=rms_norm_eps, - use_cache=use_cache, - tie_word_embeddings=tie_word_embeddings, - rope_parameters=rope_parameters, - attention_bias=attention_bias, - attention_dropout=attention_dropout, - num_experts_per_tok=num_experts_per_tok, - routed_scaling_factor=routed_scaling_factor, - n_group=n_group, - topk_group=topk_group, - norm_topk_prob=norm_topk_prob, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - pad_token_id=pad_token_id, - **kwargs, - ) - - del self.intermediate_size - del self.first_k_dense_replace - del self.use_qk_norm + super().__post_init__(**kwargs) class SolarOpenDecoderLayer(LlamaDecoderLayer): diff --git a/src/transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py b/src/transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py index 4310acdf5da4..120006d4b8ea 100644 --- a/src/transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py +++ 
b/src/transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py @@ -14,6 +14,8 @@ # limitations under the License. +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring, logging from ..auto.configuration_auto import AutoConfig @@ -23,6 +25,7 @@ @auto_docstring(checkpoint="") +@strict(accept_kwargs=True) class SpeechEncoderDecoderConfig(PreTrainedConfig): r""" kwargs (*optional*): @@ -65,8 +68,9 @@ class SpeechEncoderDecoderConfig(PreTrainedConfig): sub_configs = {"encoder": AutoConfig, "decoder": AutoConfig} has_no_defaults_at_init = True - def __init__(self, **kwargs): - super().__init__(**kwargs) + is_encoder_decoder: int | None = True + + def __post_init__(self, **kwargs): if "encoder" not in kwargs or "decoder" not in kwargs: raise ValueError( f"A configuration of type {self.model_type} cannot be instantiated because not both `encoder` and" @@ -80,7 +84,7 @@ def __init__(self, **kwargs): self.encoder = AutoConfig.for_model(encoder_model_type, **encoder_config) self.decoder = AutoConfig.for_model(decoder_model_type, **decoder_config) - self.is_encoder_decoder = True + super().__post_init__(**kwargs) @classmethod def from_encoder_decoder_configs( diff --git a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py index 957e3a7545d7..638c1f7e838c 100644 --- a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +++ b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py @@ -388,7 +388,7 @@ def forward( >>> loss = model(input_values, labels=labels).loss >>> loss.backward() ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict kwargs_encoder = {argument: value for argument, value in kwargs.items() if not argument.startswith("decoder_")} diff --git a/src/transformers/models/speech_to_text/configuration_speech_to_text.py b/src/transformers/models/speech_to_text/configuration_speech_to_text.py index f3e7de727103..176954dc7ae9 100644 --- a/src/transformers/models/speech_to_text/configuration_speech_to_text.py +++ b/src/transformers/models/speech_to_text/configuration_speech_to_text.py @@ -13,14 +13,14 @@ # limitations under the License. 
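The SpeechEncoderDecoderConfig hunk above keeps the hard requirement that both sub-configs be provided, now enforced in `__post_init__`, and turns `is_encoder_decoder` into a declared field. A minimal sketch of expected usage (illustrative model types, not part of the patch):

from transformers import SpeechEncoderDecoderConfig

cfg = SpeechEncoderDecoderConfig(
    encoder={"model_type": "wav2vec2"},  # both "encoder" and "decoder" are mandatory,
    decoder={"model_type": "bert"},      # otherwise __post_init__ raises a ValueError
)
print(cfg.is_encoder_decoder)            # expected: True (field default)
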
"""Speech2Text model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="facebook/s2t-small-librispeech-asr") +@strict(accept_kwargs=True) class Speech2TextConfig(PreTrainedConfig): r""" max_source_positions (`int`, *optional*, defaults to 6000): @@ -57,68 +57,45 @@ class Speech2TextConfig(PreTrainedConfig): model_type = "speech_to_text" keys_to_ignore_at_inference = ["past_key_values"] - attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"} - - def __init__( - self, - vocab_size=10000, - encoder_layers=12, - encoder_ffn_dim=2048, - encoder_attention_heads=4, - decoder_layers=6, - decoder_ffn_dim=2048, - decoder_attention_heads=4, - encoder_layerdrop=0.0, - decoder_layerdrop=0.0, - use_cache=True, - is_encoder_decoder=True, - activation_function="relu", - d_model=256, - dropout=0.1, - attention_dropout=0.0, - activation_dropout=0.0, - init_std=0.02, - decoder_start_token_id=2, - scale_embedding=True, - pad_token_id=1, - bos_token_id=0, - eos_token_id=2, - max_source_positions=6000, - max_target_positions=1024, - num_conv_layers=2, - conv_kernel_sizes=(5, 5), - conv_channels=1024, - input_feat_per_channel=80, - input_channels=1, - tie_word_embeddings=True, - **kwargs, - ): - self.vocab_size = vocab_size - self.d_model = d_model - self.encoder_ffn_dim = encoder_ffn_dim - self.encoder_layers = encoder_layers - self.encoder_attention_heads = encoder_attention_heads - self.decoder_ffn_dim = decoder_ffn_dim - self.decoder_layers = decoder_layers - self.decoder_attention_heads = decoder_attention_heads - self.dropout = dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.activation_function = activation_function - self.init_std = init_std - self.encoder_layerdrop = encoder_layerdrop - self.decoder_layerdrop = decoder_layerdrop - self.use_cache = use_cache - self.num_hidden_layers = encoder_layers - self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True - self.max_source_positions = max_source_positions - self.max_target_positions = max_target_positions - self.num_conv_layers = num_conv_layers - self.conv_kernel_sizes = list(conv_kernel_sizes) - self.conv_channels = conv_channels - self.input_feat_per_channel = input_feat_per_channel - self.input_channels = input_channels - + attribute_map = { + "num_attention_heads": "encoder_attention_heads", + "hidden_size": "d_model", + "num_hidden_layers": "encoder_layers", + } + + vocab_size: int = 10000 + encoder_layers: int = 12 + encoder_ffn_dim: int = 2048 + encoder_attention_heads: int = 4 + decoder_layers: int = 6 + decoder_ffn_dim: int = 2048 + decoder_attention_heads: int = 4 + encoder_layerdrop: float | int = 0.0 + decoder_layerdrop: float | int = 0.0 + use_cache: bool = True + is_encoder_decoder: bool = True + activation_function: str = "relu" + d_model: int = 256 + dropout: float | int = 0.1 + attention_dropout: float | int = 0.0 + activation_dropout: float | int = 0.0 + init_std: float = 0.02 + decoder_start_token_id: int = 2 + scale_embedding: bool = True + pad_token_id: int | None = 1 + bos_token_id: int | None = 0 + eos_token_id: int | None = 2 + max_source_positions: int = 6000 + max_target_positions: int = 1024 + num_conv_layers: int = 2 + 
conv_kernel_sizes: list[int] | tuple[int, ...] = (5, 5) + conv_channels: int = 1024 + input_feat_per_channel: int = 80 + input_channels: int = 1 + tie_word_embeddings: bool = True + + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" if len(self.conv_kernel_sizes) != self.num_conv_layers: raise ValueError( "Configuration for convolutional module is incorrect. " @@ -127,12 +104,5 @@ def __init__( f"`config.num_conv_layers = {self.num_conv_layers}`." ) - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.decoder_start_token_id = decoder_start_token_id - self.tie_word_embeddings = tie_word_embeddings - super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) - __all__ = ["Speech2TextConfig"] diff --git a/src/transformers/models/speecht5/configuration_speecht5.py b/src/transformers/models/speecht5/configuration_speecht5.py index e0d984756630..22ca4ec5d0c3 100644 --- a/src/transformers/models/speecht5/configuration_speecht5.py +++ b/src/transformers/models/speecht5/configuration_speecht5.py @@ -16,14 +16,14 @@ import functools import operator -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="microsoft/speecht5_asr") +@strict(accept_kwargs=True) class SpeechT5Config(PreTrainedConfig): r""" positional_dropout (`float`, *optional*, defaults to 0.1): @@ -139,98 +139,71 @@ class SpeechT5Config(PreTrainedConfig): model_type = "speecht5" attribute_map = {"num_attention_heads": "encoder_attention_heads", "num_hidden_layers": "encoder_layers"} - def __init__( - self, - vocab_size=81, - hidden_size=768, - encoder_layers=12, - encoder_attention_heads=12, - encoder_ffn_dim=3072, - encoder_layerdrop=0.1, - decoder_layers=6, - decoder_ffn_dim=3072, - decoder_attention_heads=12, - decoder_layerdrop=0.1, - hidden_act="gelu", - positional_dropout=0.1, - hidden_dropout=0.1, - attention_dropout=0.1, - activation_dropout=0.1, - initializer_range=0.02, - layer_norm_eps=1e-5, - scale_embedding=False, - feat_extract_norm="group", - feat_proj_dropout=0.0, - feat_extract_activation="gelu", - conv_dim=(512, 512, 512, 512, 512, 512, 512), - conv_stride=(5, 2, 2, 2, 2, 2, 2), - conv_kernel=(10, 3, 3, 3, 3, 2, 2), - conv_bias=False, - num_conv_pos_embeddings=128, - num_conv_pos_embedding_groups=16, - apply_spec_augment=True, - mask_time_prob=0.05, - mask_time_length=10, - mask_time_min_masks=2, - mask_feature_prob=0.0, - mask_feature_length=10, - mask_feature_min_masks=0, - pad_token_id=1, - bos_token_id=0, - eos_token_id=2, - decoder_start_token_id=2, - num_mel_bins=80, - speech_decoder_prenet_layers=2, - speech_decoder_prenet_units=256, - speech_decoder_prenet_dropout=0.5, - speaker_embedding_dim=512, - speech_decoder_postnet_layers=5, - speech_decoder_postnet_units=256, - speech_decoder_postnet_kernel=5, - speech_decoder_postnet_dropout=0.5, - reduction_factor=2, - max_speech_positions=4000, - max_text_positions=450, - encoder_max_relative_position=160, - use_guided_attention_loss=True, - guided_attention_loss_num_heads=2, - guided_attention_loss_sigma=0.4, - guided_attention_loss_scale=10.0, - use_cache=True, - is_encoder_decoder=True, - tie_word_embeddings=True, - **kwargs, - ): - self.vocab_size = vocab_size - self.hidden_size = 
hidden_size - self.encoder_layers = encoder_layers - self.encoder_ffn_dim = encoder_ffn_dim - self.encoder_attention_heads = encoder_attention_heads - self.encoder_layerdrop = encoder_layerdrop - self.decoder_layers = decoder_layers - self.decoder_ffn_dim = decoder_ffn_dim - self.decoder_attention_heads = decoder_attention_heads - self.decoder_layerdrop = decoder_layerdrop - self.hidden_act = hidden_act - self.positional_dropout = positional_dropout - self.hidden_dropout = hidden_dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.scale_embedding = scale_embedding - - self.feat_extract_norm = feat_extract_norm - self.feat_proj_dropout = feat_proj_dropout - self.feat_extract_activation = feat_extract_activation - self.conv_dim = list(conv_dim) - self.conv_stride = list(conv_stride) - self.conv_kernel = list(conv_kernel) - self.conv_bias = conv_bias - self.num_conv_pos_embeddings = num_conv_pos_embeddings - self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups + vocab_size: int = 81 + hidden_size: int = 768 + encoder_layers: int = 12 + encoder_attention_heads: int = 12 + encoder_ffn_dim: int = 3072 + encoder_layerdrop: float | int = 0.1 + decoder_layers: int = 6 + decoder_ffn_dim: int = 3072 + decoder_attention_heads: int = 12 + decoder_layerdrop: float | int = 0.1 + hidden_act: str = "gelu" + positional_dropout: float | int = 0.1 + hidden_dropout: float | int = 0.1 + attention_dropout: float | int = 0.1 + activation_dropout: float | int = 0.1 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-5 + scale_embedding: bool = False + feat_extract_norm: str = "group" + feat_proj_dropout: float | int = 0.0 + feat_extract_activation: str = "gelu" + conv_dim: list[int] | tuple[int, ...] = (512, 512, 512, 512, 512, 512, 512) + conv_stride: list[int] | tuple[int, ...] = (5, 2, 2, 2, 2, 2, 2) + conv_kernel: list[int] | tuple[int, ...] = (10, 3, 3, 3, 3, 2, 2) + conv_bias: bool = False + num_conv_pos_embeddings: int = 128 + num_conv_pos_embedding_groups: int = 16 + apply_spec_augment: bool = True + mask_time_prob: float = 0.05 + mask_time_length: int = 10 + mask_time_min_masks: int = 2 + mask_feature_prob: float = 0.0 + mask_feature_length: int = 10 + mask_feature_min_masks: int = 0 + pad_token_id: int | None = 1 + bos_token_id: int | None = 0 + eos_token_id: int | None = 2 + decoder_start_token_id: int | None = 2 + num_mel_bins: int = 80 + speech_decoder_prenet_layers: int = 2 + speech_decoder_prenet_units: int = 256 + speech_decoder_prenet_dropout: float | int = 0.5 + speaker_embedding_dim: int = 512 + speech_decoder_postnet_layers: int = 5 + speech_decoder_postnet_units: int = 256 + speech_decoder_postnet_kernel: int = 5 + speech_decoder_postnet_dropout: float | int = 0.5 + reduction_factor: int = 2 + max_speech_positions: int = 4000 + max_text_positions: int = 450 + encoder_max_relative_position: int = 160 + use_guided_attention_loss: bool = True + guided_attention_loss_num_heads: int = 2 + guided_attention_loss_sigma: float = 0.4 + guided_attention_loss_scale: float = 10.0 + use_cache: bool = True + is_encoder_decoder: bool = True + tie_word_embeddings: bool = True + + def __post_init__(self, **kwargs): self.num_feat_extract_layers = len(self.conv_dim) + super().__post_init__(**kwargs) + def validate_architecture(self): + """Part of `@strict`-powered validation. 
Validates the architecture of the config.""" if ( (len(self.conv_stride) != self.num_feat_extract_layers) or (len(self.conv_kernel) != self.num_feat_extract_layers) @@ -243,51 +216,12 @@ def __init__( f" `len(config.conv_kernel) = {len(self.conv_kernel)}`." ) - # fine-tuning config parameters for SpecAugment: https://huggingface.co/papers/1904.08779 - self.apply_spec_augment = apply_spec_augment - self.mask_time_prob = mask_time_prob - self.mask_time_length = mask_time_length - self.mask_time_min_masks = mask_time_min_masks - self.mask_feature_prob = mask_feature_prob - self.mask_feature_length = mask_feature_length - self.mask_feature_min_masks = mask_feature_min_masks - - self.num_mel_bins = num_mel_bins - self.speech_decoder_prenet_layers = speech_decoder_prenet_layers - self.speech_decoder_prenet_units = speech_decoder_prenet_units - self.speech_decoder_prenet_dropout = speech_decoder_prenet_dropout - self.speaker_embedding_dim = speaker_embedding_dim - - self.speech_decoder_postnet_layers = speech_decoder_postnet_layers - self.speech_decoder_postnet_units = speech_decoder_postnet_units - self.speech_decoder_postnet_kernel = speech_decoder_postnet_kernel - self.speech_decoder_postnet_dropout = speech_decoder_postnet_dropout - self.reduction_factor = reduction_factor - - self.max_speech_positions = max_speech_positions - self.max_text_positions = max_text_positions - self.encoder_max_relative_position = encoder_max_relative_position - - self.use_guided_attention_loss = use_guided_attention_loss - self.guided_attention_loss_num_heads = guided_attention_loss_num_heads - self.guided_attention_loss_sigma = guided_attention_loss_sigma - self.guided_attention_loss_scale = guided_attention_loss_scale - - self.use_cache = use_cache - self.is_encoder_decoder = is_encoder_decoder - self.tie_word_embeddings = tie_word_embeddings - - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.decoder_start_token_id = decoder_start_token_id - super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) - def inputs_to_logits_ratio(self): return functools.reduce(operator.mul, self.conv_stride, 1) @auto_docstring(checkpoint="microsoft/speecht5_asr") +@strict(accept_kwargs=True) class SpeechT5HifiGanConfig(PreTrainedConfig): r""" model_in_dim (`int`, *optional*, defaults to 80): @@ -332,31 +266,16 @@ class SpeechT5HifiGanConfig(PreTrainedConfig): model_type = "hifigan" - def __init__( - self, - model_in_dim=80, - sampling_rate=16000, - upsample_initial_channel=512, - upsample_rates=[4, 4, 4, 4], - upsample_kernel_sizes=[8, 8, 8, 8], - resblock_kernel_sizes=[3, 7, 11], - resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]], - initializer_range=0.01, - leaky_relu_slope=0.1, - normalize_before=True, - **kwargs, - ): - self.model_in_dim = model_in_dim - self.sampling_rate = sampling_rate - self.upsample_initial_channel = upsample_initial_channel - self.upsample_rates = upsample_rates - self.upsample_kernel_sizes = upsample_kernel_sizes - self.resblock_kernel_sizes = resblock_kernel_sizes - self.resblock_dilation_sizes = resblock_dilation_sizes - self.initializer_range = initializer_range - self.leaky_relu_slope = leaky_relu_slope - self.normalize_before = normalize_before - super().__init__(**kwargs) + model_in_dim: int = 80 + sampling_rate: int = 16000 + upsample_initial_channel: int = 512 + upsample_rates: list[int] | tuple[int, ...] = (4, 4, 4, 4) + upsample_kernel_sizes: list[int] | tuple[int, ...] 
= (8, 8, 8, 8) + resblock_kernel_sizes: list[int] | tuple[int, ...] = (3, 7, 11) + resblock_dilation_sizes: list | tuple = ((1, 3, 5), (1, 3, 5), (1, 3, 5)) + initializer_range: float = 0.01 + leaky_relu_slope: float = 0.1 + normalize_before: bool = True __all__ = ["SpeechT5Config", "SpeechT5HifiGanConfig"] diff --git a/src/transformers/models/speecht5/modeling_speecht5.py b/src/transformers/models/speecht5/modeling_speecht5.py index 9e3e0eeeb49a..5980b4093829 100644 --- a/src/transformers/models/speecht5/modeling_speecht5.py +++ b/src/transformers/models/speecht5/modeling_speecht5.py @@ -1286,7 +1286,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict attention_mask = create_bidirectional_mask( config=self.config, @@ -1530,7 +1530,7 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if self.gradient_checkpointing and self.training: if use_cache: @@ -1967,7 +1967,7 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict # Encode if needed (training, first prediction pass) if encoder_outputs is None: @@ -2153,7 +2153,7 @@ def forward( 19.68 ``` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if labels is not None: if decoder_input_ids is None: @@ -2448,7 +2448,7 @@ def forward( torch.Size([15872]) ``` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if labels is not None: if decoder_input_values is None: @@ -2794,7 +2794,7 @@ def forward( torch.Size([77824]) ``` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if labels is not None: if decoder_input_values is None: diff --git a/src/transformers/models/splinter/configuration_splinter.py b/src/transformers/models/splinter/configuration_splinter.py index 5764293ba739..f4a03a436173 100644 --- a/src/transformers/models/splinter/configuration_splinter.py +++ b/src/transformers/models/splinter/configuration_splinter.py @@ -13,14 +13,14 @@ # limitations under the License. 
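The Speech2Text and SpeechT5 hunks above split their cross-field checks into a `validate_architecture` hook, which the `@strict`-powered validation is expected to run at construction time; Speech2Text also aliases `num_hidden_layers` to `encoder_layers` via `attribute_map` instead of assigning it in `__init__`. A hedged sketch of how this should behave (assumption about when the hook fires, not part of the patch):

from transformers import Speech2TextConfig

# Consistent: two conv layers, two kernel sizes.
cfg = Speech2TextConfig(num_conv_layers=2, conv_kernel_sizes=(5, 5))
print(cfg.num_hidden_layers == cfg.encoder_layers)   # expected: True, via the new attribute_map alias

# Inconsistent: three conv layers but only two kernel sizes --
# expected to raise the ValueError defined in validate_architecture.
try:
    Speech2TextConfig(num_conv_layers=3, conv_kernel_sizes=(5, 5))
except ValueError as err:
    print(err)
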
"""Splinter model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="tau/splinter-base") +@strict(accept_kwargs=True) class SplinterConfig(PreTrainedConfig): r""" question_token_id (`int`, *optional*, defaults to 104): @@ -43,44 +43,22 @@ class SplinterConfig(PreTrainedConfig): model_type = "splinter" - def __init__( - self, - vocab_size=30522, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=2, - initializer_range=0.02, - layer_norm_eps=1e-12, - pad_token_id=0, - bos_token_id=None, - eos_token_id=None, - question_token_id=104, - **kwargs, - ): - super().__init__(**kwargs) - - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.initializer_range = initializer_range - self.type_vocab_size = type_vocab_size - self.layer_norm_eps = layer_norm_eps - self.question_token_id = question_token_id + vocab_size: int = 30522 + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.1 + attention_probs_dropout_prob: float = 0.1 + max_position_embeddings: int = 512 + type_vocab_size: int = 2 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + pad_token_id: int | None = 0 + bos_token_id: int | None = None + eos_token_id: int | list[int] | None = None + question_token_id: int = 104 __all__ = ["SplinterConfig"] diff --git a/src/transformers/models/squeezebert/configuration_squeezebert.py b/src/transformers/models/squeezebert/configuration_squeezebert.py index ec8b57a12fde..eed8483e85e7 100644 --- a/src/transformers/models/squeezebert/configuration_squeezebert.py +++ b/src/transformers/models/squeezebert/configuration_squeezebert.py @@ -13,14 +13,14 @@ # limitations under the License. 
"""SqueezeBERT model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="squeezebert/squeezebert-uncased") +@strict(accept_kwargs=True) class SqueezeBertConfig(PreTrainedConfig): r""" q_groups (`int`, *optional*, defaults to 4): @@ -54,58 +54,29 @@ class SqueezeBertConfig(PreTrainedConfig): model_type = "squeezebert" - def __init__( - self, - vocab_size=30522, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=2, - initializer_range=0.02, - layer_norm_eps=1e-12, - pad_token_id=0, - bos_token_id=None, - eos_token_id=None, - embedding_size=768, - q_groups=4, - k_groups=4, - v_groups=4, - post_attention_groups=1, - intermediate_groups=4, - output_groups=4, - tie_word_embeddings=True, - **kwargs, - ): - super().__init__(**kwargs) - - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.embedding_size = embedding_size - self.q_groups = q_groups - self.k_groups = k_groups - self.v_groups = v_groups - self.post_attention_groups = post_attention_groups - self.intermediate_groups = intermediate_groups - self.output_groups = output_groups + vocab_size: int = 30522 + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.1 + attention_probs_dropout_prob: float = 0.1 + max_position_embeddings: int = 512 + type_vocab_size: int = 2 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + pad_token_id: int | None = 0 + bos_token_id: int | None = None + eos_token_id: int | list[int] | None = None + embedding_size: int = 768 + q_groups: int = 4 + k_groups: int = 4 + v_groups: int = 4 + post_attention_groups: int = 1 + intermediate_groups: int = 4 + output_groups: int = 4 + tie_word_embeddings: bool = True __all__ = ["SqueezeBertConfig"] diff --git a/src/transformers/models/squeezebert/modeling_squeezebert.py b/src/transformers/models/squeezebert/modeling_squeezebert.py index 08d8fd48cdf9..5bf2890bcc9e 100644 --- a/src/transformers/models/squeezebert/modeling_squeezebert.py +++ b/src/transformers/models/squeezebert/modeling_squeezebert.py @@ -449,7 +449,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if input_ids is not None and inputs_embeds is not 
None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -537,7 +537,7 @@ def forward( config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.transformer( input_ids, @@ -609,7 +609,7 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.transformer( input_ids, @@ -718,7 +718,7 @@ def forward( num_choices-1]` where *num_choices* is the size of the second dimension of the input tensors. (see *input_ids* above) """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None @@ -796,7 +796,7 @@ def forward( labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.transformer( input_ids, @@ -858,7 +858,7 @@ def forward( return_dict: bool | None = None, **kwargs, ) -> tuple | QuestionAnsweringModelOutput: - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.transformer( input_ids, diff --git a/src/transformers/models/stablelm/configuration_stablelm.py b/src/transformers/models/stablelm/configuration_stablelm.py index bdf8f1c38e49..b174d34b00b2 100644 --- a/src/transformers/models/stablelm/configuration_stablelm.py +++ b/src/transformers/models/stablelm/configuration_stablelm.py @@ -13,15 +13,15 @@ # limitations under the License. 
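The modeling-side edits above only rename the fallback attribute from use_return_dict to return_dict; the resolution idiom itself is unchanged. A small sketch with a hypothetical DummyConfig standing in for a model's self.config:

from dataclasses import dataclass

@dataclass
class DummyConfig:  # hypothetical; stands in for a model's self.config
    return_dict: bool = True

def resolve_return_dict(return_dict: bool | None, config: DummyConfig) -> bool:
    # an explicit argument wins, otherwise fall back to the config default
    return return_dict if return_dict is not None else config.return_dict

cfg = DummyConfig()
print(resolve_return_dict(None, cfg))   # True (from config)
print(resolve_return_dict(False, cfg))  # False (caller override)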
"""StableLM model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters -from ...utils import auto_docstring, logging - - -logger = logging.get_logger(__name__) +from ...utils import auto_docstring @auto_docstring(checkpoint="stabilityai/stablelm-3b-4e1t") +@strict(accept_kwargs=True) class StableLmConfig(PreTrainedConfig): r""" use_parallel_residual (`bool`, *optional*, defaults to `False`): @@ -42,57 +42,31 @@ class StableLmConfig(PreTrainedConfig): model_type = "stablelm" keys_to_ignore_at_inference = ["past_key_values"] - def __init__( - self, - vocab_size: int | None = 50304, - intermediate_size: int | None = 6912, - hidden_size: int | None = 2560, - num_hidden_layers: int | None = 32, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = 32, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 4096, - initializer_range: float | None = 0.02, - layer_norm_eps: float | None = 1.0e-5, - use_cache: bool | None = True, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - use_qkv_bias: bool | None = False, - qk_layernorm: bool | None = False, - use_parallel_residual: bool | None = False, - hidden_dropout: float | None = 0.0, - attention_dropout: float | None = 0.0, - bos_token_id: int | None = 0, - eos_token_id: int | None = 0, - pad_token_id: int | None = None, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings + vocab_size: int = 50304 + intermediate_size: int = 6912 + hidden_size: int = 2560 + num_hidden_layers: int = 32 + num_attention_heads: int = 32 + num_key_value_heads: int = 32 + hidden_act: str = "silu" + max_position_embeddings: int = 4096 + initializer_range: float = 0.02 + layer_norm_eps: float = 1.0e-5 + use_cache: bool = True + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + use_qkv_bias: bool = False + qk_layernorm: bool = False + use_parallel_residual: bool = False + hidden_dropout: float | int = 0.0 + attention_dropout: float | int = 0.0 + bos_token_id: int | None = 0 + eos_token_id: int | list[int] | None = 0 + pad_token_id: int | None = None - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.use_cache = use_cache - self.use_qkv_bias = use_qkv_bias - self.qk_layernorm = qk_layernorm - self.use_parallel_residual = use_parallel_residual - self.hidden_dropout = hidden_dropout - self.attention_dropout = attention_dropout - self.rope_parameters = rope_parameters + def __post_init__(self, **kwargs): kwargs.setdefault("partial_rotary_factor", 0.25) # assign default for BC - - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.tie_word_embeddings = tie_word_embeddings - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["StableLmConfig"] diff --git a/src/transformers/models/starcoder2/configuration_starcoder2.py b/src/transformers/models/starcoder2/configuration_starcoder2.py index 78385e06e735..43111fc86d90 100644 --- a/src/transformers/models/starcoder2/configuration_starcoder2.py +++ 
b/src/transformers/models/starcoder2/configuration_starcoder2.py @@ -13,15 +13,15 @@ # limitations under the License. """Starcoder2 model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters -from ...utils import auto_docstring, logging - - -logger = logging.get_logger(__name__) +from ...utils import auto_docstring @auto_docstring(checkpoint="bigcode/starcoder2-7b") +@strict(accept_kwargs=True) class Starcoder2Config(PreTrainedConfig): r""" use_bias (`bool`, *optional*, defaults to `True`): @@ -57,54 +57,27 @@ class Starcoder2Config(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size: int | None = 49152, - hidden_size: int | None = 3072, - intermediate_size: int | None = 12288, - num_hidden_layers: int | None = 30, - num_attention_heads: int | None = 24, - num_key_value_heads: int | None = 2, - hidden_act: str | None = "gelu_pytorch_tanh", - max_position_embeddings: int | None = 4096, - initializer_range: float | None = 0.018042, - norm_epsilon: int | None = 1e-5, - use_cache: bool | None = True, - bos_token_id: int | None = 50256, - eos_token_id: int | None = 50256, - pad_token_id: int | None = None, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - sliding_window: int | None = None, - attention_dropout: float | None = 0.0, - residual_dropout: float | None = 0.0, - embedding_dropout: float | None = 0.0, - use_bias: bool | None = True, - tie_word_embeddings: bool | None = True, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.sliding_window = sliding_window - self.use_bias = use_bias - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.norm_epsilon = norm_epsilon - self.use_cache = use_cache - self.attention_dropout = attention_dropout - self.residual_dropout = residual_dropout - self.embedding_dropout = embedding_dropout - self.rope_parameters = rope_parameters - - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.tie_word_embeddings = tie_word_embeddings - super().__init__(**kwargs) + vocab_size: int = 49152 + hidden_size: int = 3072 + intermediate_size: int = 12288 + num_hidden_layers: int = 30 + num_attention_heads: int = 24 + num_key_value_heads: int = 2 + hidden_act: str = "gelu_pytorch_tanh" + max_position_embeddings: int = 4096 + initializer_range: float = 0.018042 + norm_epsilon: float = 1e-5 + use_cache: bool = True + bos_token_id: int | None = 50256 + eos_token_id: int | list[int] | None = 50256 + pad_token_id: int | None = None + rope_parameters: RopeParameters | dict | None = None + sliding_window: int | None = None + attention_dropout: float | int = 0.0 + residual_dropout: float | int = 0.0 + embedding_dropout: float | int = 0.0 + use_bias: bool = True + tie_word_embeddings: bool = True __all__ = ["Starcoder2Config"] diff --git a/src/transformers/models/superglue/configuration_superglue.py b/src/transformers/models/superglue/configuration_superglue.py index f30e6c90c8ff..d2198ecf2179 100644 --- a/src/transformers/models/superglue/configuration_superglue.py +++ 
b/src/transformers/models/superglue/configuration_superglue.py @@ -11,20 +11,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import TYPE_CHECKING + +from huggingface_hub.dataclasses import strict from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging +from ...utils import auto_docstring from ..auto import CONFIG_MAPPING, AutoConfig -if TYPE_CHECKING: - from ..superpoint import SuperPointConfig - -logger = logging.get_logger(__name__) - - @auto_docstring(checkpoint="magic-leap-community/superglue_indoor") +@strict(accept_kwargs=True) class SuperGlueConfig(PreTrainedConfig): r""" keypoint_detector_config (`Union[AutoConfig, dict]`, *optional*, defaults to `SuperPointConfig`): @@ -56,51 +52,41 @@ class SuperGlueConfig(PreTrainedConfig): model_type = "superglue" sub_configs = {"keypoint_detector_config": AutoConfig} - def __init__( - self, - keypoint_detector_config: "SuperPointConfig" = None, - hidden_size: int = 256, - keypoint_encoder_sizes: list[int] | None = None, - gnn_layers_types: list[str] | None = None, - num_attention_heads: int = 4, - sinkhorn_iterations: int = 100, - matching_threshold: float = 0.0, - initializer_range: float = 0.02, - is_decoder=False, - **kwargs, - ): - self.gnn_layers_types = gnn_layers_types if gnn_layers_types is not None else ["self", "cross"] * 9 - # Check whether all gnn_layers_types are either 'self' or 'cross' - if not all(layer_type in ["self", "cross"] for layer_type in self.gnn_layers_types): - raise ValueError("All gnn_layers_types must be either 'self' or 'cross'") - - if hidden_size % num_attention_heads != 0: - raise ValueError("hidden_size % num_attention_heads is different from zero") - + keypoint_detector_config: dict | PreTrainedConfig | None = None + hidden_size: int = 256 + keypoint_encoder_sizes: list[int] | None = None + gnn_layers_types: list[str] | None = None + num_attention_heads: int = 4 + sinkhorn_iterations: int = 100 + matching_threshold: float = 0.0 + initializer_range: float = 0.02 + is_decoder: bool = False + attention_probs_dropout_prob: int | float = 0.0 + + def __post_init__(self, **kwargs): + self.gnn_layers_types = self.gnn_layers_types if self.gnn_layers_types is not None else ["self", "cross"] * 9 self.keypoint_encoder_sizes = ( - keypoint_encoder_sizes if keypoint_encoder_sizes is not None else [32, 64, 128, 256] + self.keypoint_encoder_sizes if self.keypoint_encoder_sizes is not None else [32, 64, 128, 256] ) - self.hidden_size = hidden_size - self.keypoint_encoder_sizes = keypoint_encoder_sizes - self.gnn_layers_types = gnn_layers_types - self.num_attention_heads = num_attention_heads - self.sinkhorn_iterations = sinkhorn_iterations - self.matching_threshold = matching_threshold - - if isinstance(keypoint_detector_config, dict): - keypoint_detector_config["model_type"] = keypoint_detector_config.get("model_type", "superpoint") - keypoint_detector_config = CONFIG_MAPPING[keypoint_detector_config["model_type"]]( - **keypoint_detector_config + + if isinstance(self.keypoint_detector_config, dict): + self.keypoint_detector_config["model_type"] = self.keypoint_detector_config.get("model_type", "superpoint") + self.keypoint_detector_config = CONFIG_MAPPING[self.keypoint_detector_config["model_type"]]( + **self.keypoint_detector_config ) - if keypoint_detector_config is None: - keypoint_detector_config = CONFIG_MAPPING["superpoint"]() + 
elif self.keypoint_detector_config is None: + self.keypoint_detector_config = CONFIG_MAPPING["superpoint"]() + + super().__post_init__(**kwargs) - self.keypoint_detector_config = keypoint_detector_config - self.initializer_range = initializer_range - self.attention_probs_dropout_prob = 0 - self.is_decoder = False + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + # Check whether all gnn_layers_types are either 'self' or 'cross' + if not all(layer_type in ["self", "cross"] for layer_type in self.gnn_layers_types): + raise ValueError("All gnn_layers_types must be either 'self' or 'cross'") - super().__init__(**kwargs) + if self.hidden_size % self.num_attention_heads != 0: + raise ValueError("hidden_size % num_attention_heads is different from zero") __all__ = ["SuperGlueConfig"] diff --git a/src/transformers/models/superglue/modeling_superglue.py b/src/transformers/models/superglue/modeling_superglue.py index 5a4d31b77f97..4e9abde9b5cb 100644 --- a/src/transformers/models/superglue/modeling_superglue.py +++ b/src/transformers/models/superglue/modeling_superglue.py @@ -706,7 +706,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if pixel_values.ndim != 5 or pixel_values.size(1) != 2: raise ValueError("Input must be a 5D tensor of shape (batch_size, 2, num_channels, height, width)") diff --git a/src/transformers/models/superpoint/configuration_superpoint.py b/src/transformers/models/superpoint/configuration_superpoint.py index 4258b786b7cd..7a7515fcd974 100644 --- a/src/transformers/models/superpoint/configuration_superpoint.py +++ b/src/transformers/models/superpoint/configuration_superpoint.py @@ -12,14 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="magic-leap-community/superpoint") +@strict(accept_kwargs=True) class SuperPointConfig(PreTrainedConfig): r""" encoder_hidden_sizes (`List`, *optional*, defaults to `[64, 64, 128, 128]`): @@ -49,30 +50,15 @@ class SuperPointConfig(PreTrainedConfig): model_type = "superpoint" - def __init__( - self, - encoder_hidden_sizes: list[int] = [64, 64, 128, 128], - decoder_hidden_size: int = 256, - keypoint_decoder_dim: int = 65, - descriptor_decoder_dim: int = 256, - keypoint_threshold: float = 0.005, - max_keypoints: int = -1, - nms_radius: int = 4, - border_removal_distance: int = 4, - initializer_range=0.02, - **kwargs, - ): - self.encoder_hidden_sizes = encoder_hidden_sizes - self.decoder_hidden_size = decoder_hidden_size - self.keypoint_decoder_dim = keypoint_decoder_dim - self.descriptor_decoder_dim = descriptor_decoder_dim - self.keypoint_threshold = keypoint_threshold - self.max_keypoints = max_keypoints - self.nms_radius = nms_radius - self.border_removal_distance = border_removal_distance - self.initializer_range = initializer_range - - super().__init__(**kwargs) + encoder_hidden_sizes: list[int] | tuple[int, ...] 
= (64, 64, 128, 128) + decoder_hidden_size: int = 256 + keypoint_decoder_dim: int = 65 + descriptor_decoder_dim: int = 256 + keypoint_threshold: float = 0.005 + max_keypoints: int = -1 + nms_radius: int = 4 + border_removal_distance: int = 4 + initializer_range: float = 0.02 __all__ = ["SuperPointConfig"] diff --git a/src/transformers/models/superpoint/modeling_superpoint.py b/src/transformers/models/superpoint/modeling_superpoint.py index 615e55e56eb6..243315c7523c 100644 --- a/src/transformers/models/superpoint/modeling_superpoint.py +++ b/src/transformers/models/superpoint/modeling_superpoint.py @@ -406,7 +406,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict pixel_values = self.extract_one_channel_pixel_values(pixel_values) diff --git a/src/transformers/models/swiftformer/configuration_swiftformer.py b/src/transformers/models/swiftformer/configuration_swiftformer.py index be72000ad3cc..87dc52793fbe 100644 --- a/src/transformers/models/swiftformer/configuration_swiftformer.py +++ b/src/transformers/models/swiftformer/configuration_swiftformer.py @@ -13,14 +13,14 @@ # limitations under the License. """SwiftFormer model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="MBZUAI/swiftformer-xs") +@strict(accept_kwargs=True) class SwiftFormerConfig(PreTrainedConfig): r""" embed_dims (`list[int]`, *optional*, defaults to `[48, 56, 112, 220]`): @@ -59,43 +59,22 @@ class SwiftFormerConfig(PreTrainedConfig): model_type = "swiftformer" - def __init__( - self, - image_size=224, - num_channels=3, - depths=[3, 3, 6, 4], - embed_dims=[48, 56, 112, 220], - mlp_ratio=4, - downsamples=[True, True, True, True], - hidden_act="gelu", - down_patch_size=3, - down_stride=2, - down_pad=1, - drop_path_rate=0.0, - drop_mlp_rate=0.0, - drop_conv_encoder_rate=0.0, - use_layer_scale=True, - layer_scale_init_value=1e-5, - batch_norm_eps=1e-5, - **kwargs, - ): - super().__init__(**kwargs) - self.image_size = image_size - self.num_channels = num_channels - self.depths = depths - self.embed_dims = embed_dims - self.mlp_ratio = mlp_ratio - self.downsamples = downsamples - self.hidden_act = hidden_act - self.down_patch_size = down_patch_size - self.down_stride = down_stride - self.down_pad = down_pad - self.drop_path_rate = drop_path_rate - self.drop_mlp_rate = drop_mlp_rate - self.drop_conv_encoder_rate = drop_conv_encoder_rate - self.use_layer_scale = use_layer_scale - self.layer_scale_init_value = layer_scale_init_value - self.batch_norm_eps = batch_norm_eps + image_size: int | list[int] | tuple[int, int] = 224 + num_channels: int = 3 + depths: list[int] | tuple[int, ...] = (3, 3, 6, 4) + embed_dims: list[int] | tuple[int, ...] = (48, 56, 112, 220) + mlp_ratio: int = 4 + downsamples: list[bool] | tuple[bool, ...] 
= (True, True, True, True) + hidden_act: str = "gelu" + down_patch_size: int | list[int] | tuple[int, int] = 3 + down_stride: int = 2 + down_pad: int = 1 + drop_path_rate: float = 0.0 + drop_mlp_rate: float = 0.0 + drop_conv_encoder_rate: float = 0.0 + use_layer_scale: bool = True + layer_scale_init_value: float = 1e-5 + batch_norm_eps: float = 1e-5 __all__ = ["SwiftFormerConfig"] diff --git a/src/transformers/models/swiftformer/modeling_swiftformer.py b/src/transformers/models/swiftformer/modeling_swiftformer.py index 0592ca33dd4a..f079c4423877 100644 --- a/src/transformers/models/swiftformer/modeling_swiftformer.py +++ b/src/transformers/models/swiftformer/modeling_swiftformer.py @@ -361,7 +361,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict all_hidden_states = (hidden_states,) if output_hidden_states else None @@ -435,7 +435,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -489,7 +489,7 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict # run base model outputs = self.swiftformer( diff --git a/src/transformers/models/swin/configuration_swin.py b/src/transformers/models/swin/configuration_swin.py index da7cadf1ca80..b22ea010d66b 100644 --- a/src/transformers/models/swin/configuration_swin.py +++ b/src/transformers/models/swin/configuration_swin.py @@ -13,15 +13,15 @@ # limitations under the License. 
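SuperGlueConfig above moves its sanity checks into a validate_architecture method described as "part of @strict-powered validation". A simplified, hypothetical sketch of the same checks; the exact hook timing belongs to the @strict machinery in huggingface_hub, so this stand-in simply calls the validator from __post_init__:

from dataclasses import dataclass

@dataclass
class TinyMatcherConfig:  # hypothetical; mirrors SuperGlueConfig.validate_architecture
    hidden_size: int = 256
    num_attention_heads: int = 4
    gnn_layers_types: tuple[str, ...] = ("self", "cross")

    def __post_init__(self):
        # simplification: invoke validation manually instead of via @strict
        self.validate_architecture()

    def validate_architecture(self):
        if not all(layer_type in ("self", "cross") for layer_type in self.gnn_layers_types):
            raise ValueError("All gnn_layers_types must be either 'self' or 'cross'")
        if self.hidden_size % self.num_attention_heads != 0:
            raise ValueError("hidden_size % num_attention_heads is different from zero")

TinyMatcherConfig()                      # passes
# TinyMatcherConfig(hidden_size=250)     # would raise ValueError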
"""Swin Transformer model configuration""" +from huggingface_hub.dataclasses import strict + from ...backbone_utils import BackboneConfigMixin from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - - -logger = logging.get_logger(__name__) +from ...utils import auto_docstring @auto_docstring(checkpoint="microsoft/swin-tiny-patch4-window7-224") +@strict(accept_kwargs=True) class SwinConfig(BackboneConfigMixin, PreTrainedConfig): r""" depths (`list(int)`, *optional*, defaults to `[2, 2, 6, 2]`): @@ -55,54 +55,36 @@ class SwinConfig(BackboneConfigMixin, PreTrainedConfig): "num_hidden_layers": "num_layers", } - def __init__( - self, - image_size=224, - patch_size=4, - num_channels=3, - embed_dim=96, - depths=[2, 2, 6, 2], - num_heads=[3, 6, 12, 24], - window_size=7, - mlp_ratio=4.0, - qkv_bias=True, - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - drop_path_rate=0.1, - hidden_act="gelu", - use_absolute_embeddings=False, - initializer_range=0.02, - layer_norm_eps=1e-5, - encoder_stride=32, - out_features=None, - out_indices=None, - **kwargs, - ): - super().__init__(**kwargs) - - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.embed_dim = embed_dim - self.depths = depths - self.num_layers = len(depths) - self.num_heads = num_heads - self.window_size = window_size - self.mlp_ratio = mlp_ratio - self.qkv_bias = qkv_bias - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.drop_path_rate = drop_path_rate - self.hidden_act = hidden_act - self.use_absolute_embeddings = use_absolute_embeddings - self.layer_norm_eps = layer_norm_eps - self.initializer_range = initializer_range - self.encoder_stride = encoder_stride + image_size: int | list[int] | tuple[int, int] = 224 + patch_size: int | list[int] | tuple[int, int] = 4 + num_channels: int = 3 + embed_dim: int = 96 + depths: list[int] | tuple[int, ...] = (2, 2, 6, 2) + num_heads: list[int] | tuple[int, ...] 
= (3, 6, 12, 24) + window_size: int = 7 + mlp_ratio: float | int = 4.0 + qkv_bias: bool = True + hidden_dropout_prob: float = 0.0 + attention_probs_dropout_prob: float = 0.0 + drop_path_rate: float = 0.1 + hidden_act: str = "gelu" + use_absolute_embeddings: bool = False + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-5 + encoder_stride: int = 32 + _out_features: list[str] | None = None + _out_indices: list[int] | None = None + + def __post_init__(self, **kwargs): + self.num_layers = len(self.depths) # we set the hidden_size attribute in order to make Swin work with VisionEncoderDecoderModel # this indicates the channel dimension after the last stage of the model - self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1)) - self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)] - self.set_output_features_output_indices(out_indices=out_indices, out_features=out_features) + self.hidden_size = int(self.embed_dim * 2 ** (len(self.depths) - 1)) + self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(self.depths) + 1)] + self.set_output_features_output_indices( + out_indices=kwargs.pop("out_indices", None), out_features=kwargs.pop("out_features", None) + ) + super().__post_init__(**kwargs) __all__ = ["SwinConfig"] diff --git a/src/transformers/models/swin/modeling_swin.py b/src/transformers/models/swin/modeling_swin.py index 488a0aa073f6..37a2f5f81f13 100644 --- a/src/transformers/models/swin/modeling_swin.py +++ b/src/transformers/models/swin/modeling_swin.py @@ -862,7 +862,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -970,7 +970,7 @@ def forward( >>> list(reconstructed_pixel_values.shape) [1, 3, 192, 192] ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.swin( pixel_values, @@ -1063,7 +1063,7 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.swin( pixel_values, @@ -1156,7 +1156,7 @@ def forward( >>> list(feature_maps[-1].shape) [1, 768, 7, 7] ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) diff --git a/src/transformers/models/swin2sr/configuration_swin2sr.py b/src/transformers/models/swin2sr/configuration_swin2sr.py index bea7a853819d..8fe0dd03a567 100644 --- a/src/transformers/models/swin2sr/configuration_swin2sr.py +++ b/src/transformers/models/swin2sr/configuration_swin2sr.py @@ -13,14 +13,14 @@ # limitations under the License. 
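SwinConfig above now derives num_layers, hidden_size and stage_names in __post_init__ rather than in __init__. A trimmed, hypothetical TinySwinConfig showing only that derivation:

from dataclasses import dataclass

@dataclass
class TinySwinConfig:  # hypothetical, reduced to the derived attributes only
    embed_dim: int = 96
    depths: tuple[int, ...] = (2, 2, 6, 2)

    def __post_init__(self):
        self.num_layers = len(self.depths)
        # channel dimension after the last stage (kept so Swin works with VisionEncoderDecoderModel)
        self.hidden_size = int(self.embed_dim * 2 ** (len(self.depths) - 1))
        self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(self.depths) + 1)]

cfg = TinySwinConfig()
print(cfg.num_layers, cfg.hidden_size, cfg.stage_names[-1])  # 4 768 stage4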
"""Swin2SR Transformer model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="caidas/swin2sr-classicalsr-x2-64") +@strict(accept_kwargs=True) class Swin2SRConfig(PreTrainedConfig): r""" num_channels_out (`int`, *optional*, defaults to `num_channels`): @@ -64,55 +64,32 @@ class Swin2SRConfig(PreTrainedConfig): "num_hidden_layers": "num_layers", } - def __init__( - self, - image_size=64, - patch_size=1, - num_channels=3, - num_channels_out=None, - embed_dim=180, - depths=[6, 6, 6, 6, 6, 6], - num_heads=[6, 6, 6, 6, 6, 6], - window_size=8, - mlp_ratio=2.0, - qkv_bias=True, - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - drop_path_rate=0.1, - hidden_act="gelu", - use_absolute_embeddings=False, - initializer_range=0.02, - layer_norm_eps=1e-5, - upscale=2, - img_range=1.0, - resi_connection="1conv", - upsampler="pixelshuffle", - **kwargs, - ): - super().__init__(**kwargs) - - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.num_channels_out = num_channels if num_channels_out is None else num_channels_out - self.embed_dim = embed_dim - self.depths = depths - self.num_layers = len(depths) - self.num_heads = num_heads - self.window_size = window_size - self.mlp_ratio = mlp_ratio - self.qkv_bias = qkv_bias - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.drop_path_rate = drop_path_rate - self.hidden_act = hidden_act - self.use_absolute_embeddings = use_absolute_embeddings - self.layer_norm_eps = layer_norm_eps - self.initializer_range = initializer_range - self.upscale = upscale - self.img_range = img_range - self.resi_connection = resi_connection - self.upsampler = upsampler + image_size: int | list[int] | tuple[int, int] = 64 + patch_size: int | list[int] | tuple[int, int] = 1 + num_channels: int = 3 + num_channels_out: int | None = None + embed_dim: int = 180 + depths: list[int] | tuple[int, ...] = (6, 6, 6, 6, 6, 6) + num_heads: list[int] | tuple[int, ...] 
= (6, 6, 6, 6, 6, 6) + window_size: int = 8 + mlp_ratio: float = 2.0 + qkv_bias: bool = True + hidden_dropout_prob: float = 0.0 + attention_probs_dropout_prob: float = 0.0 + drop_path_rate: float = 0.1 + hidden_act: str = "gelu" + use_absolute_embeddings: bool = False + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-5 + upscale: int = 2 + img_range: float = 1.0 + resi_connection: str = "1conv" + upsampler: str = "pixelshuffle" + + def __post_init__(self, **kwargs): + self.num_channels_out = self.num_channels if self.num_channels_out is None else self.num_channels_out + self.num_layers = len(self.depths) + super().__post_init__(**kwargs) __all__ = ["Swin2SRConfig"] diff --git a/src/transformers/models/swin2sr/modeling_swin2sr.py b/src/transformers/models/swin2sr/modeling_swin2sr.py index 0d2d0eff07e5..da2eb0ab5ed8 100644 --- a/src/transformers/models/swin2sr/modeling_swin2sr.py +++ b/src/transformers/models/swin2sr/modeling_swin2sr.py @@ -773,7 +773,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict _, _, height, width = pixel_values.shape @@ -1017,7 +1017,7 @@ def forward( >>> output = (output * 255.0).round().astype(np.uint8) # float32 to uint8 >>> # you can visualize `output` with `Image.fromarray` ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict loss = None if labels is not None: diff --git a/src/transformers/models/swinv2/configuration_swinv2.py b/src/transformers/models/swinv2/configuration_swinv2.py index 4c01883f3709..9f2834f769db 100644 --- a/src/transformers/models/swinv2/configuration_swinv2.py +++ b/src/transformers/models/swinv2/configuration_swinv2.py @@ -13,15 +13,15 @@ # limitations under the License. 
"""Swinv2 Transformer model configuration""" +from huggingface_hub.dataclasses import strict + from ...backbone_utils import BackboneConfigMixin from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - - -logger = logging.get_logger(__name__) +from ...utils import auto_docstring @auto_docstring(checkpoint="microsoft/swinv2-tiny-patch4-window8-256") +@strict(accept_kwargs=True) class Swinv2Config(BackboneConfigMixin, PreTrainedConfig): r""" window_size (`int`, *optional*, defaults to 7): @@ -53,56 +53,37 @@ class Swinv2Config(BackboneConfigMixin, PreTrainedConfig): "num_hidden_layers": "num_layers", } - def __init__( - self, - image_size=224, - patch_size=4, - num_channels=3, - embed_dim=96, - depths=[2, 2, 6, 2], - num_heads=[3, 6, 12, 24], - window_size=7, - pretrained_window_sizes=[0, 0, 0, 0], - mlp_ratio=4.0, - qkv_bias=True, - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - drop_path_rate=0.1, - hidden_act="gelu", - use_absolute_embeddings=False, - initializer_range=0.02, - layer_norm_eps=1e-5, - encoder_stride=32, - out_features=None, - out_indices=None, - **kwargs, - ): - super().__init__(**kwargs) - - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.embed_dim = embed_dim - self.depths = depths - self.num_layers = len(depths) - self.num_heads = num_heads - self.window_size = window_size - self.pretrained_window_sizes = pretrained_window_sizes - self.mlp_ratio = mlp_ratio - self.qkv_bias = qkv_bias - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.drop_path_rate = drop_path_rate - self.hidden_act = hidden_act - self.use_absolute_embeddings = use_absolute_embeddings - self.layer_norm_eps = layer_norm_eps - self.initializer_range = initializer_range - self.encoder_stride = encoder_stride - self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)] - self.set_output_features_output_indices(out_indices=out_indices, out_features=out_features) + image_size: int | list[int] | tuple[int, int] = 224 + patch_size: int | list[int] | tuple[int, int] = 4 + num_channels: int = 3 + embed_dim: int = 96 + depths: list[int] | tuple[int, ...] = (2, 2, 6, 2) + num_heads: list[int] | tuple[int, ...] = (3, 6, 12, 24) + window_size: int = 7 + pretrained_window_sizes: list[int] | tuple[int, ...] 
= (0, 0, 0, 0) + mlp_ratio: float = 4.0 + qkv_bias: bool = True + hidden_dropout_prob: float = 0.0 + attention_probs_dropout_prob: float = 0.0 + drop_path_rate: float = 0.1 + hidden_act: str = "gelu" + use_absolute_embeddings: bool = False + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-5 + encoder_stride: int = 32 + _out_features: list[str] | None = None + _out_indices: list[int] | None = None + + def __post_init__(self, **kwargs): + self.num_layers = len(self.depths) + self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(self.depths) + 1)] + self.set_output_features_output_indices( + out_indices=kwargs.pop("out_indices", None), out_features=kwargs.pop("out_features", None) + ) # we set the hidden_size attribute in order to make Swinv2 work with VisionEncoderDecoderModel # this indicates the channel dimension after the last stage of the model - self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1)) + self.hidden_size = int(self.embed_dim * 2 ** (len(self.depths) - 1)) + super().__post_init__(**kwargs) __all__ = ["Swinv2Config"] diff --git a/src/transformers/models/swinv2/modeling_swinv2.py b/src/transformers/models/swinv2/modeling_swinv2.py index ceda9c97a3e0..9188f95d0dc5 100644 --- a/src/transformers/models/swinv2/modeling_swinv2.py +++ b/src/transformers/models/swinv2/modeling_swinv2.py @@ -948,7 +948,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -1058,7 +1058,7 @@ def forward( >>> list(reconstructed_pixel_values.shape) [1, 3, 256, 256] ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.swinv2( pixel_values, @@ -1152,7 +1152,7 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.swinv2( pixel_values, @@ -1239,7 +1239,7 @@ def forward( >>> list(feature_maps[-1].shape) [1, 2048, 7, 7] ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) diff --git a/src/transformers/models/switch_transformers/configuration_switch_transformers.py b/src/transformers/models/switch_transformers/configuration_switch_transformers.py index 5acd5de1a07f..abec0cf4aabe 100644 --- a/src/transformers/models/switch_transformers/configuration_switch_transformers.py +++ b/src/transformers/models/switch_transformers/configuration_switch_transformers.py @@ -13,14 +13,16 @@ # limitations under the License. 
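Swin2SRConfig above keeps num_channels_out as an optional field and resolves it from num_channels in __post_init__, since a dataclass default cannot reference another field. A minimal, hypothetical sketch of that dependent-default pattern:

from dataclasses import dataclass

@dataclass
class TinySwin2SRConfig:  # hypothetical, showing only the dependent default
    num_channels: int = 3
    num_channels_out: int | None = None

    def __post_init__(self):
        # resolve the None default from its sibling field once all values are known
        if self.num_channels_out is None:
            self.num_channels_out = self.num_channels

print(TinySwin2SRConfig().num_channels_out)                     # 3
print(TinySwin2SRConfig(num_channels_out=1).num_channels_out)   # 1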
"""Switch Transformers model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging +from typing import Literal +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="google/switch-base-8") +@strict(accept_kwargs=True) class SwitchTransformersConfig(PreTrainedConfig): r""" num_sparse_encoder_layers (`int`, *optional*, defaults to 3): @@ -55,56 +57,43 @@ class SwitchTransformersConfig(PreTrainedConfig): keys_to_ignore_at_inference = ["past_key_values"] attribute_map = {"hidden_size": "d_model", "num_attention_heads": "num_heads", "num_hidden_layers": "num_layers"} - def __init__( - self, - vocab_size=32128, - d_model=768, - d_kv=64, - d_ff=2048, - expert_capacity=64, - num_layers=12, - num_sparse_encoder_layers=3, - num_decoder_layers=12, - num_sparse_decoder_layers=3, - num_heads=12, - num_experts=8, - router_bias=False, - router_jitter_noise=0.01, - router_dtype="float32", - router_ignore_padding_tokens=False, - relative_attention_num_buckets=32, - relative_attention_max_distance=128, - dropout_rate=0.1, - layer_norm_epsilon=1e-6, - router_z_loss_coef=0.001, - router_aux_loss_coef=0.001, - initializer_factor=1.0, - dense_act_fn="relu", - is_encoder_decoder=True, - add_router_probs=False, - use_cache=True, - pad_token_id=0, - eos_token_id=1, - bos_token_id=None, - tie_word_embeddings=True, - is_decoder=False, - add_cross_attention=False, - **kwargs, - ): - self.is_decoder = is_decoder - self.add_cross_attention = add_cross_attention - self.vocab_size = vocab_size - self.d_model = d_model - self.d_kv = d_kv - self.d_ff = d_ff - - self.num_sparse_encoder_layers = num_sparse_encoder_layers - - self.num_layers = num_layers + vocab_size: int = 32128 + d_model: int = 768 + d_kv: int = 64 + d_ff: int = 2048 + expert_capacity: int = 64 + num_layers: int = 12 + num_sparse_encoder_layers: int = 3 + num_decoder_layers: int | None = 12 + num_sparse_decoder_layers: int = 3 + num_heads: int = 12 + num_experts: int = 8 + router_bias: bool = False + router_jitter_noise: int | float = 0.01 + router_dtype: Literal["float32", "float16", "bfloat16"] = "float32" + router_ignore_padding_tokens: bool = False + relative_attention_num_buckets: int = 32 + relative_attention_max_distance: int = 128 + dropout_rate: float = 0.1 + layer_norm_epsilon: float = 1e-6 + router_z_loss_coef: float = 0.001 + router_aux_loss_coef: float = 0.001 + initializer_factor: float = 1.0 + dense_act_fn: str = "relu" + is_encoder_decoder: bool = True + add_router_probs: bool = False + use_cache: bool = True + pad_token_id: int | None = 0 + eos_token_id: int | None = 1 + bos_token_id: int | None = None + tie_word_embeddings: bool = True + is_decoder: bool = False + add_cross_attention: bool = False + + def __post_init__(self, **kwargs): self.num_decoder_layers = ( - num_decoder_layers if num_decoder_layers is not None else self.num_layers + self.num_decoder_layers if self.num_decoder_layers is not None else self.num_layers ) # default = symmetry - self.num_sparse_decoder_layers = num_sparse_decoder_layers # This tells us, each how many encoder layer we'll have to set a sparse layer. 
if self.num_sparse_encoder_layers > 0: @@ -118,34 +107,7 @@ def __init__( else: self.decoder_sparse_step = self.num_decoder_layers # HACK: this will create 0 sparse layers - self.num_heads = num_heads - self.num_experts = num_experts - self.expert_capacity = expert_capacity - self.router_bias = router_bias - self.router_jitter_noise = router_jitter_noise - if router_dtype not in ["float32", "float16", "bfloat16"]: - raise ValueError(f"`router_dtype` must be one of 'float32', 'float16' or 'bfloat16', got {router_dtype}") - self.router_dtype = router_dtype - - self.router_ignore_padding_tokens = router_ignore_padding_tokens - self.relative_attention_num_buckets = relative_attention_num_buckets - self.relative_attention_max_distance = relative_attention_max_distance - - self.dropout_rate = dropout_rate - self.layer_norm_epsilon = layer_norm_epsilon - self.initializer_factor = initializer_factor - self.use_cache = use_cache - self.add_router_probs = add_router_probs - - self.router_z_loss_coef = router_z_loss_coef - self.router_aux_loss_coef = router_aux_loss_coef - self.dense_act_fn = dense_act_fn - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - - super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + super().__post_init__(**kwargs) __all__ = ["SwitchTransformersConfig"] diff --git a/src/transformers/models/t5/configuration_t5.py b/src/transformers/models/t5/configuration_t5.py index ca978c27a2ce..35aca7536d48 100644 --- a/src/transformers/models/t5/configuration_t5.py +++ b/src/transformers/models/t5/configuration_t5.py @@ -13,14 +13,14 @@ # limitations under the License. """T5 model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="google-t5/t5-small") +@strict(accept_kwargs=True) class T5Config(PreTrainedConfig): r""" relative_attention_num_buckets (`int`, *optional*, defaults to 32): @@ -41,63 +41,37 @@ class T5Config(PreTrainedConfig): "head_dim": "d_kv", } - def __init__( - self, - vocab_size=32128, - d_model=512, - d_kv=64, - d_ff=2048, - num_layers=6, - num_decoder_layers=None, - num_heads=8, - relative_attention_num_buckets=32, - relative_attention_max_distance=128, - dropout_rate=0.1, - layer_norm_epsilon=1e-6, - initializer_factor=1.0, - feed_forward_proj="relu", - is_encoder_decoder=True, - use_cache=True, - pad_token_id=0, - eos_token_id=1, - classifier_dropout=0.0, - tie_word_embeddings=True, - is_decoder=False, - **kwargs, - ): - self.is_decoder = is_decoder - self.vocab_size = vocab_size - self.d_model = d_model - self.d_kv = d_kv - self.d_ff = d_ff - self.num_layers = num_layers + vocab_size: int = 32128 + d_model: int = 512 + d_kv: int = 64 + d_ff: int = 2048 + num_layers: int = 6 + num_decoder_layers: int | None = None + num_heads: int = 8 + relative_attention_num_buckets: int = 32 + relative_attention_max_distance: int = 128 + dropout_rate: float = 0.1 + layer_norm_epsilon: float = 1e-6 + initializer_factor: float = 1.0 + feed_forward_proj: str = "relu" + is_encoder_decoder: bool = True + use_cache: bool = True + pad_token_id: int | None = 0 + eos_token_id: int | None = 1 + classifier_dropout: float | int = 0.0 + is_decoder: bool = False + + def __post_init__(self, **kwargs): 
self.num_decoder_layers = ( - num_decoder_layers if num_decoder_layers is not None else self.num_layers + self.num_decoder_layers if self.num_decoder_layers is not None else self.num_layers ) # default = symmetry - self.num_heads = num_heads - self.relative_attention_num_buckets = relative_attention_num_buckets - self.relative_attention_max_distance = relative_attention_max_distance - self.dropout_rate = dropout_rate - self.classifier_dropout = classifier_dropout - self.layer_norm_epsilon = layer_norm_epsilon - self.initializer_factor = initializer_factor - self.feed_forward_proj = feed_forward_proj - self.use_cache = use_cache - self.pad_token_id = pad_token_id - self.eos_token_id = eos_token_id act_info = self.feed_forward_proj.split("-") self.dense_act_fn = act_info[-1] self.is_gated_act = act_info[0] == "gated" - if len(act_info) > 1 and act_info[0] != "gated" or len(act_info) > 2: - raise ValueError( - f"`feed_forward_proj`: {feed_forward_proj} is not a valid activation function of the dense layer. " - "Please make sure `feed_forward_proj` is of the format `gated-{ACT_FN}` or `{ACT_FN}`, e.g. " - "'gated-gelu' or 'relu'" - ) # for backwards compatibility - if feed_forward_proj == "gated-gelu": + if self.feed_forward_proj == "gated-gelu": self.dense_act_fn = "gelu_new" # Super weird feature of T5 because we support T5 and T51.1 from the same @@ -105,10 +79,20 @@ def __init__( # The model code was relying on saved configs where `tie_word_embeddings` is # set to `False` in 1.1v and using it as indicator of whether to scale or not # But in fact we tie weights always and force it to be `True` - self.scale_decoder_outputs = tie_word_embeddings is True + self.scale_decoder_outputs = kwargs.pop("tie_word_embeddings", None) is not False self.tie_word_embeddings = True - super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + super().__post_init__(**kwargs) + + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + act_info = self.feed_forward_proj.split("-") + if len(act_info) > 1 and act_info[0] != "gated" or len(act_info) > 2: + raise ValueError( + f"`feed_forward_proj`: {self.feed_forward_proj} is not a valid activation function of the dense layer. " + "Please make sure `feed_forward_proj` is of the format `gated-{ACT_FN}` or `{ACT_FN}`, e.g. 
" + "'gated-gelu' or 'relu'" + ) __all__ = ["T5Config"] diff --git a/src/transformers/models/t5/modeling_t5.py b/src/transformers/models/t5/modeling_t5.py index ccc1da020359..c828c8bc8e31 100644 --- a/src/transformers/models/t5/modeling_t5.py +++ b/src/transformers/models/t5/modeling_t5.py @@ -654,7 +654,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if input_ids is not None and inputs_embeds is not None: err_msg_prefix = "decoder_" if self.is_decoder else "" @@ -895,7 +895,7 @@ def forward( >>> last_hidden_states = outputs.last_hidden_state ```""" use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict # Encode if needed (training, first prediction pass) if encoder_outputs is None: @@ -1063,7 +1063,7 @@ def forward( >>> # studies have shown that owning a dog is good for you. ```""" use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict # Encode if needed (training, first prediction pass) if encoder_outputs is None: @@ -1195,7 +1195,7 @@ def forward( >>> outputs = model(input_ids=input_ids) >>> last_hidden_states = outputs.last_hidden_state ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict encoder_outputs = self.encoder( input_ids=input_ids, @@ -1274,7 +1274,7 @@ def forward( Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if labels is not None: use_cache = False @@ -1398,7 +1398,7 @@ def forward( labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.transformer( input_ids, @@ -1514,7 +1514,7 @@ def forward( Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also be used by default. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict use_cache = use_cache if use_cache is not None else self.config.use_cache if start_positions is not None and end_positions is not None: use_cache = False @@ -1532,7 +1532,7 @@ def forward( decoder_input_ids = self._shift_right(input_ids) use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict # Encode if needed (training, first prediction pass) if encoder_outputs is None: diff --git a/src/transformers/models/t5gemma/configuration_t5gemma.py b/src/transformers/models/t5gemma/configuration_t5gemma.py index 15e6259c22cd..5d98a483d564 100644 --- a/src/transformers/models/t5gemma/configuration_t5gemma.py +++ b/src/transformers/models/t5gemma/configuration_t5gemma.py @@ -20,12 +20,15 @@ # limitations under the License. from typing import Any -from ...configuration_utils import PreTrainedConfig, layer_type_validation +from huggingface_hub.dataclasses import strict + +from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring @auto_docstring(checkpoint="google/t5_gemma_module-7b") +@strict(accept_kwargs=True) class T5GemmaModuleConfig(PreTrainedConfig): r""" query_pre_attn_scalar (`float`, *optional*, defaults to 256): @@ -62,72 +65,52 @@ class T5GemmaModuleConfig(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size: int | None = 256000, - hidden_size: int | None = 2304, - intermediate_size: int | None = 9216, - num_hidden_layers: int | None = 26, - num_attention_heads: int | None = 8, - num_key_value_heads: int | None = 4, - head_dim: int | None = 256, - hidden_activation: str | None = "gelu_pytorch_tanh", - max_position_embeddings: int | None = 8192, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-6, - use_cache: bool | None = True, - pad_token_id: int | None = 0, - eos_token_id: int | None = 1, - bos_token_id: int | None = 2, - tie_word_embeddings: bool | None = True, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - query_pre_attn_scalar: int | None = 256, - sliding_window: int | None = 4096, - layer_types: list[str] | None = None, - final_logit_softcapping: float | None = 30.0, - attn_logit_softcapping: float | None = 50.0, - is_decoder: bool | None = False, - **kwargs, - ): - self.is_decoder = is_decoder - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.head_dim = head_dim - self.num_key_value_heads = num_key_value_heads - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.hidden_activation = hidden_activation - self.query_pre_attn_scalar = query_pre_attn_scalar 
- self.sliding_window = sliding_window - self.final_logit_softcapping = final_logit_softcapping - self.attn_logit_softcapping = attn_logit_softcapping - self.layer_types = layer_types - + vocab_size: int = 256000 + hidden_size: int = 2304 + intermediate_size: int = 9216 + num_hidden_layers: int = 26 + num_attention_heads: int = 8 + num_key_value_heads: int = 4 + head_dim: int = 256 + hidden_activation: str = "gelu_pytorch_tanh" + max_position_embeddings: int = 8192 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-6 + use_cache: bool = True + pad_token_id: int | None = 0 + eos_token_id: int | list[int] | None = 1 + bos_token_id: int | None = 2 + tie_word_embeddings: bool = True + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + attention_dropout: int | float | None = 0.0 + query_pre_attn_scalar: int = 256 + sliding_window: int | None = 4096 + layer_types: list[str] | None = None + final_logit_softcapping: float | None = 30.0 + attn_logit_softcapping: float | None = 50.0 + + is_decoder: bool = False + + def __post_init__(self, **kwargs): if self.layer_types is None: self.layer_types = [ "sliding_attention" if bool((i + 1) % 2) else "full_attention" for i in range(self.num_hidden_layers) ] - layer_type_validation(self.layer_types, self.num_hidden_layers) - self.rope_parameters = rope_parameters + super().__post_init__(**kwargs) - super().__init__(**kwargs) + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + if self.hidden_size % self.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({self.hidden_size}) is not a multiple of the number of attention " + f"heads ({self.num_attention_heads})." + ) @auto_docstring(checkpoint="google/t5_gemma_module-7b") +@strict(accept_kwargs=True) class T5GemmaConfig(PreTrainedConfig): r""" encoder (`Union[T5GemmaModuleConfig, dict]`, optional, *optional*): @@ -147,60 +130,43 @@ class T5GemmaConfig(PreTrainedConfig): keys_to_ignore_at_inference = ["past_key_values"] sub_configs = {"encoder": T5GemmaModuleConfig, "decoder": T5GemmaModuleConfig} - def __init__( - self, - encoder: T5GemmaModuleConfig | dict[Any, Any] | None = None, - decoder: T5GemmaModuleConfig | dict[Any, Any] | None = None, - is_encoder_decoder: bool | None = True, - dropout_rate: float | None = 0.0, - classifier_dropout_rate: float | None = 0.0, - attention_dropout: float | None = 0.0, - tie_word_embeddings: bool | None = True, - vocab_size: int | None = 256000, - **kwargs, - ): - if isinstance(encoder, dict): - encoder = T5GemmaModuleConfig(**encoder) - elif encoder is None: - encoder = T5GemmaModuleConfig() - else: - assert isinstance(encoder, T5GemmaModuleConfig), f"{type(encoder)} is not supported." - - if isinstance(decoder, dict): - decoder = T5GemmaModuleConfig(**decoder) - elif decoder is None: - decoder = encoder - else: - assert isinstance(decoder, T5GemmaModuleConfig), f"{type(decoder)} is not supported." 
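The `T5GemmaModuleConfig` rewrite above replaces the explicit `__init__` with declared dataclass fields, a `__post_init__` hook, and a `validate_architecture` method whose docstring describes it as part of the `@strict`-powered validation. Below is a minimal, illustrative sketch (not part of the patch) of how such a config is expected to behave, assuming the `@strict` decorator runs the `validate_*` hooks when the object is constructed; all concrete values are made up for the example:

```python
from transformers.models.t5gemma.configuration_t5gemma import T5GemmaModuleConfig

# Declared fields act as keyword arguments with the class-level defaults.
config = T5GemmaModuleConfig(num_hidden_layers=4, sliding_window=1024)

# __post_init__ derives layer_types when it is not given: odd layers (1-indexed)
# use sliding attention, even layers use full attention.
print(config.layer_types)
# ['sliding_attention', 'full_attention', 'sliding_attention', 'full_attention']

# validate_architecture rejects a hidden size that does not divide evenly across
# the attention heads (assuming the strict validator surfaces the ValueError
# raised inside the method at construction time).
try:
    T5GemmaModuleConfig(hidden_size=100, num_attention_heads=8)
except ValueError as err:
    print(err)
```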
- - encoder = T5GemmaModuleConfig(**encoder.to_dict()) - decoder = T5GemmaModuleConfig(**decoder.to_dict()) - - encoder.is_decoder = False - encoder.dropout_rate = dropout_rate - encoder.attention_dropout = attention_dropout - self.encoder = encoder - - decoder.is_decoder = True - decoder.use_cache = True - decoder.dropout_rate = dropout_rate - decoder.attention_dropout = attention_dropout - decoder.cross_attention_hidden_size = encoder.hidden_size - self.decoder = decoder + encoder: T5GemmaModuleConfig | dict[Any, Any] | None = None + decoder: T5GemmaModuleConfig | dict[Any, Any] | None = None + is_encoder_decoder: bool = True + dropout_rate: int | float = 0.0 + classifier_dropout_rate: int | float = 0.0 + attention_dropout: float | int = 0.0 + tie_word_embeddings: bool = True + vocab_size: int = 256000 + + def __post_init__(self, **kwargs): + if isinstance(self.encoder, dict): + self.encoder = T5GemmaModuleConfig(**self.encoder) + elif self.encoder is None: + self.encoder = T5GemmaModuleConfig() + + if isinstance(self.decoder, dict): + self.decoder = T5GemmaModuleConfig(**self.decoder) + elif self.decoder is None: + self.decoder = T5GemmaModuleConfig() + + self.encoder.is_decoder = False + self.encoder.dropout_rate = self.dropout_rate + self.encoder.attention_dropout = self.attention_dropout + + self.decoder.is_decoder = True + self.decoder.use_cache = True + self.decoder.dropout_rate = self.dropout_rate + self.decoder.attention_dropout = self.attention_dropout + self.decoder.cross_attention_hidden_size = self.encoder.hidden_size + + self.initializer_range = kwargs.pop("initializer_range", self.decoder.initializer_range) for special_token_key in ["bos_token_id", "pad_token_id", "eos_token_id"]: if special_token_key not in kwargs: - kwargs[special_token_key] = getattr(decoder, special_token_key) - - super().__init__(**kwargs) - - self.is_encoder_decoder = is_encoder_decoder - self.initializer_range = kwargs.get("initializer_range", decoder.initializer_range) - self.classifier_dropout_rate = classifier_dropout_rate - self.tie_word_embeddings = tie_word_embeddings + kwargs[special_token_key] = getattr(self.decoder, special_token_key) - # Used in pipeline generation. - self.vocab_size = vocab_size + super().__post_init__(**kwargs) __all__ = ["T5GemmaConfig", "T5GemmaModuleConfig"] diff --git a/src/transformers/models/t5gemma/modeling_t5gemma.py b/src/transformers/models/t5gemma/modeling_t5gemma.py index 445b1a349934..8f1b1cbf3f63 100644 --- a/src/transformers/models/t5gemma/modeling_t5gemma.py +++ b/src/transformers/models/t5gemma/modeling_t5gemma.py @@ -212,7 +212,7 @@ def eager_attention_forward( key: torch.Tensor, value: torch.Tensor, attention_mask: torch.Tensor | None, - dropout: float = 0.0, + dropout: float | int = 0.0, scaling: float | None = None, softcap: float | None = None, **kwargs, diff --git a/src/transformers/models/t5gemma/modular_t5gemma.py b/src/transformers/models/t5gemma/modular_t5gemma.py index ac448c2e6337..17ebb6ee7108 100644 --- a/src/transformers/models/t5gemma/modular_t5gemma.py +++ b/src/transformers/models/t5gemma/modular_t5gemma.py @@ -17,6 +17,7 @@ import torch import torch.nn as nn +from huggingface_hub.dataclasses import strict from ... 
import initialization as init from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache @@ -38,7 +39,6 @@ SequenceClassifierOutput, TokenClassifierOutput, ) -from ...modeling_rope_utils import RopeParameters from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack from ...utils import ( @@ -60,13 +60,11 @@ ) -_CHECKPOINT_FOR_DOC = "google/t5gemma-2b-2b-prefixlm-it" - - logger = logging.get_logger(__name__) @auto_docstring(checkpoint="google/t5_gemma_module-7b") +@strict(accept_kwargs=True) class T5GemmaModuleConfig(Gemma2Config): r""" query_pre_attn_scalar (`float`, *optional*, defaults to 256): @@ -86,68 +84,12 @@ class T5GemmaModuleConfig(Gemma2Config): >>> configuration = model.config ```""" - def __init__( - self, - vocab_size: int | None = 256000, - hidden_size: int | None = 2304, - intermediate_size: int | None = 9216, - num_hidden_layers: int | None = 26, - num_attention_heads: int | None = 8, - num_key_value_heads: int | None = 4, - head_dim: int | None = 256, - hidden_activation: str | None = "gelu_pytorch_tanh", - max_position_embeddings: int | None = 8192, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-6, - use_cache: bool | None = True, - pad_token_id: int | None = 0, - eos_token_id: int | None = 1, - bos_token_id: int | None = 2, - tie_word_embeddings: bool | None = True, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - query_pre_attn_scalar: int | None = 256, - sliding_window: int | None = 4096, - layer_types: list[str] | None = None, - final_logit_softcapping: float | None = 30.0, - attn_logit_softcapping: float | None = 50.0, - is_decoder: bool | None = False, - **kwargs, - ): - self.is_decoder = is_decoder - super().__init__( - vocab_size=vocab_size, - hidden_size=hidden_size, - intermediate_size=intermediate_size, - num_hidden_layers=num_hidden_layers, - num_attention_heads=num_attention_heads, - num_key_value_heads=num_key_value_heads, - head_dim=head_dim, - hidden_activation=hidden_activation, - max_position_embeddings=max_position_embeddings, - initializer_range=initializer_range, - rms_norm_eps=rms_norm_eps, - use_cache=use_cache, - pad_token_id=pad_token_id, - eos_token_id=eos_token_id, - bos_token_id=bos_token_id, - tie_word_embeddings=tie_word_embeddings, - rope_parameters=rope_parameters, - attention_bias=attention_bias, - attention_dropout=attention_dropout, - query_pre_attn_scalar=query_pre_attn_scalar, - sliding_window=sliding_window, - layer_types=layer_types, - final_logit_softcapping=final_logit_softcapping, - attn_logit_softcapping=attn_logit_softcapping, - **kwargs, - ) - - del self.use_bidirectional_attention + is_decoder: bool = False + use_bidirectional_attention = AttributeError() @auto_docstring(checkpoint="google/t5_gemma_module-7b") +@strict(accept_kwargs=True) class T5GemmaConfig(PreTrainedConfig): r""" encoder (`Union[T5GemmaModuleConfig, dict]`, optional, *optional*): @@ -167,60 +109,43 @@ class T5GemmaConfig(PreTrainedConfig): keys_to_ignore_at_inference = ["past_key_values"] sub_configs = {"encoder": T5GemmaModuleConfig, "decoder": T5GemmaModuleConfig} - def __init__( - self, - encoder: T5GemmaModuleConfig | dict[Any, Any] | None = None, - decoder: T5GemmaModuleConfig | dict[Any, Any] | None = None, - is_encoder_decoder: bool | None = True, - dropout_rate: float | None = 0.0, - classifier_dropout_rate: float | None = 0.0, - attention_dropout: float 
| None = 0.0, - tie_word_embeddings: bool | None = True, - vocab_size: int | None = 256000, - **kwargs, - ): - if isinstance(encoder, dict): - encoder = T5GemmaModuleConfig(**encoder) - elif encoder is None: - encoder = T5GemmaModuleConfig() - else: - assert isinstance(encoder, T5GemmaModuleConfig), f"{type(encoder)} is not supported." - - if isinstance(decoder, dict): - decoder = T5GemmaModuleConfig(**decoder) - elif decoder is None: - decoder = encoder - else: - assert isinstance(decoder, T5GemmaModuleConfig), f"{type(decoder)} is not supported." - - encoder = T5GemmaModuleConfig(**encoder.to_dict()) - decoder = T5GemmaModuleConfig(**decoder.to_dict()) - - encoder.is_decoder = False - encoder.dropout_rate = dropout_rate - encoder.attention_dropout = attention_dropout - self.encoder = encoder - - decoder.is_decoder = True - decoder.use_cache = True - decoder.dropout_rate = dropout_rate - decoder.attention_dropout = attention_dropout - decoder.cross_attention_hidden_size = encoder.hidden_size - self.decoder = decoder + encoder: T5GemmaModuleConfig | dict[Any, Any] | None = None + decoder: T5GemmaModuleConfig | dict[Any, Any] | None = None + is_encoder_decoder: bool = True + dropout_rate: int | float = 0.0 + classifier_dropout_rate: int | float = 0.0 + attention_dropout: float | int = 0.0 + tie_word_embeddings: bool = True + vocab_size: int = 256000 + + def __post_init__(self, **kwargs): + if isinstance(self.encoder, dict): + self.encoder = T5GemmaModuleConfig(**self.encoder) + elif self.encoder is None: + self.encoder = T5GemmaModuleConfig() + + if isinstance(self.decoder, dict): + self.decoder = T5GemmaModuleConfig(**self.decoder) + elif self.decoder is None: + self.decoder = T5GemmaModuleConfig() + + self.encoder.is_decoder = False + self.encoder.dropout_rate = self.dropout_rate + self.encoder.attention_dropout = self.attention_dropout + + self.decoder.is_decoder = True + self.decoder.use_cache = True + self.decoder.dropout_rate = self.dropout_rate + self.decoder.attention_dropout = self.attention_dropout + self.decoder.cross_attention_hidden_size = self.encoder.hidden_size + + self.initializer_range = kwargs.pop("initializer_range", self.decoder.initializer_range) for special_token_key in ["bos_token_id", "pad_token_id", "eos_token_id"]: if special_token_key not in kwargs: - kwargs[special_token_key] = getattr(decoder, special_token_key) - - super().__init__(**kwargs) - - self.is_encoder_decoder = is_encoder_decoder - self.initializer_range = kwargs.get("initializer_range", decoder.initializer_range) - self.classifier_dropout_rate = classifier_dropout_rate - self.tie_word_embeddings = tie_word_embeddings + kwargs[special_token_key] = getattr(self.decoder, special_token_key) - # Used in pipeline generation. - self.vocab_size = vocab_size + super().__post_init__(**kwargs) class T5GemmaRMSNorm(Gemma2RMSNorm): diff --git a/src/transformers/models/t5gemma2/configuration_t5gemma2.py b/src/transformers/models/t5gemma2/configuration_t5gemma2.py index 8b69de4f253a..5a9e90e72604 100644 --- a/src/transformers/models/t5gemma2/configuration_t5gemma2.py +++ b/src/transformers/models/t5gemma2/configuration_t5gemma2.py @@ -20,8 +20,9 @@ # limitations under the License. 
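`T5GemmaConfig.__post_init__` above coerces `encoder`/`decoder` sub-configs passed as plain dicts into `T5GemmaModuleConfig` instances and propagates the shared dropout settings and decoder flags onto each side. A hedged sketch of that behavior (values are illustrative, and the assertions only reflect what the `__post_init__` shown above implies):

```python
from transformers.models.t5gemma.configuration_t5gemma import (
    T5GemmaConfig,
    T5GemmaModuleConfig,
)

# Sub-configs may arrive as dicts (e.g. from a serialized config on the Hub);
# __post_init__ rebuilds them as T5GemmaModuleConfig instances.
config = T5GemmaConfig(
    encoder={"num_hidden_layers": 2},
    decoder={"num_hidden_layers": 2},
    dropout_rate=0.1,
)
assert isinstance(config.encoder, T5GemmaModuleConfig)

# The encoder is forced into encoder mode, the decoder into decoder mode, both
# inherit the top-level dropout, and the decoder records the encoder width for
# cross-attention.
assert config.encoder.is_decoder is False and config.decoder.is_decoder is True
assert config.encoder.dropout_rate == config.decoder.dropout_rate == 0.1
assert config.decoder.cross_attention_hidden_size == config.encoder.hidden_size
```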
from typing import Any -from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters +from huggingface_hub.dataclasses import strict + +from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring, logging from ..siglip import SiglipVisionConfig @@ -30,6 +31,7 @@ @auto_docstring(checkpoint="google/t5gemma-2-270m-270m") +@strict(accept_kwargs=True) class T5Gemma2TextConfig(PreTrainedConfig): r""" query_pre_attn_scalar (`float`, *optional*, defaults to 256): @@ -58,72 +60,53 @@ class T5Gemma2TextConfig(PreTrainedConfig): "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), "norm": (["hidden_states"], ["hidden_states"]), } - default_theta = {"global": 1_000_000.0, "local": 10_000.0} - def __init__( - self, - vocab_size: int | None = 262_208, - hidden_size: int | None = 2304, - intermediate_size: int | None = 9216, - num_hidden_layers: int | None = 26, - num_attention_heads: int | None = 8, - num_key_value_heads: int | None = 4, - head_dim: int | None = 256, - hidden_activation: str | None = "gelu_pytorch_tanh", - max_position_embeddings: int | None = 131_072, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-6, - use_cache: bool | None = True, - pad_token_id: int | None = 0, - eos_token_id: int | None = 1, - bos_token_id: int | None = 2, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - query_pre_attn_scalar: int | None = 256, - sliding_window: int | None = 4096, - layer_types: list[str] | None = None, - final_logit_softcapping: float | None = None, - attn_logit_softcapping: float | None = None, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - **kwargs, - ): - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.head_dim = head_dim - self.num_key_value_heads = num_key_value_heads - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.hidden_activation = hidden_activation - self.query_pre_attn_scalar = query_pre_attn_scalar - self.sliding_window = sliding_window - self.final_logit_softcapping = final_logit_softcapping - self.attn_logit_softcapping = attn_logit_softcapping - self.layer_types = layer_types + vocab_size: int = 262_208 + hidden_size: int = 2304 + intermediate_size: int = 9216 + num_hidden_layers: int = 26 + num_attention_heads: int = 8 + num_key_value_heads: int = 4 + head_dim: int = 256 + hidden_activation: str = "gelu_pytorch_tanh" + max_position_embeddings: int = 131_072 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-6 + use_cache: bool = True + pad_token_id: int | None = 0 + eos_token_id: int | list[int] | None = 1 + bos_token_id: int | None = 2 + tie_word_embeddings: bool = True + rope_parameters: dict | None = None + attention_bias: bool = False + attention_dropout: int | float | None = 0.0 + query_pre_attn_scalar: int = 256 + sliding_window: int | None = 4096 + layer_types: list[str] | None = None + final_logit_softcapping: float | None = None + attn_logit_softcapping: float | None = None + default_theta = {"global": 
1_000_000.0, "local": 10_000.0} + def __post_init__(self, **kwargs): # BC -> the pattern used to be a simple int, and it's still present in configs on the Hub - self._sliding_window_pattern = kwargs.get("sliding_window_pattern", 6) - + _sliding_window_pattern = kwargs.pop("sliding_window_pattern", 6) if self.layer_types is None: self.layer_types = [ - "sliding_attention" if bool((i + 1) % self._sliding_window_pattern) else "full_attention" + "sliding_attention" if bool((i + 1) % _sliding_window_pattern) else "full_attention" for i in range(self.num_hidden_layers) ] - layer_type_validation(self.layer_types, self.num_hidden_layers) - self.rope_parameters = rope_parameters - super().__init__(**kwargs) + super().__post_init__(**kwargs) + + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + if self.hidden_size % self.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({self.hidden_size}) is not a multiple of the number of attention " + f"heads ({self.num_attention_heads})." + ) - def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation=None, **kwargs): + def convert_rope_params_to_dict(self, **kwargs): rope_scaling = kwargs.pop("rope_scaling", None) # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` @@ -150,11 +133,11 @@ def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation=None, **kwa # Standardize and validate the correctness of rotary position embeddings parameters self.standardize_rope_params() - self.validate_rope(ignore_keys=ignore_keys_at_rope_validation) return kwargs @auto_docstring(checkpoint="google/t5gemma-2-270m-270m") +@strict(accept_kwargs=True) class T5Gemma2EncoderConfig(PreTrainedConfig): r""" mm_tokens_per_image (`int`, *optional*, defaults to 256): @@ -197,43 +180,33 @@ class T5Gemma2EncoderConfig(PreTrainedConfig): "vision_config": SiglipVisionConfig, } - def __init__( - self, - text_config: T5Gemma2TextConfig | dict[str, Any] | None = None, - vision_config: SiglipVisionConfig | dict[str, Any] | None = None, - mm_tokens_per_image: int | None = 256, - boi_token_index: int | None = 255_999, - eoi_token_index: int | None = 256_000, - image_token_index: int | None = 262_144, - initializer_range: float | None = 0.02, - tie_word_embeddings: bool | None = True, - **kwargs, - ): - if text_config is None: - text_config = T5Gemma2TextConfig() + text_config: T5Gemma2TextConfig | dict[str, Any] | None = None + vision_config: SiglipVisionConfig | dict[str, Any] | None = None + mm_tokens_per_image: int | None = 256 + boi_token_index: int | None = 255_999 + eoi_token_index: int | None = 256_000 + image_token_index: int | None = 262_144 + initializer_range: float | None = 0.02 + tie_word_embeddings: bool | None = True + + def __post_init__(self, **kwargs): + if self.text_config is None: + self.text_config = T5Gemma2TextConfig() logger.info("text_config is None, using default T5Gemma2EncoderTextConfig text config.") - elif isinstance(text_config, dict): - text_config = T5Gemma2TextConfig(**text_config) + elif isinstance(self.text_config, dict): + self.text_config = T5Gemma2TextConfig(**self.text_config) - if isinstance(vision_config, dict): - vision_config = SiglipVisionConfig(**vision_config) - elif vision_config is None: - vision_config = SiglipVisionConfig() + if isinstance(self.vision_config, dict): + self.vision_config = SiglipVisionConfig(**self.vision_config) + elif self.vision_config is None: + self.vision_config = 
SiglipVisionConfig() logger.info("vision_config is None, using default SiglipVisionConfig vision config.") - self.text_config = text_config - self.vision_config = vision_config - self.mm_tokens_per_image = mm_tokens_per_image - self.boi_token_index = boi_token_index - self.eoi_token_index = eoi_token_index - self.image_token_index = image_token_index - self.initializer_range = initializer_range - self.tie_word_embeddings = tie_word_embeddings - - super().__init__(**kwargs) + super().__post_init__(**kwargs) @auto_docstring(checkpoint="google/t5gemma-2-270m-270m") +@strict(accept_kwargs=True) class T5Gemma2DecoderConfig(PreTrainedConfig): r""" query_pre_attn_scalar (`float`, *optional*, defaults to 256): @@ -262,72 +235,53 @@ class T5Gemma2DecoderConfig(PreTrainedConfig): "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), "norm": (["hidden_states"], ["hidden_states"]), } - default_theta = {"global": 1_000_000.0, "local": 10_000.0} - def __init__( - self, - vocab_size: int | None = 262_208, - hidden_size: int | None = 2304, - intermediate_size: int | None = 9216, - num_hidden_layers: int | None = 26, - num_attention_heads: int | None = 8, - num_key_value_heads: int | None = 4, - head_dim: int | None = 256, - hidden_activation: str | None = "gelu_pytorch_tanh", - max_position_embeddings: int | None = 131_072, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-6, - use_cache: bool | None = True, - pad_token_id: int | None = 0, - eos_token_id: int | None = 1, - bos_token_id: int | None = 2, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - query_pre_attn_scalar: int | None = 256, - sliding_window: int | None = 4096, - layer_types: list[str] | None = None, - final_logit_softcapping: float | None = None, - attn_logit_softcapping: float | None = None, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - **kwargs, - ): - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.head_dim = head_dim - self.num_key_value_heads = num_key_value_heads - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.hidden_activation = hidden_activation - self.query_pre_attn_scalar = query_pre_attn_scalar - self.sliding_window = sliding_window - self.final_logit_softcapping = final_logit_softcapping - self.attn_logit_softcapping = attn_logit_softcapping - self.layer_types = layer_types + vocab_size: int = 262_208 + hidden_size: int = 2304 + intermediate_size: int = 9216 + num_hidden_layers: int = 26 + num_attention_heads: int = 8 + num_key_value_heads: int = 4 + head_dim: int = 256 + hidden_activation: str = "gelu_pytorch_tanh" + max_position_embeddings: int = 131_072 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-6 + use_cache: bool = True + pad_token_id: int | None = 0 + eos_token_id: int | list[int] | None = 1 + bos_token_id: int | None = 2 + tie_word_embeddings: bool = True + rope_parameters: dict | None = None + attention_bias: bool = False + attention_dropout: int | float | None = 0.0 + query_pre_attn_scalar: int = 256 + sliding_window: int | None = 
4096 + layer_types: list[str] | None = None + final_logit_softcapping: float | None = None + attn_logit_softcapping: float | None = None + default_theta = {"global": 1_000_000.0, "local": 10_000.0} + def __post_init__(self, **kwargs): # BC -> the pattern used to be a simple int, and it's still present in configs on the Hub - self._sliding_window_pattern = kwargs.get("sliding_window_pattern", 6) - + _sliding_window_pattern = kwargs.pop("sliding_window_pattern", 6) if self.layer_types is None: self.layer_types = [ - "sliding_attention" if bool((i + 1) % self._sliding_window_pattern) else "full_attention" + "sliding_attention" if bool((i + 1) % _sliding_window_pattern) else "full_attention" for i in range(self.num_hidden_layers) ] - layer_type_validation(self.layer_types, self.num_hidden_layers) - self.rope_parameters = rope_parameters - super().__init__(**kwargs) + super().__post_init__(**kwargs) + + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + if self.hidden_size % self.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({self.hidden_size}) is not a multiple of the number of attention " + f"heads ({self.num_attention_heads})." + ) - def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation=None, **kwargs): + def convert_rope_params_to_dict(self, **kwargs): rope_scaling = kwargs.pop("rope_scaling", None) # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` @@ -354,17 +308,20 @@ def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation=None, **kwa # Standardize and validate the correctness of rotary position embeddings parameters self.standardize_rope_params() - self.validate_rope(ignore_keys=ignore_keys_at_rope_validation) return kwargs @auto_docstring(checkpoint="google/t5gemma-2-270m-270m") +@strict(accept_kwargs=True) class T5Gemma2Config(PreTrainedConfig): r""" encoder (`Union[T5Gemma2EncoderConfig, dict]`, optional, *optional*): Configuration for the encoder. decoder (`Union[T5Gemma2DecoderConfig, dict]`, optional, *optional*): Configuration for the decoder. + eoi_token_index (`int`, *optional*): + The end-of-image token index to wrap the image prompt. 
Will be same as + `self.encoder.eoi_token_index` ```python >>> from transformers import T5Gemma2Config, T5Gemma2Model @@ -386,75 +343,61 @@ class T5Gemma2Config(PreTrainedConfig): "eoi_token_id": "eoi_token_index", } - def __init__( - self, - encoder: T5Gemma2EncoderConfig | dict[str, Any] | None = None, - decoder: T5Gemma2DecoderConfig | dict[str, Any] | None = None, - is_encoder_decoder: bool = True, - dropout_rate: float = 0.0, - attention_dropout: float = 0.0, - classifier_dropout_rate: float = 0.0, - initializer_range: float = 0.02, - image_token_index: int = 256_001, - tie_word_embeddings: bool | None = True, - **kwargs, - ): - if isinstance(encoder, dict): - encoder = T5Gemma2EncoderConfig(**encoder) - elif encoder is None: - encoder = T5Gemma2EncoderConfig() + encoder: T5Gemma2EncoderConfig | dict[str, Any] | None = None + decoder: T5Gemma2DecoderConfig | dict[str, Any] | None = None + is_encoder_decoder: bool = True + dropout_rate: float = 0.0 + attention_dropout: float | int = 0.0 + classifier_dropout_rate: float = 0.0 + initializer_range: float = 0.02 + image_token_index: int = 256_001 + eoi_token_index: int | None = None + tie_word_embeddings: bool = True + + def __post_init__(self, **kwargs): + if isinstance(self.encoder, dict): + self.encoder = T5Gemma2EncoderConfig(**self.encoder) + elif self.encoder is None: + self.encoder = T5Gemma2EncoderConfig() logger.info("encoder is None, using default T5Gemma2EncoderConfig encoder config.") - else: - if not isinstance(encoder, T5Gemma2EncoderConfig): - raise ValueError(f"{type(encoder)} is not supported.") - - if isinstance(decoder, dict): - decoder = T5Gemma2DecoderConfig(**decoder) - elif decoder is None: - decoder = T5Gemma2DecoderConfig() + + if isinstance(self.decoder, dict): + self.decoder = T5Gemma2DecoderConfig(**self.decoder) + elif self.decoder is None: + self.decoder = T5Gemma2DecoderConfig() logger.info("decoder is None, using default T5Gemma2DecoderConfig decoder config.") - else: - if not isinstance(decoder, T5Gemma2DecoderConfig): - raise ValueError(f"{type(decoder)} is not supported.") - if encoder.text_config.hidden_size != decoder.hidden_size: + self.encoder.text_config.dropout_rate = self.dropout_rate + self.encoder.text_config.attention_dropout = self.attention_dropout + self.encoder.vision_config.attention_dropout = self.attention_dropout + self.encoder.image_token_index = self.image_token_index + + self.decoder.dropout_rate = self.dropout_rate + self.decoder.attention_dropout = self.attention_dropout + self.eoi_token_index = self.encoder.eoi_token_index + + for special_token_key in ["bos_token_id", "pad_token_id", "eos_token_id", "vocab_size"]: + if special_token_key not in kwargs: + kwargs[special_token_key] = getattr(self.decoder, special_token_key) + + super().__post_init__(**kwargs) + + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + if self.encoder.text_config.hidden_size != self.decoder.hidden_size: raise ValueError( "Imbalanced encoder-decoder is not supported in T5Gemma2: " - f"encoder ({encoder.text_config.hidden_size}) vs decoder ({decoder.hidden_size})." + f"encoder ({self.encoder.text_config.hidden_size}) vs decoder ({self.decoder.hidden_size})." 
) - if not is_encoder_decoder: + if not self.is_encoder_decoder: raise ValueError("T5Gemma2Model only support encoder-decoder modeling.") - if encoder.text_config.vocab_size != decoder.vocab_size: + if self.encoder.text_config.vocab_size != self.decoder.vocab_size: raise ValueError( "Imbalanced encoder-decoder vocabulary size is not supported in T5Gemma2: " - f"encoder ({encoder.text_config.vocab_size}) vs decoder ({decoder.vocab_size})." + f"encoder ({self.encoder.text_config.vocab_size}) vs decoder ({self.decoder.vocab_size})." ) - # Encoder. - encoder.text_config.dropout_rate = dropout_rate - encoder.text_config.attention_dropout = attention_dropout - encoder.vision_config.attention_dropout = attention_dropout - encoder.image_token_index = image_token_index - self.encoder = encoder - - # Decoder. - decoder.dropout_rate = dropout_rate - decoder.attention_dropout = attention_dropout - self.decoder = decoder - - for special_token_key in ["bos_token_id", "pad_token_id", "eos_token_id", "vocab_size"]: - if special_token_key not in kwargs: - kwargs[special_token_key] = getattr(decoder, special_token_key) - - self.classifier_dropout_rate = classifier_dropout_rate - self.initializer_range = initializer_range - self.eoi_token_index = encoder.eoi_token_index - self.image_token_index = image_token_index - self.tie_word_embeddings = tie_word_embeddings - - super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) - __all__ = ["T5Gemma2Config", "T5Gemma2TextConfig", "T5Gemma2EncoderConfig", "T5Gemma2DecoderConfig"] diff --git a/src/transformers/models/t5gemma2/modeling_t5gemma2.py b/src/transformers/models/t5gemma2/modeling_t5gemma2.py index 62ea0e03696d..2582dfac7d99 100644 --- a/src/transformers/models/t5gemma2/modeling_t5gemma2.py +++ b/src/transformers/models/t5gemma2/modeling_t5gemma2.py @@ -223,7 +223,7 @@ def eager_attention_forward( key: torch.Tensor, value: torch.Tensor, attention_mask: torch.Tensor | None, - dropout: float = 0.0, + dropout: float | int = 0.0, scaling: float | None = None, softcap: float | None = None, **kwargs, diff --git a/src/transformers/models/t5gemma2/modular_t5gemma2.py b/src/transformers/models/t5gemma2/modular_t5gemma2.py index 37d0da71458e..90b172e9b4d3 100644 --- a/src/transformers/models/t5gemma2/modular_t5gemma2.py +++ b/src/transformers/models/t5gemma2/modular_t5gemma2.py @@ -18,10 +18,11 @@ import torch import torch.nn as nn +from huggingface_hub.dataclasses import strict from ... 
import initialization as init from ...cache_utils import DynamicCache, EncoderDecoderCache, StaticCache -from ...configuration_utils import PreTrainedConfig, layer_type_validation +from ...configuration_utils import PreTrainedConfig from ...generation import GenerationConfig, GenerationMixin, GenerationMode from ...masking_utils import create_bidirectional_mask from ...modeling_flash_attention_utils import FlashAttentionKwargs @@ -34,7 +35,7 @@ SequenceClassifierOutput, TokenClassifierOutput, ) -from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, RopeParameters +from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack from ...utils import ( @@ -73,6 +74,7 @@ @auto_docstring(checkpoint="google/t5gemma-2-270m-270m") +@strict(accept_kwargs=True) class T5Gemma2TextConfig(Gemma3TextConfig, PreTrainedConfig): r""" query_pre_attn_scalar (`float`, *optional*, defaults to 256): @@ -84,72 +86,22 @@ class T5Gemma2TextConfig(Gemma3TextConfig, PreTrainedConfig): """ model_type = "t5gemma2_text" + use_bidirectional_attention = AttributeError() - def __init__( - self, - vocab_size: int | None = 262_208, - hidden_size: int | None = 2304, - intermediate_size: int | None = 9216, - num_hidden_layers: int | None = 26, - num_attention_heads: int | None = 8, - num_key_value_heads: int | None = 4, - head_dim: int | None = 256, - hidden_activation: str | None = "gelu_pytorch_tanh", - max_position_embeddings: int | None = 131_072, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-6, - use_cache: bool | None = True, - pad_token_id: int | None = 0, - eos_token_id: int | None = 1, - bos_token_id: int | None = 2, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - query_pre_attn_scalar: int | None = 256, - sliding_window: int | None = 4096, - layer_types: list[str] | None = None, - final_logit_softcapping: float | None = None, - attn_logit_softcapping: float | None = None, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - **kwargs, - ): - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.head_dim = head_dim - self.num_key_value_heads = num_key_value_heads - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.hidden_activation = hidden_activation - self.query_pre_attn_scalar = query_pre_attn_scalar - self.sliding_window = sliding_window - self.final_logit_softcapping = final_logit_softcapping - self.attn_logit_softcapping = attn_logit_softcapping - self.layer_types = layer_types - + def __post_init__(self, **kwargs): # BC -> the pattern used to be a simple int, and it's still present in configs on the Hub - self._sliding_window_pattern = kwargs.get("sliding_window_pattern", 6) - + _sliding_window_pattern = kwargs.pop("sliding_window_pattern", 6) if self.layer_types is None: self.layer_types = [ - "sliding_attention" if bool((i + 1) % self._sliding_window_pattern) else "full_attention" + "sliding_attention" if bool((i + 1) % _sliding_window_pattern) else "full_attention" 
for i in range(self.num_hidden_layers) ] - layer_type_validation(self.layer_types, self.num_hidden_layers) - self.rope_parameters = rope_parameters - PreTrainedConfig.__init__(**kwargs) + PreTrainedConfig.__post_init__(**kwargs) @auto_docstring(checkpoint="google/t5gemma-2-270m-270m") +@strict(accept_kwargs=True) class T5Gemma2EncoderConfig(Gemma3Config): model_type = "t5gemma2_encoder" @@ -160,6 +112,7 @@ class T5Gemma2EncoderConfig(Gemma3Config): @auto_docstring(checkpoint="google/t5gemma-2-270m-270m") +@strict(accept_kwargs=True) class T5Gemma2DecoderConfig(Gemma3TextConfig, PreTrainedConfig): r""" query_pre_attn_scalar (`float`, *optional*, defaults to 256): @@ -171,78 +124,31 @@ class T5Gemma2DecoderConfig(Gemma3TextConfig, PreTrainedConfig): """ model_type = "t5gemma2_decoder" + use_bidirectional_attention = AttributeError() - def __init__( - self, - vocab_size: int | None = 262_208, - hidden_size: int | None = 2304, - intermediate_size: int | None = 9216, - num_hidden_layers: int | None = 26, - num_attention_heads: int | None = 8, - num_key_value_heads: int | None = 4, - head_dim: int | None = 256, - hidden_activation: str | None = "gelu_pytorch_tanh", - max_position_embeddings: int | None = 131_072, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-6, - use_cache: bool | None = True, - pad_token_id: int | None = 0, - eos_token_id: int | None = 1, - bos_token_id: int | None = 2, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - query_pre_attn_scalar: int | None = 256, - sliding_window: int | None = 4096, - layer_types: list[str] | None = None, - final_logit_softcapping: float | None = None, - attn_logit_softcapping: float | None = None, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - **kwargs, - ): - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.head_dim = head_dim - self.num_key_value_heads = num_key_value_heads - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.hidden_activation = hidden_activation - self.query_pre_attn_scalar = query_pre_attn_scalar - self.sliding_window = sliding_window - self.final_logit_softcapping = final_logit_softcapping - self.attn_logit_softcapping = attn_logit_softcapping - self.layer_types = layer_types - + def __post_init__(self, **kwargs): # BC -> the pattern used to be a simple int, and it's still present in configs on the Hub - self._sliding_window_pattern = kwargs.get("sliding_window_pattern", 6) - + _sliding_window_pattern = kwargs.pop("sliding_window_pattern", 6) if self.layer_types is None: self.layer_types = [ - "sliding_attention" if bool((i + 1) % self._sliding_window_pattern) else "full_attention" + "sliding_attention" if bool((i + 1) % _sliding_window_pattern) else "full_attention" for i in range(self.num_hidden_layers) ] - layer_type_validation(self.layer_types, self.num_hidden_layers) - self.rope_parameters = rope_parameters - PreTrainedConfig.__init__(**kwargs) + PreTrainedConfig.__post_init__(**kwargs) @auto_docstring(checkpoint="google/t5gemma-2-270m-270m") +@strict(accept_kwargs=True) 
class T5Gemma2Config(PreTrainedConfig): r""" encoder (`Union[T5Gemma2EncoderConfig, dict]`, optional, *optional*): Configuration for the encoder. decoder (`Union[T5Gemma2DecoderConfig, dict]`, optional, *optional*): Configuration for the decoder. + eoi_token_index (`int`, *optional*): + The end-of-image token index to wrap the image prompt. Will be same as + `self.encoder.eoi_token_index` ```python >>> from transformers import T5Gemma2Config, T5Gemma2Model @@ -264,76 +170,62 @@ class T5Gemma2Config(PreTrainedConfig): "eoi_token_id": "eoi_token_index", } - def __init__( - self, - encoder: T5Gemma2EncoderConfig | dict[str, Any] | None = None, - decoder: T5Gemma2DecoderConfig | dict[str, Any] | None = None, - is_encoder_decoder: bool = True, - dropout_rate: float = 0.0, - attention_dropout: float = 0.0, - classifier_dropout_rate: float = 0.0, - initializer_range: float = 0.02, - image_token_index: int = 256_001, - tie_word_embeddings: bool | None = True, - **kwargs, - ): - if isinstance(encoder, dict): - encoder = T5Gemma2EncoderConfig(**encoder) - elif encoder is None: - encoder = T5Gemma2EncoderConfig() + encoder: T5Gemma2EncoderConfig | dict[str, Any] | None = None + decoder: T5Gemma2DecoderConfig | dict[str, Any] | None = None + is_encoder_decoder: bool = True + dropout_rate: float = 0.0 + attention_dropout: float | int = 0.0 + classifier_dropout_rate: float = 0.0 + initializer_range: float = 0.02 + image_token_index: int = 256_001 + eoi_token_index: int | None = None + tie_word_embeddings: bool = True + + def __post_init__(self, **kwargs): + if isinstance(self.encoder, dict): + self.encoder = T5Gemma2EncoderConfig(**self.encoder) + elif self.encoder is None: + self.encoder = T5Gemma2EncoderConfig() logger.info("encoder is None, using default T5Gemma2EncoderConfig encoder config.") - else: - if not isinstance(encoder, T5Gemma2EncoderConfig): - raise ValueError(f"{type(encoder)} is not supported.") - if isinstance(decoder, dict): - decoder = T5Gemma2DecoderConfig(**decoder) - elif decoder is None: - decoder = T5Gemma2DecoderConfig() + if isinstance(self.decoder, dict): + self.decoder = T5Gemma2DecoderConfig(**self.decoder) + elif self.decoder is None: + self.decoder = T5Gemma2DecoderConfig() logger.info("decoder is None, using default T5Gemma2DecoderConfig decoder config.") - else: - if not isinstance(decoder, T5Gemma2DecoderConfig): - raise ValueError(f"{type(decoder)} is not supported.") - if encoder.text_config.hidden_size != decoder.hidden_size: + self.encoder.text_config.dropout_rate = self.dropout_rate + self.encoder.text_config.attention_dropout = self.attention_dropout + self.encoder.vision_config.attention_dropout = self.attention_dropout + self.encoder.image_token_index = self.image_token_index + + self.decoder.dropout_rate = self.dropout_rate + self.decoder.attention_dropout = self.attention_dropout + self.eoi_token_index = self.encoder.eoi_token_index + + for special_token_key in ["bos_token_id", "pad_token_id", "eos_token_id", "vocab_size"]: + if special_token_key not in kwargs: + kwargs[special_token_key] = getattr(self.decoder, special_token_key) + + super().__post_init__(**kwargs) + + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + if self.encoder.text_config.hidden_size != self.decoder.hidden_size: raise ValueError( "Imbalanced encoder-decoder is not supported in T5Gemma2: " - f"encoder ({encoder.text_config.hidden_size}) vs decoder ({decoder.hidden_size})." 
+ f"encoder ({self.encoder.text_config.hidden_size}) vs decoder ({self.decoder.hidden_size})." ) - if not is_encoder_decoder: + if not self.is_encoder_decoder: raise ValueError("T5Gemma2Model only support encoder-decoder modeling.") - if encoder.text_config.vocab_size != decoder.vocab_size: + if self.encoder.text_config.vocab_size != self.decoder.vocab_size: raise ValueError( "Imbalanced encoder-decoder vocabulary size is not supported in T5Gemma2: " - f"encoder ({encoder.text_config.vocab_size}) vs decoder ({decoder.vocab_size})." + f"encoder ({self.encoder.text_config.vocab_size}) vs decoder ({self.decoder.vocab_size})." ) - # Encoder. - encoder.text_config.dropout_rate = dropout_rate - encoder.text_config.attention_dropout = attention_dropout - encoder.vision_config.attention_dropout = attention_dropout - encoder.image_token_index = image_token_index - self.encoder = encoder - - # Decoder. - decoder.dropout_rate = dropout_rate - decoder.attention_dropout = attention_dropout - self.decoder = decoder - - for special_token_key in ["bos_token_id", "pad_token_id", "eos_token_id", "vocab_size"]: - if special_token_key not in kwargs: - kwargs[special_token_key] = getattr(decoder, special_token_key) - - self.classifier_dropout_rate = classifier_dropout_rate - self.initializer_range = initializer_range - self.eoi_token_index = encoder.eoi_token_index - self.image_token_index = image_token_index - self.tie_word_embeddings = tie_word_embeddings - - super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) - class T5Gemma2RMSNorm(Gemma3RMSNorm): pass diff --git a/src/transformers/models/table_transformer/configuration_table_transformer.py b/src/transformers/models/table_transformer/configuration_table_transformer.py index 9e26ff0ed85c..f0d00b0afd3f 100644 --- a/src/transformers/models/table_transformer/configuration_table_transformer.py +++ b/src/transformers/models/table_transformer/configuration_table_transformer.py @@ -13,16 +13,16 @@ # limitations under the License. 
"""Table Transformer model configuration""" +from huggingface_hub.dataclasses import strict + from ...backbone_utils import consolidate_backbone_kwargs_to_config from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging +from ...utils import auto_docstring from ..auto import AutoConfig -logger = logging.get_logger(__name__) - - @auto_docstring(checkpoint="microsoft/table-transformer-detection") +@strict(accept_kwargs=True) class TableTransformerConfig(PreTrainedConfig): r""" num_queries (`int`, *optional*, defaults to 100): @@ -59,55 +59,53 @@ class TableTransformerConfig(PreTrainedConfig): attribute_map = { "hidden_size": "d_model", "num_attention_heads": "encoder_attention_heads", + "num_hidden_layers": "encoder_layers", } - # Copied from transformers.models.detr.configuration_detr.DetrConfig.__init__ - def __init__( - self, - backbone_config=None, - num_channels=3, - num_queries=100, - encoder_layers=6, - encoder_ffn_dim=2048, - encoder_attention_heads=8, - decoder_layers=6, - decoder_ffn_dim=2048, - decoder_attention_heads=8, - encoder_layerdrop=0.0, - decoder_layerdrop=0.0, - is_encoder_decoder=True, - activation_function="relu", - d_model=256, - dropout=0.1, - attention_dropout=0.0, - activation_dropout=0.0, - init_std=0.02, - init_xavier_std=1.0, - auxiliary_loss=False, - position_embedding_type="sine", - dilation=False, - class_cost=1, - bbox_cost=5, - giou_cost=2, - mask_loss_coefficient=1, - dice_loss_coefficient=1, - bbox_loss_coefficient=5, - giou_loss_coefficient=2, - eos_coefficient=0.1, - **kwargs, - ): + backbone_config: dict | PreTrainedConfig | None = None + num_channels: int = 3 + num_queries: int = 100 + encoder_layers: int = 6 + encoder_ffn_dim: int = 2048 + encoder_attention_heads: int = 8 + decoder_layers: int = 6 + decoder_ffn_dim: int = 2048 + decoder_attention_heads: int = 8 + encoder_layerdrop: float | int = 0.0 + decoder_layerdrop: float | int = 0.0 + is_encoder_decoder: bool = True + activation_function: str = "relu" + d_model: int = 256 + dropout: float | int = 0.1 + attention_dropout: float | int = 0.0 + activation_dropout: float | int = 0.0 + init_std: float = 0.02 + init_xavier_std: float = 1.0 + auxiliary_loss: bool = False + position_embedding_type: str = "sine" + dilation: bool = False + class_cost: int = 1 + bbox_cost: int = 5 + giou_cost: int = 2 + mask_loss_coefficient: int = 1 + dice_loss_coefficient: int = 1 + bbox_loss_coefficient: int = 5 + giou_loss_coefficient: int = 2 + eos_coefficient: float = 0.1 + + def __post_init__(self, **kwargs): backbone_kwargs = kwargs.get("backbone_kwargs", {}) timm_default_kwargs = { - "num_channels": backbone_kwargs.get("num_channels", num_channels), + "num_channels": backbone_kwargs.get("num_channels", self.num_channels), "features_only": True, "use_pretrained_backbone": False, "out_indices": backbone_kwargs.get("out_indices", [1, 2, 3, 4]), } - if dilation: + if self.dilation: timm_default_kwargs["output_stride"] = backbone_kwargs.get("output_stride", 16) - backbone_config, kwargs = consolidate_backbone_kwargs_to_config( - backbone_config=backbone_config, + self.backbone_config, kwargs = consolidate_backbone_kwargs_to_config( + backbone_config=self.backbone_config, default_backbone="resnet50", default_config_type="resnet", default_config_kwargs={"out_features": ["stage4"]}, @@ -115,38 +113,7 @@ def __init__( **kwargs, ) - self.backbone_config = backbone_config - self.num_channels = num_channels - self.num_queries = num_queries - self.d_model = d_model - self.encoder_ffn_dim 
= encoder_ffn_dim - self.encoder_layers = encoder_layers - self.encoder_attention_heads = encoder_attention_heads - self.decoder_ffn_dim = decoder_ffn_dim - self.decoder_layers = decoder_layers - self.decoder_attention_heads = decoder_attention_heads - self.dropout = dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.activation_function = activation_function - self.init_std = init_std - self.init_xavier_std = init_xavier_std - self.encoder_layerdrop = encoder_layerdrop - self.decoder_layerdrop = decoder_layerdrop - self.num_hidden_layers = encoder_layers - self.auxiliary_loss = auxiliary_loss - self.position_embedding_type = position_embedding_type - # Hungarian matcher - self.class_cost = class_cost - self.bbox_cost = bbox_cost - self.giou_cost = giou_cost - # Loss coefficients - self.mask_loss_coefficient = mask_loss_coefficient - self.dice_loss_coefficient = dice_loss_coefficient - self.bbox_loss_coefficient = bbox_loss_coefficient - self.giou_loss_coefficient = giou_loss_coefficient - self.eos_coefficient = eos_coefficient - super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + super().__post_init__(**kwargs) __all__ = ["TableTransformerConfig"] diff --git a/src/transformers/models/table_transformer/modeling_table_transformer.py b/src/transformers/models/table_transformer/modeling_table_transformer.py index 68c66ce8248b..554b8917f067 100644 --- a/src/transformers/models/table_transformer/modeling_table_transformer.py +++ b/src/transformers/models/table_transformer/modeling_table_transformer.py @@ -752,7 +752,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict hidden_states = inputs_embeds hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) @@ -887,7 +887,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if inputs_embeds is not None: hidden_states = inputs_embeds @@ -1061,7 +1061,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict batch_size, num_channels, height, width = pixel_values.shape device = pixel_values.device @@ -1228,7 +1228,7 @@ def forward( ... 
) Detected table with confidence 1.0 at location [202.1, 210.59, 1119.22, 385.09] ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict # First, sent images through TABLE_TRANSFORMER base model to obtain encoder + decoder outputs outputs = self.model( diff --git a/src/transformers/models/tapas/configuration_tapas.py b/src/transformers/models/tapas/configuration_tapas.py index 2d32cbfa850e..1292e95b6f50 100644 --- a/src/transformers/models/tapas/configuration_tapas.py +++ b/src/transformers/models/tapas/configuration_tapas.py @@ -21,11 +21,14 @@ """ +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring @auto_docstring(checkpoint="google/tapas-base-finetuned-sqa") +@strict(accept_kwargs=True) class TapasConfig(PreTrainedConfig): r""" type_vocab_sizes (`list[int]`, *optional*, defaults to `[3, 256, 256, 2, 256, 256, 10]`): @@ -100,104 +103,53 @@ class TapasConfig(PreTrainedConfig): model_type = "tapas" - def __init__( - self, - vocab_size=30522, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=1024, - type_vocab_sizes=[3, 256, 256, 2, 256, 256, 10], - initializer_range=0.02, - layer_norm_eps=1e-12, - pad_token_id=0, - bos_token_id=None, - eos_token_id=None, - positive_label_weight=10.0, - num_aggregation_labels=0, - aggregation_loss_weight=1.0, - use_answer_as_supervision=None, - answer_loss_importance=1.0, - use_normalized_answer_loss=False, - huber_loss_delta=None, - temperature=1.0, - aggregation_temperature=1.0, - use_gumbel_for_cells=False, - use_gumbel_for_aggregation=False, - average_approximation_function="ratio", - cell_selection_preference=None, - answer_loss_cutoff=None, - max_num_rows=64, - max_num_columns=32, - average_logits_per_cell=False, - select_one_column=True, - allow_empty_column_selection=False, - init_cell_selection_weights_to_zero=False, - reset_position_index_per_cell=True, - disable_per_token_loss=False, - aggregation_labels=None, - no_aggregation_label_index=None, - is_decoder=False, - add_cross_attention=False, - tie_word_embeddings=True, - **kwargs, - ): - super().__init__(**kwargs) - - # BERT hyperparameters (with updated max_position_embeddings and type_vocab_sizes) - self.is_decoder = is_decoder - self.add_cross_attention = add_cross_attention - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_sizes = type_vocab_sizes - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - - # Fine-tuning task hyperparameters - self.positive_label_weight = positive_label_weight - self.num_aggregation_labels = num_aggregation_labels - self.aggregation_loss_weight = aggregation_loss_weight - self.use_answer_as_supervision = use_answer_as_supervision - 
self.answer_loss_importance = answer_loss_importance - self.use_normalized_answer_loss = use_normalized_answer_loss - self.huber_loss_delta = huber_loss_delta - self.temperature = temperature - self.aggregation_temperature = aggregation_temperature - self.use_gumbel_for_cells = use_gumbel_for_cells - self.use_gumbel_for_aggregation = use_gumbel_for_aggregation - self.average_approximation_function = average_approximation_function - self.cell_selection_preference = cell_selection_preference - self.answer_loss_cutoff = answer_loss_cutoff - self.max_num_rows = max_num_rows - self.max_num_columns = max_num_columns - self.average_logits_per_cell = average_logits_per_cell - self.select_one_column = select_one_column - self.allow_empty_column_selection = allow_empty_column_selection - self.init_cell_selection_weights_to_zero = init_cell_selection_weights_to_zero - self.reset_position_index_per_cell = reset_position_index_per_cell - self.disable_per_token_loss = disable_per_token_loss - - # Aggregation hyperparameters - self.aggregation_labels = aggregation_labels - self.no_aggregation_label_index = no_aggregation_label_index - + vocab_size: int = 30522 + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.1 + attention_probs_dropout_prob: float = 0.1 + max_position_embeddings: int = 1024 + type_vocab_sizes: list[int] | tuple[int, ...] = (3, 256, 256, 2, 256, 256, 10) + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + pad_token_id: int | None = 0 + bos_token_id: int | None = None + eos_token_id: int | list[int] | None = None + positive_label_weight: float = 10.0 + num_aggregation_labels: int = 0 + aggregation_loss_weight: float = 1.0 + use_answer_as_supervision: bool | None = None + answer_loss_importance: float = 1.0 + use_normalized_answer_loss: bool = False + huber_loss_delta: float | None = None + temperature: float = 1.0 + aggregation_temperature: float = 1.0 + use_gumbel_for_cells: bool = False + use_gumbel_for_aggregation: bool = False + average_approximation_function: str = "ratio" + cell_selection_preference: float | None = None + answer_loss_cutoff: float | int | None = None + max_num_rows: int = 64 + max_num_columns: int = 32 + average_logits_per_cell: bool = False + select_one_column: bool = True + allow_empty_column_selection: bool = False + init_cell_selection_weights_to_zero: bool = False + reset_position_index_per_cell: bool = True + disable_per_token_loss: bool = False + aggregation_labels: dict | None = None + no_aggregation_label_index: int | None = None + is_decoder: bool = False + add_cross_attention: bool = False + tie_word_embeddings: bool = True + + def __post_init__(self, **kwargs): if isinstance(self.aggregation_labels, dict): - self.aggregation_labels = {int(k): v for k, v in aggregation_labels.items()} + self.aggregation_labels = {int(k): v for k, v in self.aggregation_labels.items()} + super().__post_init__(**kwargs) __all__ = ["TapasConfig"] diff --git a/src/transformers/models/tapas/modeling_tapas.py b/src/transformers/models/tapas/modeling_tapas.py index 5d0a98b6b8b6..279056d5b37f 100644 --- a/src/transformers/models/tapas/modeling_tapas.py +++ b/src/transformers/models/tapas/modeling_tapas.py @@ -611,7 +611,7 @@ class for more info. 
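As a note on the `TapasConfig.__post_init__` above: the `{int(k): v ...}` re-keying of `aggregation_labels` is relevant because JSON serialization turns integer dict keys into strings, so a config reloaded from disk or the Hub would otherwise carry string keys. A small self-contained illustration using only the standard library:

```python
import json

aggregation_labels = {0: "NONE", 1: "SUM", 2: "AVERAGE", 3: "COUNT"}

# Round-tripping through JSON (as config serialization does) stringifies the keys...
restored = json.loads(json.dumps(aggregation_labels))
print(list(restored))                    # ['0', '1', '2', '3']

# ...so __post_init__ normalizes them back to ints.
normalized = {int(k): v for k, v in restored.items()}
print(normalized == aggregation_labels)  # True
```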
output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -757,7 +757,7 @@ class for more info. >>> outputs = model(**inputs, labels=labels) >>> logits = outputs.logits ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.tapas( input_ids, @@ -900,7 +900,7 @@ class for more info. >>> logits = outputs.logits >>> logits_aggregation = outputs.logits_aggregation ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.tapas( input_ids, @@ -1209,7 +1209,7 @@ class for more info. >>> loss = outputs.loss >>> logits = outputs.logits ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.tapas( input_ids, diff --git a/src/transformers/models/textnet/configuration_textnet.py b/src/transformers/models/textnet/configuration_textnet.py index 5f3f7888c933..a71463f9b12a 100644 --- a/src/transformers/models/textnet/configuration_textnet.py +++ b/src/transformers/models/textnet/configuration_textnet.py @@ -13,15 +13,15 @@ # limitations under the License. """TextNet model configuration""" +from huggingface_hub.dataclasses import strict + from ...backbone_utils import BackboneConfigMixin from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - - -logger = logging.get_logger(__name__) +from ...utils import auto_docstring @auto_docstring(checkpoint="czczup/textnet-base") +@strict(accept_kwargs=True) class TextNetConfig(BackboneConfigMixin, PreTrainedConfig): r""" stem_kernel_size (`int`, *optional*, defaults to 3): @@ -62,52 +62,37 @@ class TextNetConfig(BackboneConfigMixin, PreTrainedConfig): model_type = "textnet" - def __init__( - self, - stem_kernel_size=3, - stem_stride=2, - stem_num_channels=3, - stem_out_channels=64, - stem_act_func="relu", - image_size=[640, 640], - conv_layer_kernel_sizes=None, - conv_layer_strides=None, - hidden_sizes=[64, 64, 128, 256, 512], - batch_norm_eps=1e-5, - initializer_range=0.02, - out_features=None, - out_indices=None, - **kwargs, - ): - super().__init__(**kwargs) - - if conv_layer_kernel_sizes is None: - conv_layer_kernel_sizes = [ + stem_kernel_size: int = 3 + stem_stride: int = 2 + stem_num_channels: int = 3 + stem_out_channels: int = 64 + stem_act_func: str = "relu" + image_size: list[int] | tuple[int, int] | int = (640, 640) + conv_layer_kernel_sizes: list | None = None + conv_layer_strides: list | None = None + hidden_sizes: list[int] | tuple[int, ...] 
= (64, 64, 128, 256, 512) + batch_norm_eps: float = 1e-5 + initializer_range: float = 0.02 + _out_features: list[str] | None = None + _out_indices: list[int] | None = None + + def __post_init__(self, **kwargs): + if self.conv_layer_kernel_sizes is None: + self.conv_layer_kernel_sizes = [ [[3, 3], [3, 3], [3, 3]], [[3, 3], [1, 3], [3, 3], [3, 1]], [[3, 3], [3, 3], [3, 1], [1, 3]], [[3, 3], [3, 1], [1, 3], [3, 3]], ] - if conv_layer_strides is None: - conv_layer_strides = [[1, 2, 1], [2, 1, 1, 1], [2, 1, 1, 1], [2, 1, 1, 1]] - - self.stem_kernel_size = stem_kernel_size - self.stem_stride = stem_stride - self.stem_num_channels = stem_num_channels - self.stem_out_channels = stem_out_channels - self.stem_act_func = stem_act_func - - self.image_size = image_size - self.conv_layer_kernel_sizes = conv_layer_kernel_sizes - self.conv_layer_strides = conv_layer_strides - - self.initializer_range = initializer_range - self.hidden_sizes = hidden_sizes - self.batch_norm_eps = batch_norm_eps + if self.conv_layer_strides is None: + self.conv_layer_strides = [[1, 2, 1], [2, 1, 1, 1], [2, 1, 1, 1], [2, 1, 1, 1]] self.depths = [len(layer) for layer in self.conv_layer_kernel_sizes] self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, 5)] - self.set_output_features_output_indices(out_indices=out_indices, out_features=out_features) + self.set_output_features_output_indices( + out_indices=kwargs.pop("out_indices", None), out_features=kwargs.pop("out_features", None) + ) + super().__post_init__(**kwargs) __all__ = ["TextNetConfig"] diff --git a/src/transformers/models/textnet/modeling_textnet.py b/src/transformers/models/textnet/modeling_textnet.py index ecc2c6cc271a..e18619dd7953 100644 --- a/src/transformers/models/textnet/modeling_textnet.py +++ b/src/transformers/models/textnet/modeling_textnet.py @@ -237,7 +237,7 @@ def forward( return_dict: bool | None = None, **kwargs, ) -> tuple[Any, list[Any]] | tuple[Any] | BaseModelOutputWithPoolingAndNoAttention: - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) @@ -319,7 +319,7 @@ def forward( >>> outputs.logits.shape torch.Size([1, 2]) ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.textnet(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict) last_hidden_state = outputs[0] @@ -386,7 +386,7 @@ def forward( >>> with torch.no_grad(): >>> outputs = model(**inputs) ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) diff --git a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py index f5f8e2faad58..8c41d49ff838 100644 --- a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py @@ -13,14 +13,14 @@ # limitations under the License. 
"""Time Series Transformer model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="huggingface/time-series-transformer-tourism-monthly") +@strict(accept_kwargs=True) class TimeSeriesTransformerConfig(PreTrainedConfig): r""" prediction_length (`int`): @@ -80,92 +80,61 @@ class TimeSeriesTransformerConfig(PreTrainedConfig): "num_hidden_layers": "encoder_layers", } - def __init__( - self, - prediction_length: int | None = None, - context_length: int | None = None, - distribution_output: str = "student_t", - loss: str = "nll", - input_size: int = 1, - lags_sequence: list[int] = [1, 2, 3, 4, 5, 6, 7], - scaling: str | bool | None = "mean", - num_dynamic_real_features: int = 0, - num_static_categorical_features: int = 0, - num_static_real_features: int = 0, - num_time_features: int = 0, - cardinality: list[int] | None = None, - embedding_dimension: list[int] | None = None, - encoder_ffn_dim: int = 32, - decoder_ffn_dim: int = 32, - encoder_attention_heads: int = 2, - decoder_attention_heads: int = 2, - encoder_layers: int = 2, - decoder_layers: int = 2, - is_encoder_decoder: bool = True, - activation_function: str = "gelu", - d_model: int = 64, - dropout: float = 0.1, - encoder_layerdrop: float = 0.1, - decoder_layerdrop: float = 0.1, - attention_dropout: float = 0.1, - activation_dropout: float = 0.1, - num_parallel_samples: int = 100, - init_std: float = 0.02, - use_cache=True, - **kwargs, - ): - # time series specific configuration - self.prediction_length = prediction_length - self.context_length = context_length or prediction_length - self.distribution_output = distribution_output - self.loss = loss - self.input_size = input_size - self.num_time_features = num_time_features - self.lags_sequence = lags_sequence - self.scaling = scaling - self.num_dynamic_real_features = num_dynamic_real_features - self.num_static_real_features = num_static_real_features - self.num_static_categorical_features = num_static_categorical_features - if cardinality and num_static_categorical_features > 0: - if len(cardinality) != num_static_categorical_features: + prediction_length: int | None = None + context_length: int | None = None + distribution_output: str = "student_t" + loss: str = "nll" + input_size: int = 1 + lags_sequence: list[int] | tuple[int, ...] 
= (1, 2, 3, 4, 5, 6, 7) + scaling: str | bool | None = "mean" + num_dynamic_real_features: int = 0 + num_static_categorical_features: int = 0 + num_static_real_features: int = 0 + num_time_features: int = 0 + cardinality: list[int] | None = None + embedding_dimension: list[int] | None = None + encoder_ffn_dim: int = 32 + decoder_ffn_dim: int = 32 + encoder_attention_heads: int = 2 + decoder_attention_heads: int = 2 + encoder_layers: int = 2 + decoder_layers: int = 2 + is_encoder_decoder: bool = True + activation_function: str = "gelu" + d_model: int = 64 + dropout: float | int = 0.1 + encoder_layerdrop: float | int = 0.1 + decoder_layerdrop: float | int = 0.1 + attention_dropout: float | int = 0.1 + activation_dropout: float | int = 0.1 + num_parallel_samples: int = 100 + init_std: float = 0.02 + use_cache: bool = True + + def __post_init__(self, **kwargs): + if not (self.cardinality and self.num_static_categorical_features > 0): + self.cardinality = [0] + + if not (self.embedding_dimension and self.num_static_categorical_features > 0): + self.embedding_dimension = [min(50, (cat + 1) // 2) for cat in self.cardinality] + + self.context_length = self.context_length or self.prediction_length + self.feature_size = self.input_size * len(self.lags_sequence) + self._number_of_features + super().__post_init__(**kwargs) + + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + if self.cardinality and self.num_static_categorical_features > 0: + if len(self.cardinality) != self.num_static_categorical_features: raise ValueError( "The cardinality should be a list of the same length as `num_static_categorical_features`" ) - self.cardinality = cardinality - else: - self.cardinality = [0] - if embedding_dimension and num_static_categorical_features > 0: - if len(embedding_dimension) != num_static_categorical_features: + + if self.embedding_dimension and self.num_static_categorical_features > 0: + if len(self.embedding_dimension) != self.num_static_categorical_features: raise ValueError( "The embedding dimension should be a list of the same length as `num_static_categorical_features`" ) - self.embedding_dimension = embedding_dimension - else: - self.embedding_dimension = [min(50, (cat + 1) // 2) for cat in self.cardinality] - self.num_parallel_samples = num_parallel_samples - - # Transformer architecture configuration - self.feature_size = input_size * len(lags_sequence) + self._number_of_features - self.d_model = d_model - self.encoder_attention_heads = encoder_attention_heads - self.decoder_attention_heads = decoder_attention_heads - self.encoder_ffn_dim = encoder_ffn_dim - self.decoder_ffn_dim = decoder_ffn_dim - self.encoder_layers = encoder_layers - self.decoder_layers = decoder_layers - - self.dropout = dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.encoder_layerdrop = encoder_layerdrop - self.decoder_layerdrop = decoder_layerdrop - - self.activation_function = activation_function - self.init_std = init_std - - self.use_cache = use_cache - - super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) @property def _number_of_features(self) -> int: diff --git a/src/transformers/models/timesfm/configuration_timesfm.py b/src/transformers/models/timesfm/configuration_timesfm.py index 9dcd70f98c62..225b57451d8e 100644 --- a/src/transformers/models/timesfm/configuration_timesfm.py +++ b/src/transformers/models/timesfm/configuration_timesfm.py @@ -13,14 +13,14 @@ # 
limitations under the License. """TimesFM model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="google/timesfm-2.0-500m-pytorch") +@strict(accept_kwargs=True) class TimesFmConfig(PreTrainedConfig): r""" patch_length (`int`, *optional*, defaults to 32): @@ -53,49 +53,24 @@ class TimesFmConfig(PreTrainedConfig): keys_to_ignore_at_inference = [] is_encoder_decoder = False - def __init__( - self, - patch_length: int = 32, - context_length: int = 512, - horizon_length: int = 128, - freq_size: int = 3, - num_hidden_layers: int = 50, - hidden_size: int = 1280, - intermediate_size: int = 1280, - head_dim: int = 80, - num_attention_heads: int = 16, - tolerance: float = 1e-6, - rms_norm_eps: float = 1e-6, - quantiles: list[float] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9], - pad_val: float = 1123581321.0, - attention_dropout: float = 0.0, - use_positional_embedding: bool = False, - initializer_range: float = 0.02, - min_timescale: int = 1, - max_timescale: int = 10_000, - **kwargs, - ): - self.patch_length = patch_length - self.context_length = context_length - self.horizon_length = horizon_length - self.quantiles = quantiles - self.pad_val = pad_val - self.freq_size = freq_size - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.head_dim = head_dim - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.tolerance = tolerance - self.rms_norm_eps = rms_norm_eps - self.attention_dropout = attention_dropout - self.use_positional_embedding = use_positional_embedding - self.initializer_range = initializer_range - self.min_timescale = min_timescale - self.max_timescale = max_timescale - - kwargs["is_encoder_decoder"] = self.is_encoder_decoder - super().__init__(**kwargs) + patch_length: int = 32 + context_length: int = 512 + horizon_length: int = 128 + freq_size: int = 3 + num_hidden_layers: int = 50 + hidden_size: int = 1280 + intermediate_size: int = 1280 + head_dim: int = 80 + num_attention_heads: int = 16 + tolerance: float = 1e-6 + rms_norm_eps: float = 1e-6 + quantiles: list[float] | tuple[float, ...] 
= (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9) + pad_val: float = 1123581321.0 + attention_dropout: float | int = 0.0 + use_positional_embedding: bool = False + initializer_range: float = 0.02 + min_timescale: int = 1 + max_timescale: int = 10_000 __all__ = ["TimesFmConfig"] diff --git a/src/transformers/models/timesfm/modeling_timesfm.py b/src/transformers/models/timesfm/modeling_timesfm.py index 646cc1d387d4..219908c1e47c 100644 --- a/src/transformers/models/timesfm/modeling_timesfm.py +++ b/src/transformers/models/timesfm/modeling_timesfm.py @@ -190,7 +190,7 @@ def simple_eager_attention_forward( value_states: torch.Tensor, attention_mask: torch.Tensor | None, scaling: float, - dropout: float = 0.0, + dropout: float | int = 0.0, **kwargs: Unpack[TransformersKwargs], ): attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * scaling diff --git a/src/transformers/models/timesfm2_5/configuration_timesfm2_5.py b/src/transformers/models/timesfm2_5/configuration_timesfm2_5.py index 8faa7acba1b6..2686f67570bd 100644 --- a/src/transformers/models/timesfm2_5/configuration_timesfm2_5.py +++ b/src/transformers/models/timesfm2_5/configuration_timesfm2_5.py @@ -18,12 +18,15 @@ # See the License for the specific language governing permissions and # limitations under the License. +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring @auto_docstring(checkpoint="google/timesfm-2.5-200m-transformers") +@strict(accept_kwargs=True) class TimesFm2_5Config(PreTrainedConfig): r""" patch_length (`int`, *optional*, defaults to 32): @@ -62,59 +65,30 @@ class TimesFm2_5Config(PreTrainedConfig): keys_to_ignore_at_inference = [] is_encoder_decoder = False - def __init__( - self, - patch_length: int = 32, - context_length: int = 16384, - horizon_length: int = 128, - quantiles: list = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9], - hidden_size: int = 1280, - intermediate_size: int = 1280, - head_dim: int = 80, - num_attention_heads: int = 16, - num_key_value_heads: int = 16, - num_hidden_layers: int = 20, - rms_norm_eps: float = 1e-6, - attention_dropout: float = 0.0, - attention_bias: bool = False, - initializer_range: float = 0.02, - output_quantile_len: int = 1024, - decode_index: int = 5, - use_bias: bool = False, - activation: str = "swish", - use_continuous_quantile_head: bool = True, - force_flip_invariance: bool = True, - infer_is_positive: bool = True, - max_position_embeddings: int = 16384, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - **kwargs, - ): - self.num_key_value_heads = num_key_value_heads - self.attention_bias = attention_bias - self.output_quantile_len = output_quantile_len - self.decode_index = decode_index - self.use_bias = use_bias - self.activation = activation - self.use_continuous_quantile_head = use_continuous_quantile_head - self.force_flip_invariance = force_flip_invariance - self.infer_is_positive = infer_is_positive - self.max_position_embeddings = max_position_embeddings - self.rope_parameters = rope_parameters - self.patch_length = patch_length - self.context_length = context_length - self.horizon_length = horizon_length - self.quantiles = quantiles - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.head_dim = head_dim - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.rms_norm_eps = rms_norm_eps - 
self.attention_dropout = attention_dropout - self.initializer_range = initializer_range + patch_length: int = 32 - kwargs["is_encoder_decoder"] = self.is_encoder_decoder - super().__init__(**kwargs) + context_length: int = 16384 + horizon_length: int = 128 + num_hidden_layers: int = 20 + hidden_size: int = 1280 + intermediate_size: int = 1280 + head_dim: int = 80 + num_attention_heads: int = 16 + rms_norm_eps: float = 1e-6 + quantiles: list[float] | tuple[float, ...] = (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9) + attention_dropout: float | int = 0.0 + initializer_range: float = 0.02 + num_key_value_heads: int = 16 + attention_bias: bool = False + output_quantile_len: int = 1024 + decode_index: int = 5 + use_bias: bool = False + activation: str = "swish" + use_continuous_quantile_head: bool = True + force_flip_invariance: bool = True + infer_is_positive: bool = True + max_position_embeddings: int = 16384 + rope_parameters: RopeParameters | dict | None = None __all__ = ["TimesFm2_5Config"] diff --git a/src/transformers/models/timesfm2_5/modular_timesfm2_5.py b/src/transformers/models/timesfm2_5/modular_timesfm2_5.py index 51db5c2f62e6..0221592956d4 100644 --- a/src/transformers/models/timesfm2_5/modular_timesfm2_5.py +++ b/src/transformers/models/timesfm2_5/modular_timesfm2_5.py @@ -19,6 +19,7 @@ import torch import torch.nn as nn import torch.nn.functional as F +from huggingface_hub.dataclasses import strict from ...activations import ACT2FN from ...masking_utils import create_causal_mask @@ -51,6 +52,7 @@ @auto_docstring(checkpoint="google/timesfm-2.5-200m-transformers") +@strict(accept_kwargs=True) class TimesFm2_5Config(TimesFmConfig): r""" patch_length (`int`, *optional*, defaults to 32): @@ -85,74 +87,26 @@ class TimesFm2_5Config(TimesFmConfig): ``` """ - model_type = "timesfm2_5" - - def __init__( - self, - patch_length: int = 32, - context_length: int = 16384, - horizon_length: int = 128, - quantiles: list = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9], - hidden_size: int = 1280, - intermediate_size: int = 1280, - head_dim: int = 80, - num_attention_heads: int = 16, - num_key_value_heads: int = 16, - num_hidden_layers: int = 20, - rms_norm_eps: float = 1e-6, - attention_dropout: float = 0.0, - attention_bias: bool = False, - initializer_range: float = 0.02, - output_quantile_len: int = 1024, - decode_index: int = 5, - use_bias: bool = False, - activation: str = "swish", - use_continuous_quantile_head: bool = True, - force_flip_invariance: bool = True, - infer_is_positive: bool = True, - max_position_embeddings: int = 16384, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - **kwargs, - ): - self.num_key_value_heads = num_key_value_heads - self.attention_bias = attention_bias - self.output_quantile_len = output_quantile_len - self.decode_index = decode_index - self.use_bias = use_bias - self.activation = activation - self.use_continuous_quantile_head = use_continuous_quantile_head - self.force_flip_invariance = force_flip_invariance - self.infer_is_positive = infer_is_positive - self.max_position_embeddings = max_position_embeddings - self.rope_parameters = rope_parameters - - super().__init__( - patch_length=patch_length, - context_length=context_length, - horizon_length=horizon_length, - quantiles=quantiles, - hidden_size=hidden_size, - intermediate_size=intermediate_size, - head_dim=head_dim, - num_attention_heads=num_attention_heads, - num_key_value_heads=num_key_value_heads, - rms_norm_eps=rms_norm_eps, - attention_dropout=attention_dropout, - 
attention_bias=attention_bias, - initializer_range=initializer_range, - num_hidden_layers=num_hidden_layers, - use_positional_embedding=False, - **kwargs, - ) - # Delete inherited attributes that TimesFM 2.5 does not use - del self.freq_size - del self.pad_val - del self.tolerance - del self.normalize_inputs - del self.use_positional_embedding - del self.use_rotary_embeddings - del self.min_timescale - del self.max_timescale + context_length: int = 16384 + num_key_value_heads: int = 16 + num_hidden_layers: int = 20 + attention_bias: bool = False + output_quantile_len: int = 1024 + decode_index: int = 5 + use_bias: bool = False + activation: str = "swish" + use_continuous_quantile_head: bool = True + force_flip_invariance: bool = True + infer_is_positive: bool = True + max_position_embeddings: int = 16384 + rope_parameters: RopeParameters | dict | None = None + + freq_size = AttributeError() + pad_val = AttributeError() + tolerance = AttributeError() + use_positional_embedding = AttributeError() + min_timescale = AttributeError() + max_timescale = AttributeError() @dataclass diff --git a/src/transformers/models/timesformer/configuration_timesformer.py b/src/transformers/models/timesformer/configuration_timesformer.py index a4498c855762..1ceb2fbdd7b2 100644 --- a/src/transformers/models/timesformer/configuration_timesformer.py +++ b/src/transformers/models/timesformer/configuration_timesformer.py @@ -13,14 +13,14 @@ # limitations under the License. """TimeSformer model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="facebook/timesformer-base-finetuned-k600") +@strict(accept_kwargs=True) class TimesformerConfig(PreTrainedConfig): r""" num_frames (`int`, *optional*, defaults to 8): @@ -45,46 +45,22 @@ class TimesformerConfig(PreTrainedConfig): model_type = "timesformer" - def __init__( - self, - image_size=224, - patch_size=16, - num_channels=3, - num_frames=8, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - initializer_range=0.02, - layer_norm_eps=1e-6, - qkv_bias=True, - attention_type="divided_space_time", - drop_path_rate=0, - **kwargs, - ): - super().__init__(**kwargs) - - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.num_frames = num_frames - - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.qkv_bias = qkv_bias - - self.attention_type = attention_type - self.drop_path_rate = drop_path_rate + image_size: int | list[int] | tuple[int, int] = 224 + patch_size: int | list[int] | tuple[int, int] = 16 + num_channels: int = 3 + num_frames: int = 8 + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.0 + attention_probs_dropout_prob: float = 0.0 + 
initializer_range: float = 0.02 + layer_norm_eps: float = 1e-6 + qkv_bias: bool = True + attention_type: str = "divided_space_time" + drop_path_rate: int = 0 __all__ = ["TimesformerConfig"] diff --git a/src/transformers/models/timesformer/modeling_timesformer.py b/src/transformers/models/timesformer/modeling_timesformer.py index 0f42f7427538..4ee3378471bc 100644 --- a/src/transformers/models/timesformer/modeling_timesformer.py +++ b/src/transformers/models/timesformer/modeling_timesformer.py @@ -571,7 +571,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict embedding_output = self.embeddings(pixel_values) @@ -706,7 +706,7 @@ def forward( >>> print(model.config.id2label[predicted_label]) eating spaghetti ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.timesformer( pixel_values, diff --git a/src/transformers/models/timm_backbone/configuration_timm_backbone.py b/src/transformers/models/timm_backbone/configuration_timm_backbone.py index ac49c90cab14..357298f1fd96 100644 --- a/src/transformers/models/timm_backbone/configuration_timm_backbone.py +++ b/src/transformers/models/timm_backbone/configuration_timm_backbone.py @@ -14,15 +14,15 @@ """Configuration for Backbone models""" +from huggingface_hub.dataclasses import strict + from ...backbone_utils import BackboneConfigMixin from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - - -logger = logging.get_logger(__name__) +from ...utils import auto_docstring @auto_docstring(checkpoint="") +@strict(accept_kwargs=True) class TimmBackboneConfig(BackboneConfigMixin, PreTrainedConfig): r""" backbone (`str`, *optional*): @@ -49,24 +49,16 @@ class TimmBackboneConfig(BackboneConfigMixin, PreTrainedConfig): model_type = "timm_backbone" - def __init__( - self, - backbone=None, - num_channels=3, - features_only=True, - out_indices=None, - freeze_batch_norm_2d=False, - output_stride=None, - **kwargs, - ): - self.backbone = backbone - self.num_channels = num_channels - self.features_only = features_only - self.out_indices = out_indices if out_indices is not None else [-1] - self.output_stride = output_stride - self.freeze_batch_norm_2d = freeze_batch_norm_2d - - super().__init__(**kwargs) + backbone: str | None = None + num_channels: int = 3 + features_only: bool = True + _out_indices: list[int] | None = None + freeze_batch_norm_2d: bool = False + output_stride: int | None = None + + def __post_init__(self, **kwargs): + self.out_indices = self.out_indices if self.out_indices is not None else [-1] + super().__post_init__(**kwargs) @property def out_indices(self): diff --git a/src/transformers/models/timm_backbone/modeling_timm_backbone.py b/src/transformers/models/timm_backbone/modeling_timm_backbone.py index ca5a848626f8..c60606a7657e 100644 --- a/src/transformers/models/timm_backbone/modeling_timm_backbone.py +++ b/src/transformers/models/timm_backbone/modeling_timm_backbone.py @@ -127,7 +127,7 @@ def forward( return_dict: bool | None = None, **kwargs, ) -> BackboneOutput | tuple[Tensor, ...]: - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if 
return_dict is not None else self.config.return_dict output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) diff --git a/src/transformers/models/timm_wrapper/configuration_timm_wrapper.py b/src/transformers/models/timm_wrapper/configuration_timm_wrapper.py index 491daef242f6..d1717b7ed491 100644 --- a/src/transformers/models/timm_wrapper/configuration_timm_wrapper.py +++ b/src/transformers/models/timm_wrapper/configuration_timm_wrapper.py @@ -16,18 +16,18 @@ from typing import Any +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, is_timm_available, logging, requires_backends +from ...utils import auto_docstring, is_timm_available, requires_backends if is_timm_available(): from timm.data import ImageNetInfo, infer_imagenet_subset -logger = logging.get_logger(__name__) - - @auto_docstring(checkpoint="resnet50") +@strict(accept_kwargs=True) class TimmWrapperConfig(PreTrainedConfig): r""" architecture (`str`, *optional*, defaults to `"resnet50"`): @@ -52,19 +52,10 @@ class TimmWrapperConfig(PreTrainedConfig): model_type = "timm_wrapper" - def __init__( - self, - architecture: str = "resnet50", - initializer_range: float = 0.02, - do_pooling: bool = True, - model_args: dict[str, Any] | None = None, - **kwargs, - ): - self.architecture = architecture - self.initializer_range = initializer_range - self.do_pooling = do_pooling - self.model_args = model_args # named "model_args" for BC with timm - super().__init__(**kwargs) + architecture: str = "resnet50" + initializer_range: float = 0.02 + do_pooling: bool = True + model_args: dict[str, Any] | None = None @classmethod def from_dict(cls, config_dict: dict[str, Any], **kwargs): diff --git a/src/transformers/models/timm_wrapper/modeling_timm_wrapper.py b/src/transformers/models/timm_wrapper/modeling_timm_wrapper.py index d03a3b1b276d..e49f3d77011f 100644 --- a/src/transformers/models/timm_wrapper/modeling_timm_wrapper.py +++ b/src/transformers/models/timm_wrapper/modeling_timm_wrapper.py @@ -225,7 +225,7 @@ def forward( >>> last_hidden_state = outputs.last_hidden_state ``` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) @@ -352,7 +352,7 @@ def forward( >>> top5_probabilities, top5_class_indices = torch.topk(logits.softmax(dim=1) * 100, k=5) ``` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) diff --git a/src/transformers/models/trocr/configuration_trocr.py b/src/transformers/models/trocr/configuration_trocr.py index 8b32ebac6569..014921933c7e 100644 --- a/src/transformers/models/trocr/configuration_trocr.py +++ b/src/transformers/models/trocr/configuration_trocr.py @@ -13,14 +13,14 @@ # limitations under the License. 
"""TrOCR model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="microsoft/trocr-base-handwritten") +@strict(accept_kwargs=True) class TrOCRConfig(PreTrainedConfig): r""" use_learned_position_embeddings (`bool`, *optional*, defaults to `True`): @@ -51,58 +51,29 @@ class TrOCRConfig(PreTrainedConfig): "num_hidden_layers": "decoder_layers", } - def __init__( - self, - vocab_size=50265, - d_model=1024, - decoder_layers=12, - decoder_attention_heads=16, - decoder_ffn_dim=4096, - activation_function="gelu", - max_position_embeddings=512, - dropout=0.1, - attention_dropout=0.0, - activation_dropout=0.0, - decoder_start_token_id=2, - init_std=0.02, - decoder_layerdrop=0.0, - use_cache=True, - scale_embedding=False, - use_learned_position_embeddings=True, - layernorm_embedding=True, - pad_token_id=1, - bos_token_id=0, - eos_token_id=2, - cross_attention_hidden_size=None, - is_decoder=False, - tie_word_embeddings=True, - **kwargs, - ): - self.cross_attention_hidden_size = cross_attention_hidden_size - self.is_decoder = is_decoder - self.tie_word_embeddings = tie_word_embeddings - self.vocab_size = vocab_size - self.d_model = d_model - self.decoder_layers = decoder_layers - self.decoder_attention_heads = decoder_attention_heads - self.decoder_ffn_dim = decoder_ffn_dim - self.activation_function = activation_function - self.max_position_embeddings = max_position_embeddings - self.dropout = dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.init_std = init_std - self.decoder_layerdrop = decoder_layerdrop - self.use_cache = use_cache - self.scale_embedding = scale_embedding - self.use_learned_position_embeddings = use_learned_position_embeddings - self.layernorm_embedding = layernorm_embedding - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.decoder_start_token_id = decoder_start_token_id - - super().__init__(**kwargs) + vocab_size: int = 50265 + d_model: int = 1024 + decoder_layers: int = 12 + decoder_attention_heads: int = 16 + decoder_ffn_dim: int = 4096 + activation_function: str = "gelu" + max_position_embeddings: int = 512 + dropout: float | int = 0.1 + attention_dropout: float | int = 0.0 + activation_dropout: float | int = 0.0 + decoder_start_token_id: int = 2 + init_std: float = 0.02 + decoder_layerdrop: float | int = 0.0 + use_cache: bool = True + scale_embedding: bool = False + use_learned_position_embeddings: bool = True + layernorm_embedding: bool = True + pad_token_id: int | None = 1 + bos_token_id: int | None = 0 + eos_token_id: int | None = 2 + cross_attention_hidden_size: int | None = None + is_decoder: bool = False + tie_word_embeddings: bool = True __all__ = ["TrOCRConfig"] diff --git a/src/transformers/models/trocr/modeling_trocr.py b/src/transformers/models/trocr/modeling_trocr.py index 12cbf5c254ed..31a1dae3cf73 100644 --- a/src/transformers/models/trocr/modeling_trocr.py +++ b/src/transformers/models/trocr/modeling_trocr.py @@ -509,7 +509,7 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else 
self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict # retrieve input_ids and inputs_embeds if input_ids is not None and inputs_embeds is not None: @@ -751,7 +751,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) outputs = self.model.decoder( diff --git a/src/transformers/models/tvp/configuration_tvp.py b/src/transformers/models/tvp/configuration_tvp.py index 4fa06822ba41..0019fb885d9d 100644 --- a/src/transformers/models/tvp/configuration_tvp.py +++ b/src/transformers/models/tvp/configuration_tvp.py @@ -13,16 +13,16 @@ # limitations under the License. """TVP model configuration""" +from huggingface_hub.dataclasses import strict + from ...backbone_utils import consolidate_backbone_kwargs_to_config from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging +from ...utils import auto_docstring from ..auto import AutoConfig -logger = logging.get_logger(__name__) - - @auto_docstring(checkpoint="Intel/tvp-base") +@strict(accept_kwargs=True) class TvpConfig(PreTrainedConfig): r""" distance_loss_weight (`float`, *optional*, defaults to 1.0): @@ -53,65 +53,39 @@ class TvpConfig(PreTrainedConfig): model_type = "tvp" sub_configs = {"backbone_config": AutoConfig} - def __init__( - self, - backbone_config=None, - distance_loss_weight=1.0, - duration_loss_weight=0.1, - visual_prompter_type="framepad", - visual_prompter_apply="replace", - visual_prompt_size=96, - max_img_size=448, - num_frames=48, - vocab_size=30522, - type_vocab_size=2, - hidden_size=768, - intermediate_size=3072, - num_hidden_layers=12, - num_attention_heads=12, - max_position_embeddings=512, - max_grid_col_position_embeddings=100, - max_grid_row_position_embeddings=100, - hidden_dropout_prob=0.1, - hidden_act="gelu", - layer_norm_eps=1e-12, - initializer_range=0.02, - attention_probs_dropout_prob=0.1, - pad_token_id=None, - **kwargs, - ): - backbone_config, kwargs = consolidate_backbone_kwargs_to_config( - backbone_config=backbone_config, + backbone_config: dict | PreTrainedConfig | None = None + distance_loss_weight: float = 1.0 + duration_loss_weight: float = 0.1 + visual_prompter_type: str = "framepad" + visual_prompter_apply: str = "replace" + visual_prompt_size: int = 96 + max_img_size: int = 448 + num_frames: int = 48 + vocab_size: int = 30522 + type_vocab_size: int = 2 + hidden_size: int = 768 + intermediate_size: int = 3072 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + max_position_embeddings: int = 512 + max_grid_col_position_embeddings: int = 100 + max_grid_row_position_embeddings: int = 100 + hidden_dropout_prob: float = 0.1 + hidden_act: str = "gelu" + layer_norm_eps: float = 1e-12 + initializer_range: float = 0.02 + attention_probs_dropout_prob: float = 0.1 + pad_token_id: int | None = None + + def __post_init__(self, **kwargs): + self.backbone_config, kwargs = consolidate_backbone_kwargs_to_config( + backbone_config=self.backbone_config, default_config_type="resnet", default_config_kwargs={"out_features": ["stage4"]}, **kwargs, ) - self.backbone_config = backbone_config - self.distance_loss_weight = distance_loss_weight - self.duration_loss_weight = duration_loss_weight 
- self.visual_prompter_type = visual_prompter_type - self.visual_prompter_apply = visual_prompter_apply - self.visual_prompt_size = visual_prompt_size - self.max_img_size = max_img_size - self.num_frames = num_frames - self.vocab_size = vocab_size - self.type_vocab_size = type_vocab_size - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.max_position_embeddings = max_position_embeddings - self.max_grid_col_position_embeddings = max_grid_col_position_embeddings - self.max_grid_row_position_embeddings = max_grid_row_position_embeddings - self.layer_norm_eps = layer_norm_eps - self.hidden_dropout_prob = hidden_dropout_prob - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.pad_token_id = pad_token_id - - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["TvpConfig"] diff --git a/src/transformers/models/udop/configuration_udop.py b/src/transformers/models/udop/configuration_udop.py index 5d91c78815ca..a7cc63158ad2 100644 --- a/src/transformers/models/udop/configuration_udop.py +++ b/src/transformers/models/udop/configuration_udop.py @@ -13,14 +13,14 @@ # limitations under the License. """UDOP model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="microsoft/udop-large") +@strict(accept_kwargs=True) class UdopConfig(PreTrainedConfig): r""" relative_attention_num_buckets (`int`, *optional*, defaults to 32): @@ -42,78 +42,57 @@ class UdopConfig(PreTrainedConfig): keys_to_ignore_at_inference = ["past_key_values"] attribute_map = {"hidden_size": "d_model", "num_attention_heads": "num_heads", "num_hidden_layers": "num_layers"} - def __init__( - self, - vocab_size=33201, - d_model=1024, - d_kv=64, - d_ff=4096, - num_layers=24, - num_decoder_layers=None, - num_heads=16, - relative_attention_num_buckets=32, - relative_attention_max_distance=128, - relative_bias_args=[{"type": "1d"}, {"type": "horizontal"}, {"type": "vertical"}], - dropout_rate=0.1, - layer_norm_epsilon=1e-6, - initializer_factor=1.0, - feed_forward_proj="relu", - is_encoder_decoder=True, - use_cache=True, - pad_token_id=0, - eos_token_id=1, - max_2d_position_embeddings=1024, - image_size=224, - patch_size=16, - num_channels=3, - is_decoder=False, - add_cross_attention=False, - tie_word_embeddings=True, - **kwargs, - ): - self.is_decoder = is_decoder - self.add_cross_attention = add_cross_attention - self.vocab_size = vocab_size - self.d_model = d_model - self.d_kv = d_kv - self.d_ff = d_ff - self.num_layers = num_layers + vocab_size: int = 33201 + d_model: int = 1024 + d_kv: int = 64 + d_ff: int = 4096 + num_layers: int = 24 + num_decoder_layers: int | None = None + num_heads: int = 16 + relative_attention_num_buckets: int = 32 + relative_attention_max_distance: int = 128 + relative_bias_args: list[dict] | None = None + dropout_rate: float = 0.1 + layer_norm_epsilon: float = 1e-6 + initializer_factor: float = 1.0 + feed_forward_proj: str = "relu" + is_encoder_decoder: bool = True + use_cache: bool = True + pad_token_id: int | None = 0 + eos_token_id: int | None = 1 + max_2d_position_embeddings: int = 1024 + image_size: int 
| list[int] | tuple[int, int] = 224 + patch_size: int | list[int] | tuple[int, int] = 16 + num_channels: int = 3 + is_decoder: bool = False + add_cross_attention: bool = False + tie_word_embeddings: bool = True + + def __post_init__(self, **kwargs): + if self.relative_bias_args is None: + self.relative_bias_args = [{"type": "1d"}, {"type": "horizontal"}, {"type": "vertical"}] + self.num_decoder_layers = ( - num_decoder_layers if num_decoder_layers is not None else self.num_layers + self.num_decoder_layers if self.num_decoder_layers is not None else self.num_layers ) # default = symmetry - self.num_heads = num_heads - self.relative_attention_num_buckets = relative_attention_num_buckets - self.relative_attention_max_distance = relative_attention_max_distance - self.dropout_rate = dropout_rate - self.layer_norm_epsilon = layer_norm_epsilon - self.initializer_factor = initializer_factor - self.feed_forward_proj = feed_forward_proj - self.use_cache = use_cache - self.pad_token_id = pad_token_id - self.eos_token_id = eos_token_id - - # UDOP attributes - self.max_2d_position_embeddings = max_2d_position_embeddings - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - if not isinstance(relative_bias_args, list): - raise TypeError("`relative_bias_args` should be a list of dictionaries.") - self.relative_bias_args = relative_bias_args act_info = self.feed_forward_proj.split("-") self.dense_act_fn = act_info[-1] self.is_gated_act = act_info[0] == "gated" + kwargs.pop("tie_word_embeddings", None) + self.tie_word_embeddings = True + super().__post_init__(**kwargs) + + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + act_info = self.feed_forward_proj.split("-") if len(act_info) > 1 and act_info[0] != "gated" or len(act_info) > 2: raise ValueError( - f"`feed_forward_proj`: {feed_forward_proj} is not a valid activation function of the dense layer." + f"`feed_forward_proj`: {self.feed_forward_proj} is not a valid activation function of the dense layer." "Please make sure `feed_forward_proj` is of the format `gated-{ACT_FN}` or `{ACT_FN}`, e.g. 
" "'gated-gelu' or 'relu'" ) - self.tie_word_embeddings = True - super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) - __all__ = ["UdopConfig"] diff --git a/src/transformers/models/udop/modeling_udop.py b/src/transformers/models/udop/modeling_udop.py index 0e10b214037d..d9186136b555 100644 --- a/src/transformers/models/udop/modeling_udop.py +++ b/src/transformers/models/udop/modeling_udop.py @@ -1086,7 +1086,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict # input embeddings processing @@ -1367,7 +1367,7 @@ def forward( [1, 1, 1024] ```""" use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict # Encode if needed (training, first prediction pass) if encoder_outputs is None: @@ -1547,7 +1547,7 @@ def forward( ```""" use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if decoder_input_ids is None and labels is not None: decoder_input_ids = self._shift_right(labels) @@ -1707,7 +1707,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict encoder_outputs = self.encoder( input_ids=input_ids, diff --git a/src/transformers/models/umt5/configuration_umt5.py b/src/transformers/models/umt5/configuration_umt5.py index c775a5622832..b792ff6f1930 100644 --- a/src/transformers/models/umt5/configuration_umt5.py +++ b/src/transformers/models/umt5/configuration_umt5.py @@ -13,14 +13,14 @@ # limitations under the License. 
"""UMT5 model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="google/umt5-small") +@strict(accept_kwargs=True) class UMT5Config(PreTrainedConfig): r""" relative_attention_num_buckets (`int`, *optional*, defaults to 32): @@ -42,76 +42,53 @@ class UMT5Config(PreTrainedConfig): "head_dim": "d_kv", } - def __init__( - self, - vocab_size=250112, - d_model=512, - d_kv=64, - d_ff=1024, - num_layers=8, - num_decoder_layers=None, - num_heads=6, - relative_attention_num_buckets=32, - relative_attention_max_distance=128, - dropout_rate=0.1, - layer_norm_epsilon=1e-6, - initializer_factor=1.0, - feed_forward_proj="gated-gelu", - is_encoder_decoder=True, - use_cache=True, - tokenizer_class="T5Tokenizer", - pad_token_id=0, - eos_token_id=1, - decoder_start_token_id=0, - classifier_dropout=0.0, - is_decoder=False, - **kwargs, - ): - self.is_decoder = is_decoder - self.vocab_size = vocab_size - self.d_model = d_model - self.d_kv = d_kv - self.d_ff = d_ff - self.num_layers = num_layers + vocab_size: int = 250112 + d_model: int = 512 + d_kv: int = 64 + d_ff: int = 1024 + num_layers: int = 8 + num_decoder_layers: int | None = None + num_heads: int = 6 + relative_attention_num_buckets: int = 32 + relative_attention_max_distance: int = 128 + dropout_rate: float = 0.1 + layer_norm_epsilon: float = 1e-6 + initializer_factor: float = 1.0 + feed_forward_proj: str = "gated-gelu" + is_encoder_decoder: bool = True + use_cache: bool = True + tokenizer_class: str = "T5Tokenizer" + pad_token_id: int | None = 0 + eos_token_id: int | None = 1 + decoder_start_token_id: int | None = 0 + classifier_dropout: float | int = 0.0 + is_decoder: bool = False + + def __post_init__(self, **kwargs): self.num_decoder_layers = ( - num_decoder_layers if num_decoder_layers is not None else self.num_layers + self.num_decoder_layers if self.num_decoder_layers is not None else self.num_layers ) # default = symmetry - self.num_heads = num_heads - self.relative_attention_num_buckets = relative_attention_num_buckets - self.relative_attention_max_distance = relative_attention_max_distance - self.dropout_rate = dropout_rate - self.classifier_dropout = classifier_dropout - self.layer_norm_epsilon = layer_norm_epsilon - self.initializer_factor = initializer_factor - self.feed_forward_proj = feed_forward_proj - self.use_cache = use_cache - act_info = self.feed_forward_proj.split("-") self.dense_act_fn = act_info[-1] self.is_gated_act = act_info[0] == "gated" + if self.feed_forward_proj == "gated-gelu": + self.dense_act_fn = "gelu_new" + + kwargs.pop("tie_word_embeddings", None) + self.tie_word_embeddings = True # force it for T5 family + + super().__post_init__(**kwargs) + + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + act_info = self.feed_forward_proj.split("-") if len(act_info) > 1 and act_info[0] != "gated" or len(act_info) > 2: raise ValueError( - f"`feed_forward_proj`: {feed_forward_proj} is not a valid activation function of the dense layer. " + f"`feed_forward_proj`: {self.feed_forward_proj} is not a valid activation function of the dense layer. " "Please make sure `feed_forward_proj` is of the format `gated-{ACT_FN}` or `{ACT_FN}`, e.g. 
" "'gated-gelu' or 'relu'" ) - if feed_forward_proj == "gated-gelu": - self.dense_act_fn = "gelu_new" - - self.tokenizer_class = tokenizer_class - self.pad_token_id = pad_token_id - self.eos_token_id = eos_token_id - self.decoder_start_token_id = decoder_start_token_id - # Force tie_word_embeddings to `True` for T5 family - kwargs.pop("tie_word_embeddings", None) - self.tie_word_embeddings = True - - super().__init__( - is_encoder_decoder=is_encoder_decoder, - **kwargs, - ) - __all__ = ["UMT5Config"] diff --git a/src/transformers/models/umt5/modeling_umt5.py b/src/transformers/models/umt5/modeling_umt5.py index f18e8393834d..c556382531cc 100644 --- a/src/transformers/models/umt5/modeling_umt5.py +++ b/src/transformers/models/umt5/modeling_umt5.py @@ -620,7 +620,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if input_ids is not None and inputs_embeds is not None: err_msg_prefix = "decoder_" if self.is_decoder else "" @@ -882,7 +882,7 @@ def forward( >>> last_hidden_states = outputs.last_hidden_state ```""" use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict # Encode if needed (training, first prediction pass) if encoder_outputs is None: @@ -1065,7 +1065,7 @@ def forward( >>> tokenizer.decode(outputs[0], skip_special_tokens=True) ```""" use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict # Encode if needed (training, first prediction pass) if encoder_outputs is None: @@ -1221,7 +1221,7 @@ def forward( >>> outputs = model(input_ids=input_ids) >>> last_hidden_states = outputs.last_hidden_state ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict encoder_outputs = self.encoder( input_ids=input_ids, @@ -1301,7 +1301,7 @@ def forward( Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if labels is not None: use_cache = False @@ -1429,7 +1429,7 @@ def forward( labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.transformer( input_ids, @@ -1546,7 +1546,7 @@ def forward( Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also be used by default. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict use_cache = use_cache if use_cache is not None else self.config.use_cache if start_positions is not None and end_positions is not None: use_cache = False @@ -1564,7 +1564,7 @@ def forward( decoder_input_ids = self._shift_right(input_ids) use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict # Encode if needed (training, first prediction pass) if encoder_outputs is None: diff --git a/src/transformers/models/unispeech/configuration_unispeech.py b/src/transformers/models/unispeech/configuration_unispeech.py index e266eac00b9a..974ec1f30126 100644 --- a/src/transformers/models/unispeech/configuration_unispeech.py +++ b/src/transformers/models/unispeech/configuration_unispeech.py @@ -16,14 +16,14 @@ import functools import operator -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="microsoft/unispeech-large-1500h-cv") +@strict(accept_kwargs=True) class UniSpeechConfig(PreTrainedConfig): r""" feat_proj_dropout (`float`, *optional*, defaults to 0.0): @@ -135,89 +135,60 @@ class UniSpeechConfig(PreTrainedConfig): model_type = "unispeech" - def __init__( - self, - vocab_size=32, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout=0.1, - activation_dropout=0.1, - attention_dropout=0.1, - feat_proj_dropout=0.0, - feat_quantizer_dropout=0.0, - final_dropout=0.1, - layerdrop=0.1, - initializer_range=0.02, - layer_norm_eps=1e-5, - feat_extract_norm="group", - feat_extract_activation="gelu", - conv_dim=(512, 512, 512, 512, 512, 512, 512), - conv_stride=(5, 2, 2, 2, 2, 2, 2), - conv_kernel=(10, 3, 3, 3, 3, 2, 2), - conv_bias=False, - num_conv_pos_embeddings=128, - num_conv_pos_embedding_groups=16, - do_stable_layer_norm=False, - apply_spec_augment=True, - mask_time_prob=0.05, - mask_time_length=10, - mask_time_min_masks=2, - mask_feature_prob=0.0, - mask_feature_length=10, - mask_feature_min_masks=0, - num_codevectors_per_group=320, - num_codevector_groups=2, - contrastive_logits_temperature=0.1, - num_negatives=100, - codevector_dim=256, - proj_codevector_dim=256, - diversity_loss_weight=0.1, - ctc_loss_reduction="mean", - ctc_zero_infinity=False, - use_weighted_layer_sum=False, - classifier_proj_size=256, - num_ctc_classes=80, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - replace_prob=0.5, - **kwargs, - ): - super().__init__(**kwargs) - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.hidden_size = hidden_size - self.feat_extract_norm = feat_extract_norm - self.feat_extract_activation = feat_extract_activation - self.conv_dim = list(conv_dim) - self.conv_stride = list(conv_stride) - self.conv_kernel = list(conv_kernel) - self.conv_bias = conv_bias - self.num_conv_pos_embeddings = num_conv_pos_embeddings - self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups + vocab_size: int = 32 + hidden_size: int = 768 + num_hidden_layers: int 
= 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout: float | int = 0.1 + activation_dropout: float | int = 0.1 + attention_dropout: float | int = 0.1 + feat_proj_dropout: float | int = 0.0 + feat_quantizer_dropout: float | int = 0.0 + final_dropout: float | int = 0.1 + layerdrop: float | int = 0.1 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-5 + feat_extract_norm: str = "group" + feat_extract_activation: str = "gelu" + conv_dim: list[int] | tuple[int, ...] = (512, 512, 512, 512, 512, 512, 512) + conv_stride: list[int] | tuple[int, ...] = (5, 2, 2, 2, 2, 2, 2) + conv_kernel: list[int] | tuple[int, ...] = (10, 3, 3, 3, 3, 2, 2) + conv_bias: bool = False + num_conv_pos_embeddings: int = 128 + num_conv_pos_embedding_groups: int = 16 + do_stable_layer_norm: bool = False + apply_spec_augment: bool = True + mask_time_prob: float = 0.05 + mask_time_length: int = 10 + mask_time_min_masks: int = 2 + mask_feature_prob: float = 0.0 + mask_feature_length: int = 10 + mask_feature_min_masks: int = 0 + num_codevectors_per_group: int = 320 + num_codevector_groups: int = 2 + contrastive_logits_temperature: float = 0.1 + num_negatives: int = 100 + codevector_dim: int = 256 + proj_codevector_dim: int = 256 + diversity_loss_weight: float = 0.1 + ctc_loss_reduction: str = "mean" + ctc_zero_infinity: bool = False + use_weighted_layer_sum: bool = False + classifier_proj_size: int = 256 + num_ctc_classes: int = 80 + pad_token_id: int | None = 0 + bos_token_id: int | None = 1 + eos_token_id: int | None = 2 + replace_prob: float = 0.5 + + def __post_init__(self, **kwargs): self.num_feat_extract_layers = len(self.conv_dim) - self.num_hidden_layers = num_hidden_layers - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.num_attention_heads = num_attention_heads - self.hidden_dropout = hidden_dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.feat_proj_dropout = feat_proj_dropout - self.final_dropout = final_dropout - self.layerdrop = layerdrop - self.layer_norm_eps = layer_norm_eps - self.initializer_range = initializer_range - self.num_ctc_classes = num_ctc_classes - self.vocab_size = vocab_size - self.do_stable_layer_norm = do_stable_layer_norm - self.use_weighted_layer_sum = use_weighted_layer_sum - self.classifier_proj_size = classifier_proj_size + return super().__post_init__(**kwargs) + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" if ( (len(self.conv_stride) != self.num_feat_extract_layers) or (len(self.conv_kernel) != self.num_feat_extract_layers) @@ -230,32 +201,6 @@ def __init__( f" `len(config.conv_kernel) = {len(self.conv_kernel)}`." 
) - # fine-tuning config parameters for SpecAugment: https://huggingface.co/papers/1904.08779 - self.apply_spec_augment = apply_spec_augment - self.mask_time_prob = mask_time_prob - self.mask_time_length = mask_time_length - self.mask_time_min_masks = mask_time_min_masks - self.mask_feature_prob = mask_feature_prob - self.mask_feature_length = mask_feature_length - self.mask_feature_min_masks = mask_feature_min_masks - - # parameters for pretraining with codevector quantized representations - self.num_codevectors_per_group = num_codevectors_per_group - self.num_codevector_groups = num_codevector_groups - self.contrastive_logits_temperature = contrastive_logits_temperature - self.feat_quantizer_dropout = feat_quantizer_dropout - self.num_negatives = num_negatives - self.codevector_dim = codevector_dim - self.proj_codevector_dim = proj_codevector_dim - self.diversity_loss_weight = diversity_loss_weight - - # ctc loss - self.ctc_loss_reduction = ctc_loss_reduction - self.ctc_zero_infinity = ctc_zero_infinity - - # pretraining loss - self.replace_prob = replace_prob - @property def inputs_to_logits_ratio(self): return functools.reduce(operator.mul, self.conv_stride, 1) diff --git a/src/transformers/models/unispeech/modeling_unispeech.py b/src/transformers/models/unispeech/modeling_unispeech.py index a66f17d88b18..26f1f8e64ce2 100755 --- a/src/transformers/models/unispeech/modeling_unispeech.py +++ b/src/transformers/models/unispeech/modeling_unispeech.py @@ -1009,7 +1009,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict extract_features = self.feature_extractor(input_values) extract_features = extract_features.transpose(1, 2) @@ -1120,7 +1120,7 @@ def forward( >>> # TODO: Add full pretraining example ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.unispeech( input_values, @@ -1266,7 +1266,7 @@ def forward( All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size - 1]`. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if labels is not None and labels.max() >= self.config.vocab_size: raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") @@ -1384,7 +1384,7 @@ def forward( `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states outputs = self.unispeech( diff --git a/src/transformers/models/unispeech/modular_unispeech.py b/src/transformers/models/unispeech/modular_unispeech.py index 07131202ecba..6c94a57f0973 100644 --- a/src/transformers/models/unispeech/modular_unispeech.py +++ b/src/transformers/models/unispeech/modular_unispeech.py @@ -253,7 +253,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict extract_features = self.feature_extractor(input_values) extract_features = extract_features.transpose(1, 2) @@ -364,7 +364,7 @@ def forward( >>> # TODO: Add full pretraining example ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.unispeech( input_values, diff --git a/src/transformers/models/unispeech_sat/configuration_unispeech_sat.py b/src/transformers/models/unispeech_sat/configuration_unispeech_sat.py index 8c4886f2a4b1..c1a149473436 100644 --- a/src/transformers/models/unispeech_sat/configuration_unispeech_sat.py +++ b/src/transformers/models/unispeech_sat/configuration_unispeech_sat.py @@ -16,14 +16,14 @@ import functools import operator -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="microsoft/unispeech-sat-base-100h-libri-ft") +@strict(accept_kwargs=True) class UniSpeechSatConfig(PreTrainedConfig): r""" feat_proj_dropout (`float`, *optional*, defaults to 0.0): @@ -144,91 +144,63 @@ class UniSpeechSatConfig(PreTrainedConfig): model_type = "unispeech-sat" - def __init__( - self, - vocab_size=32, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout=0.1, - activation_dropout=0.1, - attention_dropout=0.1, - feat_proj_dropout=0.0, - feat_quantizer_dropout=0.0, - final_dropout=0.1, - layerdrop=0.1, - initializer_range=0.02, - layer_norm_eps=1e-5, - feat_extract_norm="group", - feat_extract_activation="gelu", - conv_dim=(512, 512, 512, 512, 512, 512, 512), - conv_stride=(5, 2, 2, 2, 2, 2, 2), - conv_kernel=(10, 3, 3, 3, 3, 2, 2), - conv_bias=False, - num_conv_pos_embeddings=128, - num_conv_pos_embedding_groups=16, - do_stable_layer_norm=False, - apply_spec_augment=True, - mask_time_prob=0.05, - mask_time_length=10, - mask_time_min_masks=2, - mask_feature_prob=0.0, - mask_feature_length=10, - mask_feature_min_masks=0, - num_codevectors_per_group=320, - num_codevector_groups=2, - contrastive_logits_temperature=0.1, - num_negatives=100, - codevector_dim=256, - proj_codevector_dim=256, - diversity_loss_weight=0.1, - ctc_loss_reduction="mean", - ctc_zero_infinity=False, - use_weighted_layer_sum=False, - classifier_proj_size=256, - tdnn_dim=(512, 512, 512, 512, 1500), - tdnn_kernel=(5, 3, 3, 1, 1), - tdnn_dilation=(1, 2, 3, 1, 1), 
- xvector_output_dim=512, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - num_clusters=504, - **kwargs, - ): - super().__init__(**kwargs) - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.hidden_size = hidden_size - self.feat_extract_norm = feat_extract_norm - self.feat_extract_activation = feat_extract_activation - self.conv_dim = list(conv_dim) - self.conv_stride = list(conv_stride) - self.conv_kernel = list(conv_kernel) - self.conv_bias = conv_bias - self.num_conv_pos_embeddings = num_conv_pos_embeddings - self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups + vocab_size: int = 32 + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout: float | int = 0.1 + activation_dropout: float | int = 0.1 + attention_dropout: float | int = 0.1 + feat_proj_dropout: float | int = 0.0 + feat_quantizer_dropout: float | int = 0.0 + final_dropout: float | int = 0.1 + layerdrop: float | int = 0.1 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-5 + feat_extract_norm: str = "group" + feat_extract_activation: str = "gelu" + conv_dim: list[int] | tuple[int, ...] = (512, 512, 512, 512, 512, 512, 512) + conv_stride: list[int] | tuple[int, ...] = (5, 2, 2, 2, 2, 2, 2) + conv_kernel: list[int] | tuple[int, ...] = (10, 3, 3, 3, 3, 2, 2) + conv_bias: bool = False + num_conv_pos_embeddings: int = 128 + num_conv_pos_embedding_groups: int = 16 + do_stable_layer_norm: bool = False + apply_spec_augment: bool = True + mask_time_prob: float = 0.05 + mask_time_length: int = 10 + mask_time_min_masks: int = 2 + mask_feature_prob: float = 0.0 + mask_feature_length: int = 10 + mask_feature_min_masks: int = 0 + num_codevectors_per_group: int = 320 + num_codevector_groups: int = 2 + contrastive_logits_temperature: float = 0.1 + num_negatives: int = 100 + codevector_dim: int = 256 + proj_codevector_dim: int = 256 + diversity_loss_weight: float = 0.1 + ctc_loss_reduction: str = "mean" + ctc_zero_infinity: bool = False + use_weighted_layer_sum: bool = False + classifier_proj_size: int = 256 + tdnn_dim: list[int] | tuple[int, ...] = (512, 512, 512, 512, 1500) + tdnn_kernel: list[int] | tuple[int, ...] = (5, 3, 3, 1, 1) + tdnn_dilation: list[int] | tuple[int, ...] = (1, 2, 3, 1, 1) + xvector_output_dim: int = 512 + pad_token_id: int = 0 + bos_token_id: int = 1 + eos_token_id: int = 2 + num_clusters: int = 504 + + def __post_init__(self, **kwargs): self.num_feat_extract_layers = len(self.conv_dim) - self.num_hidden_layers = num_hidden_layers - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.num_attention_heads = num_attention_heads - self.hidden_dropout = hidden_dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.feat_proj_dropout = feat_proj_dropout - self.final_dropout = final_dropout - self.layerdrop = layerdrop - self.layer_norm_eps = layer_norm_eps - self.initializer_range = initializer_range - self.vocab_size = vocab_size - self.num_clusters = num_clusters - self.do_stable_layer_norm = do_stable_layer_norm - self.use_weighted_layer_sum = use_weighted_layer_sum + return super().__post_init__(**kwargs) + def validate_architecture(self): + """Part of `@strict`-powered validation. 
Validates the architecture of the config.""" if ( (len(self.conv_stride) != self.num_feat_extract_layers) or (len(self.conv_kernel) != self.num_feat_extract_layers) @@ -241,38 +213,6 @@ def __init__( f" `len(config.conv_kernel) = {len(self.conv_kernel)}`." ) - # fine-tuning config parameters for SpecAugment: https://huggingface.co/papers/1904.08779 - self.apply_spec_augment = apply_spec_augment - self.mask_time_prob = mask_time_prob - self.mask_time_length = mask_time_length - self.mask_time_min_masks = mask_time_min_masks - self.mask_feature_prob = mask_feature_prob - self.mask_feature_length = mask_feature_length - self.mask_feature_min_masks = mask_feature_min_masks - - # parameters for pretraining with codevector quantized representations - self.num_codevectors_per_group = num_codevectors_per_group - self.num_codevector_groups = num_codevector_groups - self.contrastive_logits_temperature = contrastive_logits_temperature - self.feat_quantizer_dropout = feat_quantizer_dropout - self.num_negatives = num_negatives - self.codevector_dim = codevector_dim - self.proj_codevector_dim = proj_codevector_dim - self.diversity_loss_weight = diversity_loss_weight - - # ctc loss - self.ctc_loss_reduction = ctc_loss_reduction - self.ctc_zero_infinity = ctc_zero_infinity - - # SequenceClassification-specific parameter. Feel free to ignore for other classes. - self.classifier_proj_size = classifier_proj_size - - # XVector-specific parameters. Feel free to ignore for other classes. - self.tdnn_dim = list(tdnn_dim) - self.tdnn_kernel = list(tdnn_kernel) - self.tdnn_dilation = list(tdnn_dilation) - self.xvector_output_dim = xvector_output_dim - @property def inputs_to_logits_ratio(self): return functools.reduce(operator.mul, self.conv_stride, 1) diff --git a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py index 945b3ab0e437..bb97b659fae7 100755 --- a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py +++ b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py @@ -1014,7 +1014,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict extract_features = self.feature_extractor(input_values) extract_features = extract_features.transpose(1, 2) @@ -1133,7 +1133,7 @@ def forward( >>> # TODO: Add full pretraining example ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.unispeech_sat( input_values, @@ -1262,7 +1262,7 @@ def forward( All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size - 1]`. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if labels is not None and labels.max() >= self.config.vocab_size: raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") @@ -1380,7 +1380,7 @@ def forward( `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states outputs = self.unispeech_sat( @@ -1484,7 +1484,7 @@ def forward( `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states outputs = self.unispeech_sat( @@ -1656,7 +1656,7 @@ def forward( `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states outputs = self.unispeech_sat( diff --git a/src/transformers/models/unispeech_sat/modular_unispeech_sat.py b/src/transformers/models/unispeech_sat/modular_unispeech_sat.py index 13bf9cd5103f..c445c42b9139 100644 --- a/src/transformers/models/unispeech_sat/modular_unispeech_sat.py +++ b/src/transformers/models/unispeech_sat/modular_unispeech_sat.py @@ -264,7 +264,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict extract_features = self.feature_extractor(input_values) extract_features = extract_features.transpose(1, 2) @@ -383,7 +383,7 @@ def forward( >>> # TODO: Add full pretraining example ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.unispeech_sat( input_values, diff --git a/src/transformers/models/univnet/configuration_univnet.py b/src/transformers/models/univnet/configuration_univnet.py index 45133b7e5d39..f6ced84dc437 100644 --- a/src/transformers/models/univnet/configuration_univnet.py +++ b/src/transformers/models/univnet/configuration_univnet.py @@ -13,14 +13,14 @@ # limitations under the License. 
"""UnivNetModel model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="dg845/univnet-dev") +@strict(accept_kwargs=True) class UnivNetConfig(PreTrainedConfig): r""" model_in_channels (`int`, *optional*, defaults to 64): @@ -76,41 +76,28 @@ class UnivNetConfig(PreTrainedConfig): model_type = "univnet" - def __init__( - self, - model_in_channels=64, - model_hidden_channels=32, - num_mel_bins=100, - resblock_kernel_sizes=[3, 3, 3], - resblock_stride_sizes=[8, 8, 4], - resblock_dilation_sizes=[[1, 3, 9, 27], [1, 3, 9, 27], [1, 3, 9, 27]], - kernel_predictor_num_blocks=3, - kernel_predictor_hidden_channels=64, - kernel_predictor_conv_size=3, - kernel_predictor_dropout=0.0, - initializer_range=0.01, - leaky_relu_slope=0.2, - **kwargs, - ): - if not (len(resblock_kernel_sizes) == len(resblock_stride_sizes) == len(resblock_dilation_sizes)): + model_in_channels: int = 64 + model_hidden_channels: int = 32 + num_mel_bins: int = 100 + resblock_kernel_sizes: list[int] | tuple[int, ...] = (3, 3, 3) + resblock_stride_sizes: list[int] | tuple[int, ...] = (8, 8, 4) + resblock_dilation_sizes: list | tuple = ((1, 3, 9, 27), (1, 3, 9, 27), (1, 3, 9, 27)) + kernel_predictor_num_blocks: int = 3 + kernel_predictor_hidden_channels: int = 64 + kernel_predictor_conv_size: int = 3 + kernel_predictor_dropout: float | int = 0.0 + initializer_range: float = 0.01 + leaky_relu_slope: float = 0.2 + + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + if not ( + len(self.resblock_kernel_sizes) == len(self.resblock_stride_sizes) == len(self.resblock_dilation_sizes) + ): raise ValueError( "`resblock_kernel_sizes`, `resblock_stride_sizes`, and `resblock_dilation_sizes` must all have the" " same length (which will be the number of resnet blocks in the model)." 
) - self.model_in_channels = model_in_channels - self.model_hidden_channels = model_hidden_channels - self.num_mel_bins = num_mel_bins - self.resblock_kernel_sizes = resblock_kernel_sizes - self.resblock_stride_sizes = resblock_stride_sizes - self.resblock_dilation_sizes = resblock_dilation_sizes - self.kernel_predictor_num_blocks = kernel_predictor_num_blocks - self.kernel_predictor_hidden_channels = kernel_predictor_hidden_channels - self.kernel_predictor_conv_size = kernel_predictor_conv_size - self.kernel_predictor_dropout = kernel_predictor_dropout - self.initializer_range = initializer_range - self.leaky_relu_slope = leaky_relu_slope - super().__init__(**kwargs) - __all__ = ["UnivNetConfig"] diff --git a/src/transformers/models/univnet/modeling_univnet.py b/src/transformers/models/univnet/modeling_univnet.py index a7436eb7bd1e..a6b8eee85e7f 100644 --- a/src/transformers/models/univnet/modeling_univnet.py +++ b/src/transformers/models/univnet/modeling_univnet.py @@ -516,7 +516,7 @@ def forward( [1, 140288] ``` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict # Resolve batch sizes for noise_sequence and spectrogram spectrogram_batched = input_features.dim() == 3 diff --git a/src/transformers/models/upernet/configuration_upernet.py b/src/transformers/models/upernet/configuration_upernet.py index a9e98f580e6f..519b64e1e5d0 100644 --- a/src/transformers/models/upernet/configuration_upernet.py +++ b/src/transformers/models/upernet/configuration_upernet.py @@ -13,16 +13,16 @@ # limitations under the License. """UperNet model configuration""" +from huggingface_hub.dataclasses import strict + from ...backbone_utils import consolidate_backbone_kwargs_to_config from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging +from ...utils import auto_docstring from ..auto.configuration_auto import AutoConfig -logger = logging.get_logger(__name__) - - @auto_docstring(checkpoint="openmmlab/upernet-convnext-tiny") +@strict(accept_kwargs=True) class UperNetConfig(PreTrainedConfig): r""" pool_scales (`tuple[int]`, *optional*, defaults to `[1, 2, 3, 6]`): @@ -60,43 +60,28 @@ class UperNetConfig(PreTrainedConfig): model_type = "upernet" sub_configs = {"backbone_config": AutoConfig} - def __init__( - self, - backbone_config=None, - hidden_size=512, - initializer_range=0.02, - pool_scales=[1, 2, 3, 6], - use_auxiliary_head=True, - auxiliary_loss_weight=0.4, - auxiliary_in_channels=None, - auxiliary_channels=256, - auxiliary_num_convs=1, - auxiliary_concat_input=False, - loss_ignore_index=255, - **kwargs, - ): - backbone_config, kwargs = consolidate_backbone_kwargs_to_config( - backbone_config=backbone_config, + backbone_config: dict | PreTrainedConfig | None = None + hidden_size: int = 512 + initializer_range: float = 0.02 + pool_scales: list[int] | tuple[int, ...] 
= (1, 2, 3, 6) + use_auxiliary_head: bool = True + auxiliary_loss_weight: float = 0.4 + auxiliary_in_channels: int | None = None + auxiliary_channels: int = 256 + auxiliary_num_convs: int = 1 + auxiliary_concat_input: bool = False + loss_ignore_index: int = 255 + + def __post_init__(self, **kwargs): + self.backbone_config, kwargs = consolidate_backbone_kwargs_to_config( + backbone_config=self.backbone_config, default_config_type="resnet", default_config_kwargs={ "out_features": ["stage1", "stage2", "stage3", "stage4"], }, **kwargs, ) - - self.backbone_config = backbone_config - self.hidden_size = hidden_size - self.initializer_range = initializer_range - self.pool_scales = pool_scales - self.use_auxiliary_head = use_auxiliary_head - self.auxiliary_loss_weight = auxiliary_loss_weight - self.auxiliary_in_channels = auxiliary_in_channels - self.auxiliary_channels = auxiliary_channels - self.auxiliary_num_convs = auxiliary_num_convs - self.auxiliary_concat_input = auxiliary_concat_input - self.loss_ignore_index = loss_ignore_index - - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["UperNetConfig"] diff --git a/src/transformers/models/upernet/modeling_upernet.py b/src/transformers/models/upernet/modeling_upernet.py index bf497134646f..98dd697da205 100644 --- a/src/transformers/models/upernet/modeling_upernet.py +++ b/src/transformers/models/upernet/modeling_upernet.py @@ -330,7 +330,7 @@ def forward( if labels is not None and self.config.num_labels == 1: raise ValueError("The number of labels should be greater than one") - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) diff --git a/src/transformers/models/vaultgemma/configuration_vaultgemma.py b/src/transformers/models/vaultgemma/configuration_vaultgemma.py index 379c1edf4449..8d410494b02f 100644 --- a/src/transformers/models/vaultgemma/configuration_vaultgemma.py +++ b/src/transformers/models/vaultgemma/configuration_vaultgemma.py @@ -19,12 +19,15 @@ # limitations under the License. 
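The modeling-side hunks in this patch are a single mechanical rename: when `return_dict` is not passed explicitly, the forward methods now fall back to `config.return_dict` instead of the former `config.use_return_dict`. A minimal sketch of the resolution pattern with a stand-in config object (names are illustrative):

```python
from types import SimpleNamespace

# Stand-in for a patched config; only the attribute used here is modeled.
config = SimpleNamespace(return_dict=True)


def resolve_return_dict(return_dict: bool | None = None) -> bool:
    # Same fallback as the `+` lines above: an explicit argument wins,
    # otherwise use the config default.
    return return_dict if return_dict is not None else config.return_dict


assert resolve_return_dict() is True
assert resolve_return_dict(False) is False
```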
-from ...configuration_utils import PreTrainedConfig, layer_type_validation +from huggingface_hub.dataclasses import strict + +from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring @auto_docstring(checkpoint="google/vaultgemma-1b") +@strict(accept_kwargs=True) class VaultGemmaConfig(PreTrainedConfig): r""" query_pre_attn_scalar (`float`, *optional*, defaults to 256): @@ -63,67 +66,46 @@ class VaultGemmaConfig(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size: int | None = 256000, - hidden_size: int | None = 2304, - intermediate_size: int | None = 9216, - num_hidden_layers: int | None = 26, - num_attention_heads: int | None = 8, - num_key_value_heads: int | None = 4, - head_dim: int | None = 256, - hidden_activation: str | None = "gelu_pytorch_tanh", - max_position_embeddings: int | None = 8192, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-6, - use_cache: bool | None = True, - pad_token_id: int | None = 0, - eos_token_id: int | None = 1, - bos_token_id: int | None = 2, - tie_word_embeddings: bool | None = True, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - query_pre_attn_scalar: int | None = 256, - sliding_window: int | None = 4096, - layer_types: list[str] | None = None, - final_logit_softcapping: float | None = 30.0, - attn_logit_softcapping: float | None = 50.0, - **kwargs, - ): - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.head_dim = head_dim - self.num_key_value_heads = num_key_value_heads - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.hidden_activation = hidden_activation - self.query_pre_attn_scalar = query_pre_attn_scalar - self.sliding_window = sliding_window - self.final_logit_softcapping = final_logit_softcapping - self.attn_logit_softcapping = attn_logit_softcapping - self.layer_types = layer_types + vocab_size: int = 256000 + hidden_size: int = 2304 + intermediate_size: int = 9216 + num_hidden_layers: int = 26 + num_attention_heads: int = 8 + num_key_value_heads: int = 4 + head_dim: int = 256 + hidden_activation: str = "gelu_pytorch_tanh" + max_position_embeddings: int = 8192 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-6 + use_cache: bool = True + pad_token_id: int | None = 0 + eos_token_id: int | list[int] | None = 1 + bos_token_id: int | None = 2 + tie_word_embeddings: bool = True + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + attention_dropout: int | float | None = 0.0 + query_pre_attn_scalar: int = 256 + sliding_window: int | None = 4096 + layer_types: list[str] | None = None + final_logit_softcapping: float | None = 30.0 + attn_logit_softcapping: float | None = 50.0 + def __post_init__(self, **kwargs): if self.layer_types is None: self.layer_types = [ "sliding_attention" if bool((i + 1) % 2) else "full_attention" for i in 
range(self.num_hidden_layers) ] - layer_type_validation(self.layer_types, self.num_hidden_layers) - self.rope_parameters = rope_parameters + super().__post_init__(**kwargs) - super().__init__(**kwargs) + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + if self.hidden_size % self.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({self.hidden_size}) is not a multiple of the number of attention " + f"heads ({self.num_attention_heads})." + ) __all__ = ["VaultGemmaConfig"] diff --git a/src/transformers/models/vaultgemma/modeling_vaultgemma.py b/src/transformers/models/vaultgemma/modeling_vaultgemma.py index 1d8b05637ffe..62d9439aa66d 100644 --- a/src/transformers/models/vaultgemma/modeling_vaultgemma.py +++ b/src/transformers/models/vaultgemma/modeling_vaultgemma.py @@ -130,7 +130,7 @@ def eager_attention_forward( key: torch.Tensor, value: torch.Tensor, attention_mask: torch.Tensor | None, - dropout: float = 0.0, + dropout: float | int = 0.0, scaling: float | None = None, softcap: float | None = None, **kwargs, diff --git a/src/transformers/models/vaultgemma/modular_vaultgemma.py b/src/transformers/models/vaultgemma/modular_vaultgemma.py index f6efdbc8f524..7a969e418133 100644 --- a/src/transformers/models/vaultgemma/modular_vaultgemma.py +++ b/src/transformers/models/vaultgemma/modular_vaultgemma.py @@ -14,73 +14,38 @@ import torch +from huggingface_hub.dataclasses import strict from ...cache_utils import Cache -from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring from ..gemma2.configuration_gemma2 import Gemma2Config from ..gemma2.modeling_gemma2 import Gemma2Attention, Gemma2DecoderLayer, Gemma2ForCausalLM, Gemma2MLP, Gemma2RMSNorm @auto_docstring(checkpoint="google/vaultgemma-1b") +@strict(accept_kwargs=True) class VaultGemmaConfig(Gemma2Config): - def __init__( - self, - vocab_size: int | None = 256000, - hidden_size: int | None = 2304, - intermediate_size: int | None = 9216, - num_hidden_layers: int | None = 26, - num_attention_heads: int | None = 8, - num_key_value_heads: int | None = 4, - head_dim: int | None = 256, - hidden_activation: str | None = "gelu_pytorch_tanh", - max_position_embeddings: int | None = 8192, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-6, - use_cache: bool | None = True, - pad_token_id: int | None = 0, - eos_token_id: int | None = 1, - bos_token_id: int | None = 2, - tie_word_embeddings: bool | None = True, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - query_pre_attn_scalar: int | None = 256, - sliding_window: int | None = 4096, - layer_types: list[str] | None = None, - final_logit_softcapping: float | None = 30.0, - attn_logit_softcapping: float | None = 50.0, - **kwargs, - ): - super().__init__( - vocab_size=vocab_size, - hidden_size=hidden_size, - intermediate_size=intermediate_size, - num_hidden_layers=num_hidden_layers, - num_attention_heads=num_attention_heads, - num_key_value_heads=num_key_value_heads, - head_dim=head_dim, - hidden_activation=hidden_activation, - max_position_embeddings=max_position_embeddings, - initializer_range=initializer_range, - rms_norm_eps=rms_norm_eps, - use_cache=use_cache, - pad_token_id=pad_token_id, - eos_token_id=eos_token_id, - bos_token_id=bos_token_id, - tie_word_embeddings=tie_word_embeddings, - rope_parameters=rope_parameters, - attention_bias=attention_bias, 
- attention_dropout=attention_dropout, - query_pre_attn_scalar=query_pre_attn_scalar, - sliding_window=sliding_window, - layer_types=layer_types, - final_logit_softcapping=final_logit_softcapping, - attn_logit_softcapping=attn_logit_softcapping, - **kwargs, - ) - - del self.use_bidirectional_attention + r""" + query_pre_attn_scalar (`float`, *optional*, defaults to 256): + scaling factor used on the attention scores + final_logit_softcapping (`float`, *optional*, defaults to 30.0): + scaling factor when applying tanh softcapping on the logits. + attn_logit_softcapping (`float`, *optional*, defaults to 50.0): + scaling factor when applying tanh softcapping on the attention scores. + use_bidirectional_attention (`bool`, *optional*): + If True, the model will attend to all text tokens instead of using a causal mask. + + ```python + >>> from transformers import VaultGemmaModel, VaultGemmaConfig + >>> # Initializing a VaultGemma vaultgemma-7b style configuration + >>> configuration = VaultGemmaConfig() + >>> # Initializing a model from the vaultgemma-7b style configuration + >>> model = VaultGemmaModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + use_bidirectional_attention = AttributeError() class VaultGemmaRMSNorm(Gemma2RMSNorm): diff --git a/src/transformers/models/vibevoice_acoustic_tokenizer/configuration_vibevoice_acoustic_tokenizer.py b/src/transformers/models/vibevoice_acoustic_tokenizer/configuration_vibevoice_acoustic_tokenizer.py index 826d798cb9c5..7c2f9df4d618 100644 --- a/src/transformers/models/vibevoice_acoustic_tokenizer/configuration_vibevoice_acoustic_tokenizer.py +++ b/src/transformers/models/vibevoice_acoustic_tokenizer/configuration_vibevoice_acoustic_tokenizer.py @@ -15,11 +15,14 @@ import math +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PretrainedConfig from ...utils import auto_docstring -@auto_docstring(checkpoint="microsoft/VibeVoice-AcousticTokenizer") +@auto_docstring(checkpoint="microsoft/VibeVoice-1.5B") +@strict(accept_kwargs=True) class VibeVoiceAcousticTokenizerConfig(PretrainedConfig): r""" channels (`int`, *optional*, defaults to 1): @@ -56,35 +59,18 @@ class VibeVoiceAcousticTokenizerConfig(PretrainedConfig): model_type = "vibevoice_acoustic_tokenizer" - def __init__( - self, - channels=1, - hidden_size=64, - kernel_size=7, - rms_norm_eps=1e-5, - layer_scale_init_value=1e-6, - initializer_range=1e-2, - num_filters=32, - downsampling_ratios=[2, 2, 4, 5, 5, 8], - depths=[3, 3, 3, 3, 3, 3, 8], - hidden_act="gelu", - ffn_expansion=4, - vae_std=0.625, - **kwargs, - ): - super().__init__(**kwargs) - self.channels = channels - self.hidden_size = hidden_size - self.hidden_act = hidden_act - self.kernel_size = kernel_size - self.rms_norm_eps = rms_norm_eps - self.layer_scale_init_value = layer_scale_init_value - self.ffn_expansion = ffn_expansion - self.initializer_range = initializer_range - self.num_filters = num_filters - self.downsampling_ratios = downsampling_ratios - self.depths = depths - self.vae_std = vae_std + channels: int = 1 + hidden_size: int = 64 + kernel_size: int = 7 + rms_norm_eps: float = 1e-5 + layer_scale_init_value: float = 1e-6 + initializer_range: float = 1e-2 + num_filters: int = 32 + downsampling_ratios: list[int] | tuple[int, ...] = (2, 2, 4, 5, 5, 8) + depths: list[int] | tuple[int, ...] 
= (3, 3, 3, 3, 3, 3, 8) + hidden_act: str = "gelu" + ffn_expansion: int = 4 + vae_std: float = 0.625 @property def hop_length(self): @@ -101,8 +87,28 @@ def decoder_config(self): return VibeVoiceAcousticTokenizerDecoderConfig(**config_dict) -@auto_docstring(checkpoint="microsoft/VibeVoice-AcousticTokenizer") +@auto_docstring(checkpoint="microsoft/VibeVoice-1.5B") +@strict(accept_kwargs=True) class VibeVoiceAcousticTokenizerEncoderConfig(VibeVoiceAcousticTokenizerConfig): + r""" + channels (`int`, *optional*, defaults to 1): + Number of input channels. + hidden_size (`int`, *optional*, defaults to 64): + Dimensionality of latent representations. + kernel_size (`int`, *optional*, defaults to 7): + Kernel size for convolutional layers. + num_filters (`int`, *optional*, defaults to 32): + Number of filters in initial convolutional layer, and doubles after each downsampling. + downsampling_ratios (`List[int]`, *optional*, defaults to `[2, 2, 4, 5, 5, 8]`): + Downsampling ratios for each layer. + depths (`List[int]`, *optional*, defaults to `[3, 3, 3, 3, 3, 3, 8]`): + Number of ConvNeXt blocks at each stage. + ffn_expansion (`int`, *optional*, defaults to 4): + Expansion factor for feed-forward networks. + vae_std (`float`, *optional*, defaults to 0.625): + Standard deviation used for VAE sampling after encoder. + """ + model_type = "vibevoice_acoustic_tokenizer_encoder" base_config_key = "encoder_config" @@ -110,13 +116,29 @@ class VibeVoiceAcousticTokenizerEncoderConfig(VibeVoiceAcousticTokenizerConfig): def encoder_config(self): return None - @property - def decoder_config(self): - return None - -@auto_docstring(checkpoint="microsoft/VibeVoice-AcousticTokenizer") +@auto_docstring(checkpoint="microsoft/VibeVoice-1.5B") +@strict(accept_kwargs=True) class VibeVoiceAcousticTokenizerDecoderConfig(VibeVoiceAcousticTokenizerConfig): + r""" + channels (`int`, *optional*, defaults to 1): + Number of input channels. + hidden_size (`int`, *optional*, defaults to 64): + Dimensionality of latent representations. + kernel_size (`int`, *optional*, defaults to 7): + Kernel size for convolutional layers. + num_filters (`int`, *optional*, defaults to 32): + Number of filters in initial convolutional layer, and doubles after each downsampling. + downsampling_ratios (`List[int]`, *optional*, defaults to `[2, 2, 4, 5, 5, 8]`): + Downsampling ratios for each layer. + depths (`List[int]`, *optional*, defaults to `[3, 3, 3, 3, 3, 3, 8]`): + Number of ConvNeXt blocks at each stage. + ffn_expansion (`int`, *optional*, defaults to 4): + Expansion factor for feed-forward networks. + vae_std (`float`, *optional*, defaults to 0.625): + Standard deviation used for VAE sampling after encoder. + """ + model_type = "vibevoice_acoustic_tokenizer_decoder" base_config_key = "decoder_config" diff --git a/src/transformers/models/video_llama_3/configuration_video_llama_3.py b/src/transformers/models/video_llama_3/configuration_video_llama_3.py index a47d8318b467..77333b257461 100644 --- a/src/transformers/models/video_llama_3/configuration_video_llama_3.py +++ b/src/transformers/models/video_llama_3/configuration_video_llama_3.py @@ -17,12 +17,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
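In the VaultGemma configuration hunk above, the `layer_types` default that used to be built in `__init__` (and checked with `layer_type_validation`) now lives in `__post_init__`. The list comprehension alternates sliding and full attention across layers; a standalone sketch of just that default (values taken from the hunk, variable names are illustrative):

```python
num_hidden_layers = 26  # VaultGemma default from the hunk above

# Layers 1, 3, 5, ... (1-based) use sliding attention, layers 2, 4, 6, ... use full attention,
# matching `"sliding_attention" if bool((i + 1) % 2) else "full_attention"`.
layer_types = [
    "sliding_attention" if (i + 1) % 2 else "full_attention"
    for i in range(num_hidden_layers)
]

assert layer_types[:4] == ["sliding_attention", "full_attention", "sliding_attention", "full_attention"]
```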
+from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring from ..auto import CONFIG_MAPPING, AutoConfig @auto_docstring(checkpoint="lkhl/VideoLLaMA3-2B-Image-HF") +@strict(accept_kwargs=True) class VideoLlama3VisionConfig(PreTrainedConfig): r""" Example: @@ -43,75 +46,43 @@ class VideoLlama3VisionConfig(PreTrainedConfig): model_type = "video_llama_3_vision" base_config_key = "vision_config" - def __init__( - self, - hidden_size=768, - intermediate_size=3072, - num_hidden_layers=12, - num_attention_heads=12, - num_channels=3, - patch_size=16, - hidden_act="gelu_pytorch_tanh", - layer_norm_eps=1e-6, - attention_dropout=0.0, - initializer_range=0.02, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_channels = num_channels - self.patch_size = patch_size - self.attention_dropout = attention_dropout - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act - - self.initializer_range = initializer_range + hidden_size: int = 768 + intermediate_size: int = 3072 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + num_channels: int = 3 + patch_size: int | list[int] | tuple[int, int] = 16 + hidden_act: str = "gelu_pytorch_tanh" + layer_norm_eps: float = 1e-6 + attention_dropout: float | int = 0.0 + initializer_range: float = 0.02 @auto_docstring(checkpoint="lkhl/VideoLLaMA3-2B-Image-HF") +@strict(accept_kwargs=True) class VideoLlama3Config(PreTrainedConfig): model_type = "video_llama_3" sub_configs = {"vision_config": VideoLlama3VisionConfig, "text_config": AutoConfig} keys_to_ignore_at_inference = ["past_key_values"] - def __init__( - self, - text_config=None, - vision_config=None, - image_token_id=151655, - video_token_id=151656, - tie_word_embeddings=False, - **kwargs, - ): - if isinstance(vision_config, dict): - self.vision_config = self.sub_configs["vision_config"](**vision_config) - elif isinstance(vision_config, PreTrainedConfig): - self.vision_config = vision_config - elif vision_config is None: + text_config: dict | PreTrainedConfig | None = None + vision_config: dict | PreTrainedConfig | None = None + image_token_id: int = 151655 + video_token_id: int = 151656 + tie_word_embeddings: bool = False + + def __post_init__(self, **kwargs): + if isinstance(self.vision_config, dict): + self.vision_config = self.sub_configs["vision_config"](**self.vision_config) + elif self.vision_config is None: self.vision_config = self.sub_configs["vision_config"]() - else: - raise ValueError( - f"vision_config must be of type `dict` or `PreTrainedConfig`, but got {type(vision_config)}." 
- ) - - if isinstance(text_config, dict): - self.text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) - elif isinstance(text_config, PreTrainedConfig): - self.text_config = text_config - elif text_config is None: - self.text_config = CONFIG_MAPPING["qwen2"]() - else: - raise ValueError(f"text_config must be of type `dict` or `PreTrainedConfig`, but got {type(text_config)}.") - self.image_token_id = image_token_id - self.video_token_id = video_token_id - self.tie_word_embeddings = tie_word_embeddings + if isinstance(self.text_config, dict): + self.text_config = CONFIG_MAPPING[self.text_config["model_type"]](**self.text_config) + elif self.text_config is None: + self.text_config = CONFIG_MAPPING["qwen2"]() - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["VideoLlama3VisionConfig", "VideoLlama3Config"] diff --git a/src/transformers/models/video_llama_3/modular_video_llama_3.py b/src/transformers/models/video_llama_3/modular_video_llama_3.py index ced078d14b4d..d97bf605e7ce 100644 --- a/src/transformers/models/video_llama_3/modular_video_llama_3.py +++ b/src/transformers/models/video_llama_3/modular_video_llama_3.py @@ -19,6 +19,7 @@ import torch import torch.nn as nn import torch.nn.functional as F +from huggingface_hub.dataclasses import strict from torch.nn import LayerNorm from ... import initialization as init @@ -90,81 +91,39 @@ @auto_docstring(checkpoint="lkhl/VideoLLaMA3-2B-Image-HF") +@strict(accept_kwargs=True) class VideoLlama3VisionConfig(SiglipVisionConfig): model_type = "video_llama_3_vision" base_config_key = "vision_config" - - def __init__( - self, - hidden_size=768, - intermediate_size=3072, - num_hidden_layers=12, - num_attention_heads=12, - num_channels=3, - patch_size=16, - hidden_act="gelu_pytorch_tanh", - layer_norm_eps=1e-6, - attention_dropout=0.0, - initializer_range=0.02, - **kwargs, - ): - super().__init__( - hidden_size=hidden_size, - intermediate_size=intermediate_size, - num_hidden_layers=num_hidden_layers, - num_attention_heads=num_attention_heads, - num_channels=num_channels, - patch_size=patch_size, - hidden_act=hidden_act, - layer_norm_eps=layer_norm_eps, - attention_dropout=attention_dropout, - **kwargs, - ) - - self.initializer_range = initializer_range - del self.image_size + image_size = AttributeError() + initializer_range: float = 0.02 @auto_docstring(checkpoint="lkhl/VideoLLaMA3-2B-Image-HF") +@strict(accept_kwargs=True) class VideoLlama3Config(PreTrainedConfig): model_type = "video_llama_3" sub_configs = {"vision_config": VideoLlama3VisionConfig, "text_config": AutoConfig} keys_to_ignore_at_inference = ["past_key_values"] - def __init__( - self, - text_config=None, - vision_config=None, - image_token_id=151655, - video_token_id=151656, - tie_word_embeddings=False, - **kwargs, - ): - if isinstance(vision_config, dict): - self.vision_config = self.sub_configs["vision_config"](**vision_config) - elif isinstance(vision_config, PreTrainedConfig): - self.vision_config = vision_config - elif vision_config is None: + text_config: dict | PreTrainedConfig | None = None + vision_config: dict | PreTrainedConfig | None = None + image_token_id: int = 151655 + video_token_id: int = 151656 + tie_word_embeddings: bool = False + + def __post_init__(self, **kwargs): + if isinstance(self.vision_config, dict): + self.vision_config = self.sub_configs["vision_config"](**self.vision_config) + elif self.vision_config is None: self.vision_config = self.sub_configs["vision_config"]() - else: - raise ValueError( - f"vision_config must 
be of type `dict` or `PreTrainedConfig`, but got {type(vision_config)}." - ) - if isinstance(text_config, dict): - self.text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) - elif isinstance(text_config, PreTrainedConfig): - self.text_config = text_config - elif text_config is None: + if isinstance(self.text_config, dict): + self.text_config = CONFIG_MAPPING[self.text_config["model_type"]](**self.text_config) + elif self.text_config is None: self.text_config = CONFIG_MAPPING["qwen2"]() - else: - raise ValueError(f"text_config must be of type `dict` or `PreTrainedConfig`, but got {type(text_config)}.") - - self.image_token_id = image_token_id - self.video_token_id = video_token_id - self.tie_word_embeddings = tie_word_embeddings - super().__init__(**kwargs) + super().__post_init__(**kwargs) class VideoLlama3VisionRotaryEmbedding(VisionRotaryEmbedding): diff --git a/src/transformers/models/video_llava/configuration_video_llava.py b/src/transformers/models/video_llava/configuration_video_llava.py index adef83b07f38..c3146ff0bfa1 100644 --- a/src/transformers/models/video_llava/configuration_video_llava.py +++ b/src/transformers/models/video_llava/configuration_video_llava.py @@ -12,6 +12,10 @@ # limitations under the License. """VideoLlava model configuration""" +from typing import Literal + +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring, logging from ..auto import CONFIG_MAPPING, AutoConfig @@ -21,6 +25,7 @@ @auto_docstring(checkpoint="LanguageBind/Video-LLaVA-7B-hf") +@strict(accept_kwargs=True) class VideoLlavaConfig(PreTrainedConfig): r""" Example: @@ -51,39 +56,25 @@ class VideoLlavaConfig(PreTrainedConfig): } sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} - def __init__( - self, - vision_config=None, - text_config=None, - image_token_index=32000, - video_token_index=32001, - projector_hidden_act="gelu", - vision_feature_select_strategy="default", - vision_feature_layer=-2, - image_seq_length=256, - video_seq_length=2056, - multimodal_projector_bias=True, - tie_word_embeddings=False, - **kwargs, - ): - self.image_token_index = image_token_index - self.video_token_index = video_token_index - self.projector_hidden_act = projector_hidden_act - self.vision_feature_select_strategy = vision_feature_select_strategy - self.vision_feature_layer = vision_feature_layer - self.image_seq_length = image_seq_length - self.video_seq_length = video_seq_length - self.multimodal_projector_bias = multimodal_projector_bias - self.tie_word_embeddings = tie_word_embeddings - - self.vision_config = vision_config - + vision_config: dict | PreTrainedConfig | None = None + text_config: dict | PreTrainedConfig | None = None + image_token_index: int = 32000 + video_token_index: int = 32001 + projector_hidden_act: str = "gelu" + vision_feature_select_strategy: Literal["default", "full"] = "default" + vision_feature_layer: int | list[int] = -2 + image_seq_length: int = 256 + video_seq_length: int = 2056 + multimodal_projector_bias: bool = True + tie_word_embeddings: bool = False + + def __post_init__(self, **kwargs): if isinstance(self.vision_config, dict): - if "model_type" not in vision_config: - vision_config["model_type"] = "clip_vision_model" + if "model_type" not in self.vision_config: + self.vision_config["model_type"] = "clip_vision_model" logger.warning("Key=`model_type` not found in vision config, setting it to `clip_vision_model`") - self.vision_config = 
CONFIG_MAPPING[vision_config["model_type"]](**vision_config) - elif vision_config is None: + self.vision_config = CONFIG_MAPPING[self.vision_config["model_type"]](**self.vision_config) + elif self.vision_config is None: self.vision_config = CONFIG_MAPPING["clip_vision_model"]( intermediate_size=4096, hidden_size=1024, @@ -95,16 +86,15 @@ def __init__( projection_dim=768, ) - if isinstance(text_config, dict): - if "model_type" not in text_config: - text_config["model_type"] = "llama" + if isinstance(self.text_config, dict): + if "model_type" not in self.text_config: + self.text_config["model_type"] = "llama" logger.warning("Key=`model_type` not found in text config, setting it to `llama`") - text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) - elif text_config is None: - text_config = CONFIG_MAPPING["llama"]() + self.text_config = CONFIG_MAPPING[self.text_config["model_type"]](**self.text_config) + elif self.text_config is None: + self.text_config = CONFIG_MAPPING["llama"]() - self.text_config = text_config - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["VideoLlavaConfig"] diff --git a/src/transformers/models/video_llava/modeling_video_llava.py b/src/transformers/models/video_llava/modeling_video_llava.py index 65b7bef90d8c..3d4ea827f185 100644 --- a/src/transformers/models/video_llava/modeling_video_llava.py +++ b/src/transformers/models/video_llava/modeling_video_llava.py @@ -178,7 +178,7 @@ def set_input_embeddings(self, value): def get_image_features( self, pixel_values_images: torch.FloatTensor, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | list[int] | None = None, vision_feature_select_strategy: str | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], @@ -227,7 +227,7 @@ def get_image_features( def get_video_features( self, pixel_values_videos: torch.FloatTensor, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | list[int] | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple | BaseModelOutputWithPooling: @@ -315,7 +315,7 @@ def forward( position_ids: torch.LongTensor | None = None, past_key_values: Cache | None = None, inputs_embeds: torch.FloatTensor | None = None, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | list[int] | None = None, vision_feature_select_strategy: str | None = None, use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, @@ -412,7 +412,7 @@ def get_output_embeddings(self) -> nn.Module: def get_image_features( self, pixel_values_images: torch.FloatTensor, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | list[int] | None = None, vision_feature_select_strategy: str | None = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple | BaseModelOutputWithPooling: @@ -446,7 +446,7 @@ def forward( position_ids: torch.LongTensor | None = None, past_key_values: Cache | None = None, inputs_embeds: torch.FloatTensor | None = None, - vision_feature_layer: int | list[int] | None = None, + vision_feature_layer: int | list[int] | list[int] | None = None, vision_feature_select_strategy: str | None = None, labels: torch.LongTensor | None = None, use_cache: bool | None = None, diff --git a/src/transformers/models/videomae/configuration_videomae.py b/src/transformers/models/videomae/configuration_videomae.py index 
baf8547eadd5..4ea7c8e35c95 100644 --- a/src/transformers/models/videomae/configuration_videomae.py +++ b/src/transformers/models/videomae/configuration_videomae.py @@ -13,14 +13,14 @@ # limitations under the License. """VideoMAE model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="MCG-NJU/videomae-base") +@strict(accept_kwargs=True) class VideoMAEConfig(PreTrainedConfig): r""" num_frames (`int`, *optional*, defaults to 16): @@ -57,56 +57,27 @@ class VideoMAEConfig(PreTrainedConfig): model_type = "videomae" - def __init__( - self, - image_size=224, - patch_size=16, - num_channels=3, - num_frames=16, - tubelet_size=2, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - initializer_range=0.02, - layer_norm_eps=1e-12, - qkv_bias=True, - use_mean_pooling=True, - decoder_num_attention_heads=6, - decoder_hidden_size=384, - decoder_num_hidden_layers=4, - decoder_intermediate_size=1536, - norm_pix_loss=True, - **kwargs, - ): - super().__init__(**kwargs) - - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.num_frames = num_frames - self.tubelet_size = tubelet_size - - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.qkv_bias = qkv_bias - self.use_mean_pooling = use_mean_pooling - - self.decoder_num_attention_heads = decoder_num_attention_heads - self.decoder_hidden_size = decoder_hidden_size - self.decoder_num_hidden_layers = decoder_num_hidden_layers - self.decoder_intermediate_size = decoder_intermediate_size - self.norm_pix_loss = norm_pix_loss + image_size: int | list[int] | tuple[int, int] = 224 + patch_size: int | list[int] | tuple[int, int] = 16 + num_channels: int = 3 + num_frames: int = 16 + tubelet_size: int = 2 + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.0 + attention_probs_dropout_prob: float = 0.0 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + qkv_bias: bool = True + use_mean_pooling: bool = True + decoder_num_attention_heads: int = 6 + decoder_hidden_size: int = 384 + decoder_num_hidden_layers: int = 4 + decoder_intermediate_size: int = 1536 + norm_pix_loss: bool = True __all__ = ["VideoMAEConfig"] diff --git a/src/transformers/models/vilt/configuration_vilt.py b/src/transformers/models/vilt/configuration_vilt.py index d8f5feb7cc71..359213b3cc71 100644 --- a/src/transformers/models/vilt/configuration_vilt.py +++ b/src/transformers/models/vilt/configuration_vilt.py @@ -13,14 +13,14 @@ # limitations under the License. 
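For configs like VideoMAE that need no derived state, the migration above is purely declarative: every former `__init__` parameter becomes an annotated class attribute with the same default, and no `__post_init__` is needed. A hedged usage sketch, assuming a transformers build that includes this patch (the keyword arguments are simply fields declared in the hunk):

```python
from transformers import VideoMAEConfig

# Defaults now come from the annotated class attributes.
config = VideoMAEConfig()
assert config.num_frames == 16 and config.tubelet_size == 2

# Overrides are still plain keyword arguments, exactly as with the old __init__.
small = VideoMAEConfig(hidden_size=384, num_hidden_layers=6, decoder_hidden_size=192)
print(small.hidden_size, small.num_hidden_layers, small.decoder_hidden_size)
```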
"""VilT model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="dandelin/vilt-b32-mlm") +@strict(accept_kwargs=True) class ViltConfig(PreTrainedConfig): r""" modality_type_vocab_size (`int`, *optional*, defaults to 2): @@ -51,57 +51,32 @@ class ViltConfig(PreTrainedConfig): model_type = "vilt" - def __init__( - self, - vocab_size=30522, - type_vocab_size=2, - modality_type_vocab_size=2, - max_position_embeddings=40, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - initializer_range=0.02, - layer_norm_eps=1e-12, - image_size=384, - patch_size=32, - num_channels=3, - qkv_bias=True, - max_image_length=-1, - tie_word_embeddings=True, - num_images=-1, - pad_token_id=None, - **kwargs, - ): - super().__init__(**kwargs) - - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.vocab_size = vocab_size - self.type_vocab_size = type_vocab_size - self.modality_type_vocab_size = modality_type_vocab_size - self.max_position_embeddings = max_position_embeddings - - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.qkv_bias = qkv_bias - self.max_image_length = max_image_length - self.num_images = num_images + vocab_size: int = 30522 + type_vocab_size: int = 2 + modality_type_vocab_size: int = 2 + max_position_embeddings: int = 40 + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.0 + attention_probs_dropout_prob: float = 0.0 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + image_size: int | list[int] | tuple[int, int] = 384 + patch_size: int | list[int] | tuple[int, int] = 32 + num_channels: int = 3 + qkv_bias: bool = True + max_image_length: int = -1 + tie_word_embeddings: bool = True + num_images: int = -1 + pad_token_id: int | None = None + + def __post_init__(self, **kwargs): + kwargs.pop("tie_word_embeddings", None) self.tie_word_embeddings = True # force it + super().__post_init__(**kwargs) __all__ = ["ViltConfig"] diff --git a/src/transformers/models/vilt/modeling_vilt.py b/src/transformers/models/vilt/modeling_vilt.py index bf3a6b463675..02b9a9f3f2eb 100755 --- a/src/transformers/models/vilt/modeling_vilt.py +++ b/src/transformers/models/vilt/modeling_vilt.py @@ -594,7 +594,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify 
both input_ids and inputs_embeds at the same time") @@ -776,7 +776,7 @@ def forward( >>> print(output) a bunch of cats laying on a couch. ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.vilt( input_ids, @@ -922,7 +922,7 @@ def forward( >>> print("Predicted answer:", model.config.id2label[idx]) Predicted answer: 2 ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.vilt( input_ids, @@ -1025,7 +1025,7 @@ def forward( ... outputs = model(**encoding) ... scores[text] = outputs.logits[0, :].item() ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict loss = None if labels is not None: @@ -1142,7 +1142,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if pixel_values is not None and pixel_values.ndim == 4: # add dummy num_images dimension @@ -1244,7 +1244,7 @@ def forward( Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.vilt( input_ids, diff --git a/src/transformers/models/vipllava/configuration_vipllava.py b/src/transformers/models/vipllava/configuration_vipllava.py index d301ebb53c94..ffc68c4a3a75 100644 --- a/src/transformers/models/vipllava/configuration_vipllava.py +++ b/src/transformers/models/vipllava/configuration_vipllava.py @@ -12,15 +12,15 @@ # limitations under the License. 
"""VipLlava model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging +from ...utils import auto_docstring from ..auto import CONFIG_MAPPING, AutoConfig -logger = logging.get_logger(__name__) - - @auto_docstring(checkpoint="llava-hf/vip-llava-7b-hf") +@strict(accept_kwargs=True) class VipLlavaConfig(PreTrainedConfig): r""" projector_layernorm_eps (`float`, *optional*, defaults to 1e-05): @@ -55,30 +55,20 @@ class VipLlavaConfig(PreTrainedConfig): } sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} - def __init__( - self, - vision_config=None, - text_config=None, - image_token_index=32000, - projector_hidden_act="gelu", - projector_layernorm_eps=1e-5, - vision_feature_layers=[-2, -5, -8, -11, 6], - image_seq_length=576, - tie_word_embeddings=False, - **kwargs, - ): - self.image_token_index = image_token_index - self.projector_hidden_act = projector_hidden_act - self.projector_layernorm_eps = projector_layernorm_eps - self.vision_feature_layers = vision_feature_layers - self.image_seq_length = image_seq_length - self.vision_config = vision_config - self.tie_word_embeddings = tie_word_embeddings + vision_config: dict | PreTrainedConfig | None = None + text_config: dict | PreTrainedConfig | None = None + image_token_index: int = 32000 + projector_hidden_act: str = "gelu" + projector_layernorm_eps: float = 1e-5 + vision_feature_layers: int | list[int] | tuple[int, ...] = (-2, -5, -8, -11, 6) + image_seq_length: int = 576 + tie_word_embeddings: bool = False + def __post_init__(self, **kwargs): if isinstance(self.vision_config, dict): - vision_config["model_type"] = vision_config.get("model_type", "clip_vision_model") - self.vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) - elif vision_config is None: + self.vision_config["model_type"] = self.vision_config.get("model_type", "clip_vision_model") + self.vision_config = CONFIG_MAPPING[self.vision_config["model_type"]](**self.vision_config) + elif self.vision_config is None: self.vision_config = CONFIG_MAPPING["clip_vision_model"]( intermediate_size=4096, hidden_size=1024, @@ -90,15 +80,13 @@ def __init__( projection_dim=768, ) - if isinstance(text_config, dict): - text_config["model_type"] = text_config.get("model_type", "llama") - text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) - elif text_config is None: - text_config = CONFIG_MAPPING["llama"]() - - self.text_config = text_config + if isinstance(self.text_config, dict): + self.text_config["model_type"] = self.text_config.get("model_type", "llama") + self.text_config = CONFIG_MAPPING[self.text_config["model_type"]](**self.text_config) + elif self.text_config is None: + self.text_config = CONFIG_MAPPING["llama"]() - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["VipLlavaConfig"] diff --git a/src/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py index f3c234e9fe2e..501dcbe1a723 100644 --- a/src/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py +++ b/src/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py @@ -13,6 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+ +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring, logging from ..auto.configuration_auto import AutoConfig @@ -22,6 +25,7 @@ @auto_docstring +@strict(accept_kwargs=True) class VisionEncoderDecoderConfig(PreTrainedConfig): r""" kwargs (*optional*): @@ -64,8 +68,9 @@ class VisionEncoderDecoderConfig(PreTrainedConfig): sub_configs = {"encoder": AutoConfig, "decoder": AutoConfig} has_no_defaults_at_init = True - def __init__(self, **kwargs): - super().__init__(**kwargs) + is_encoder_decoder: bool = True + + def __post_init__(self, **kwargs): if "encoder" not in kwargs or "decoder" not in kwargs: raise ValueError( f"A configuration of type {self.model_type} cannot be instantiated because " @@ -79,7 +84,7 @@ def __init__(self, **kwargs): self.encoder = AutoConfig.for_model(encoder_model_type, **encoder_config) self.decoder = AutoConfig.for_model(decoder_model_type, **decoder_config) - self.is_encoder_decoder = True + super().__post_init__(**kwargs) @classmethod def from_encoder_decoder_configs( diff --git a/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py index ac836c9856c6..44cb0f8ea0c6 100644 --- a/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +++ b/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py @@ -373,7 +373,7 @@ def forward( >>> generated_ids = model.generate(pixel_values) >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict kwargs_encoder = {argument: value for argument, value in kwargs.items() if not argument.startswith("decoder_")} diff --git a/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py index e21a261fa113..392fec6cb7a3 100644 --- a/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py +++ b/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py @@ -13,16 +13,16 @@ # limitations under the License. 
"""VisionTextDualEncoder model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging +from ...utils import auto_docstring from ..auto.configuration_auto import AutoConfig from ..chinese_clip.configuration_chinese_clip import ChineseCLIPVisionConfig from ..clip.configuration_clip import CLIPVisionConfig from ..siglip.configuration_siglip import SiglipVisionConfig -logger = logging.get_logger(__name__) - VISION_MODEL_CONFIGS = { "clip_vision_model": CLIPVisionConfig, "chinese_clip_vision_model": ChineseCLIPVisionConfig, @@ -31,6 +31,7 @@ @auto_docstring +@strict(accept_kwargs=True) class VisionTextDualEncoderConfig(PreTrainedConfig): r""" Examples: @@ -63,15 +64,15 @@ class VisionTextDualEncoderConfig(PreTrainedConfig): sub_configs = {"vision_config": AutoConfig, "text_config": AutoConfig} has_no_defaults_at_init = True - def __init__(self, projection_dim=512, logit_scale_init_value=2.6592, **kwargs): - super().__init__(**kwargs) - - if "vision_config" not in kwargs: - raise ValueError("`vision_config` can not be `None`.") - - if "text_config" not in kwargs: - raise ValueError("`text_config` can not be `None`.") + projection_dim: int = 512 + logit_scale_init_value: int | float = 2.6592 + def __post_init__(self, **kwargs): + if "vision_config" not in kwargs or "text_config" not in kwargs: + raise ValueError( + f"A configuration of type {self.model_type} cannot be instantiated because not both `vision_config` and" + f" `text_config` sub-configurations are passed, but only {kwargs}" + ) vision_config = kwargs.pop("vision_config") text_config = kwargs.pop("text_config") @@ -85,11 +86,9 @@ def __init__(self, projection_dim=512, logit_scale_init_value=2.6592, **kwargs): self.vision_config = AutoConfig.for_model(vision_model_type, **vision_config) if hasattr(self.vision_config, "vision_config"): self.vision_config = self.vision_config.vision_config - self.text_config = AutoConfig.for_model(text_model_type, **text_config) - self.projection_dim = projection_dim - self.logit_scale_init_value = logit_scale_init_value + super().__post_init__(**kwargs) @classmethod def from_vision_text_configs(cls, vision_config: PreTrainedConfig, text_config: PreTrainedConfig, **kwargs): diff --git a/src/transformers/models/visual_bert/configuration_visual_bert.py b/src/transformers/models/visual_bert/configuration_visual_bert.py index 4576dae9b596..66cc58d11a3a 100644 --- a/src/transformers/models/visual_bert/configuration_visual_bert.py +++ b/src/transformers/models/visual_bert/configuration_visual_bert.py @@ -13,14 +13,14 @@ # limitations under the License. 
"""VisualBERT model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="uclanlp/visualbert-vqa-coco-pre") +@strict(accept_kwargs=True) class VisualBertConfig(PreTrainedConfig): r""" visual_embedding_dim (`int`, *optional*, defaults to 512): @@ -51,50 +51,25 @@ class VisualBertConfig(PreTrainedConfig): model_type = "visual_bert" - def __init__( - self, - vocab_size=30522, - hidden_size=768, - visual_embedding_dim=512, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=2, - initializer_range=0.02, - layer_norm_eps=1e-12, - bypass_transformer=False, - special_visual_initialize=True, - pad_token_id=1, - bos_token_id=0, - eos_token_id=2, - tie_word_embeddings=True, - **kwargs, - ): - super().__init__(**kwargs) - - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.visual_embedding_dim = visual_embedding_dim - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.initializer_range = initializer_range - self.type_vocab_size = type_vocab_size - self.layer_norm_eps = layer_norm_eps - self.bypass_transformer = bypass_transformer - self.special_visual_initialize = special_visual_initialize + vocab_size: int = 30522 + hidden_size: int = 768 + visual_embedding_dim: int = 512 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.1 + attention_probs_dropout_prob: float = 0.1 + max_position_embeddings: int = 512 + type_vocab_size: int = 2 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + bypass_transformer: bool = False + special_visual_initialize: bool = True + pad_token_id: int | None = 1 + bos_token_id: int | None = 0 + eos_token_id: int | None = 2 + tie_word_embeddings: bool = True __all__ = ["VisualBertConfig"] diff --git a/src/transformers/models/visual_bert/modeling_visual_bert.py b/src/transformers/models/visual_bert/modeling_visual_bert.py index eae34f165bd0..03f1abe59093 100755 --- a/src/transformers/models/visual_bert/modeling_visual_bert.py +++ b/src/transformers/models/visual_bert/modeling_visual_bert.py @@ -602,7 +602,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -799,7 +799,7 @@ def forward( prediction_logits = outputs.prediction_logits seq_relationship_logits = 
outputs.seq_relationship_logits ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if labels is not None: total_size = attention_mask.size(-1) + visual_attention_mask.size(-1) @@ -965,7 +965,7 @@ def forward( loss = outputs.loss logits = outputs.logits ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None @@ -1119,7 +1119,7 @@ def forward( loss = outputs.loss scores = outputs.logits ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict # Get the index of the last text token index_to_gather = attention_mask.sum(1) - 2 # as in original code @@ -1256,7 +1256,7 @@ def forward( loss = outputs.loss scores = outputs.logits ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.visual_bert( input_ids, @@ -1439,7 +1439,7 @@ def forward( if region_to_phrase_position is None: raise ValueError("`region_to_phrase_position` should not be None when using Flickr Model.") - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.visual_bert( input_ids, diff --git a/src/transformers/models/vit/configuration_vit.py b/src/transformers/models/vit/configuration_vit.py index c41e96beba41..98138d8ac1db 100644 --- a/src/transformers/models/vit/configuration_vit.py +++ b/src/transformers/models/vit/configuration_vit.py @@ -13,14 +13,14 @@ # limitations under the License. 
"""ViT model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="google/vit-base-patch16-224") +@strict(accept_kwargs=True) class ViTConfig(PreTrainedConfig): r""" encoder_stride (`int`, *optional*, defaults to 16): @@ -47,44 +47,26 @@ class ViTConfig(PreTrainedConfig): model_type = "vit" - def __init__( - self, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - initializer_range=0.02, - layer_norm_eps=1e-12, - image_size=224, - patch_size=16, - num_channels=3, - qkv_bias=True, - encoder_stride=16, - pooler_output_size=None, - pooler_act="tanh", - **kwargs, - ): - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.qkv_bias = qkv_bias - self.encoder_stride = encoder_stride - self.pooler_output_size = pooler_output_size if pooler_output_size else hidden_size - self.pooler_act = pooler_act + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.0 + attention_probs_dropout_prob: float = 0.0 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + image_size: int | list[int] | tuple[int, int] = 224 + patch_size: int | list[int] | tuple[int, int] = 16 + num_channels: int = 3 + qkv_bias: bool = True + encoder_stride: int = 16 + pooler_output_size: int | None = None + pooler_act: str = "tanh" - super().__init__(**kwargs) + def __post_init__(self, **kwargs): + self.pooler_output_size = self.pooler_output_size if self.pooler_output_size else self.hidden_size + super().__post_init__(**kwargs) __all__ = ["ViTConfig"] diff --git a/src/transformers/models/vit_mae/configuration_vit_mae.py b/src/transformers/models/vit_mae/configuration_vit_mae.py index b104993d4c40..9cb41f32ac13 100644 --- a/src/transformers/models/vit_mae/configuration_vit_mae.py +++ b/src/transformers/models/vit_mae/configuration_vit_mae.py @@ -13,14 +13,14 @@ # limitations under the License. 
"""ViT MAE model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="facebook/vit-mae-base") +@strict(accept_kwargs=True) class ViTMAEConfig(PreTrainedConfig): r""" mask_ratio (`float`, *optional*, defaults to 0.75): @@ -48,50 +48,25 @@ class ViTMAEConfig(PreTrainedConfig): model_type = "vit_mae" - def __init__( - self, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - initializer_range=0.02, - layer_norm_eps=1e-12, - image_size=224, - patch_size=16, - num_channels=3, - qkv_bias=True, - decoder_num_attention_heads=16, - decoder_hidden_size=512, - decoder_num_hidden_layers=8, - decoder_intermediate_size=2048, - mask_ratio=0.75, - norm_pix_loss=False, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.qkv_bias = qkv_bias - self.decoder_num_attention_heads = decoder_num_attention_heads - self.decoder_hidden_size = decoder_hidden_size - self.decoder_num_hidden_layers = decoder_num_hidden_layers - self.decoder_intermediate_size = decoder_intermediate_size - self.mask_ratio = mask_ratio - self.norm_pix_loss = norm_pix_loss + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.0 + attention_probs_dropout_prob: float = 0.0 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + image_size: int | list[int] | tuple[int, int] = 224 + patch_size: int | list[int] | tuple[int, int] = 16 + num_channels: int = 3 + qkv_bias: bool = True + decoder_num_attention_heads: int = 16 + decoder_hidden_size: int = 512 + decoder_num_hidden_layers: int = 8 + decoder_intermediate_size: int = 2048 + mask_ratio: float = 0.75 + norm_pix_loss: bool = False __all__ = ["ViTMAEConfig"] diff --git a/src/transformers/models/vit_msn/configuration_vit_msn.py b/src/transformers/models/vit_msn/configuration_vit_msn.py index 7a10fe439c9c..10def8c79029 100644 --- a/src/transformers/models/vit_msn/configuration_vit_msn.py +++ b/src/transformers/models/vit_msn/configuration_vit_msn.py @@ -13,14 +13,14 @@ # limitations under the License. 
"""ViT MSN model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="facebook/vit_msn_base") +@strict(accept_kwargs=True) class ViTMSNConfig(PreTrainedConfig): r""" Example: @@ -40,38 +40,19 @@ class ViTMSNConfig(PreTrainedConfig): model_type = "vit_msn" - def __init__( - self, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - initializer_range=0.02, - layer_norm_eps=1e-06, - image_size=224, - patch_size=16, - num_channels=3, - qkv_bias=True, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.qkv_bias = qkv_bias + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.0 + attention_probs_dropout_prob: float = 0.0 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-06 + image_size: int | list[int] | tuple[int, int] = 224 + patch_size: int | list[int] | tuple[int, int] = 16 + num_channels: int = 3 + qkv_bias: bool = True __all__ = ["ViTMSNConfig"] diff --git a/src/transformers/models/vitdet/configuration_vitdet.py b/src/transformers/models/vitdet/configuration_vitdet.py index eabb6443a780..8fb95ec809b7 100644 --- a/src/transformers/models/vitdet/configuration_vitdet.py +++ b/src/transformers/models/vitdet/configuration_vitdet.py @@ -13,15 +13,15 @@ # limitations under the License. 
"""VitDet model configuration""" +from huggingface_hub.dataclasses import strict + from ...backbone_utils import BackboneConfigMixin from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - - -logger = logging.get_logger(__name__) +from ...utils import auto_docstring @auto_docstring(checkpoint="google/vitdet-base-patch16-224") +@strict(accept_kwargs=True) class VitDetConfig(BackboneConfigMixin, PreTrainedConfig): r""" pretrain_image_size (`int`, *optional*, defaults to 224): @@ -51,55 +51,34 @@ class VitDetConfig(BackboneConfigMixin, PreTrainedConfig): model_type = "vitdet" - def __init__( - self, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - mlp_ratio=4, - hidden_act="gelu", - dropout_prob=0.0, - initializer_range=0.02, - layer_norm_eps=1e-6, - image_size=224, - pretrain_image_size=224, - patch_size=16, - num_channels=3, - qkv_bias=True, - drop_path_rate=0.0, - window_block_indices=[], - residual_block_indices=[], - use_absolute_position_embeddings=True, - use_relative_position_embeddings=False, - window_size=0, - out_features=None, - out_indices=None, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.mlp_ratio = mlp_ratio - self.hidden_act = hidden_act - self.dropout_prob = dropout_prob - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.image_size = image_size - self.pretrain_image_size = pretrain_image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.qkv_bias = qkv_bias - self.drop_path_rate = drop_path_rate - self.window_block_indices = window_block_indices - self.residual_block_indices = residual_block_indices - self.use_absolute_position_embeddings = use_absolute_position_embeddings - self.use_relative_position_embeddings = use_relative_position_embeddings - self.window_size = window_size + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + mlp_ratio: int = 4 + hidden_act: str = "gelu" + dropout_prob: float = 0.0 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-6 + image_size: int | list[int] | tuple[int, int] = 224 + pretrain_image_size: int | list[int] | tuple[int, int] = 224 + patch_size: int | list[int] | tuple[int, int] = 16 + num_channels: int = 3 + qkv_bias: bool = True + drop_path_rate: float = 0.0 + window_block_indices: list[int] | tuple[int, ...] = () + residual_block_indices: list[int] | tuple[int, ...] 
= () + use_absolute_position_embeddings: bool = True + use_relative_position_embeddings: bool = False + window_size: int = 0 + _out_features: list[str] | None = None + _out_indices: list[int] | None = None + def __post_init__(self, **kwargs): self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, self.num_hidden_layers + 1)] - self.set_output_features_output_indices(out_indices=out_indices, out_features=out_features) + self.set_output_features_output_indices( + out_indices=kwargs.pop("out_indices", None), out_features=kwargs.pop("out_features", None) + ) + super().__post_init__(**kwargs) __all__ = ["VitDetConfig"] diff --git a/src/transformers/models/vitdet/modeling_vitdet.py b/src/transformers/models/vitdet/modeling_vitdet.py index a7b0f2841f04..d46d700a9693 100644 --- a/src/transformers/models/vitdet/modeling_vitdet.py +++ b/src/transformers/models/vitdet/modeling_vitdet.py @@ -654,7 +654,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -728,7 +728,7 @@ def forward( >>> list(feature_maps[-1].shape) [1, 768, 14, 14] ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) diff --git a/src/transformers/models/vitmatte/configuration_vitmatte.py b/src/transformers/models/vitmatte/configuration_vitmatte.py index c287d7952bf0..afdabad7303a 100644 --- a/src/transformers/models/vitmatte/configuration_vitmatte.py +++ b/src/transformers/models/vitmatte/configuration_vitmatte.py @@ -13,16 +13,16 @@ # limitations under the License. """VitMatte model configuration""" +from huggingface_hub.dataclasses import strict + from ...backbone_utils import consolidate_backbone_kwargs_to_config from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging +from ...utils import auto_docstring from ..auto.configuration_auto import AutoConfig -logger = logging.get_logger(__name__) - - @auto_docstring(checkpoint="hustvl/vitmatte-small-composition-1k") +@strict(accept_kwargs=True) class VitMatteConfig(PreTrainedConfig): r""" batch_norm_eps (`float`, *optional*, defaults to 1e-05): @@ -50,31 +50,21 @@ class VitMatteConfig(PreTrainedConfig): model_type = "vitmatte" sub_configs = {"backbone_config": AutoConfig} - def __init__( - self, - backbone_config: PreTrainedConfig | None = None, - hidden_size: int = 384, - batch_norm_eps: float = 1e-5, - initializer_range: float = 0.02, - convstream_hidden_sizes: list[int] = [48, 96, 192], - fusion_hidden_sizes: list[int] = [256, 128, 64, 32], - **kwargs, - ): - backbone_config, kwargs = consolidate_backbone_kwargs_to_config( - backbone_config=backbone_config, + backbone_config: dict | PreTrainedConfig | None = None + hidden_size: int = 384 + batch_norm_eps: float = 1e-5 + initializer_range: float = 0.02 + convstream_hidden_sizes: list[int] | tuple[int, ...] = (48, 96, 192) + fusion_hidden_sizes: list[int] | tuple[int, ...] 
= (256, 128, 64, 32) + + def __post_init__(self, **kwargs): + self.backbone_config, kwargs = consolidate_backbone_kwargs_to_config( + backbone_config=self.backbone_config, default_config_type="vitdet", default_config_kwargs={"out_features": ["stage4"]}, **kwargs, ) - - self.backbone_config = backbone_config - self.batch_norm_eps = batch_norm_eps - self.hidden_size = hidden_size - self.initializer_range = initializer_range - self.convstream_hidden_sizes = convstream_hidden_sizes - self.fusion_hidden_sizes = fusion_hidden_sizes - - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["VitMatteConfig"] diff --git a/src/transformers/models/vitmatte/modeling_vitmatte.py b/src/transformers/models/vitmatte/modeling_vitmatte.py index 658d90e8aa85..63b6e33e0f7a 100644 --- a/src/transformers/models/vitmatte/modeling_vitmatte.py +++ b/src/transformers/models/vitmatte/modeling_vitmatte.py @@ -109,7 +109,7 @@ def __init__(self, config): if config.backbone_config is not None: in_channels = config.backbone_config.num_channels - out_channels = config.convstream_hidden_sizes + out_channels = list(config.convstream_hidden_sizes) self.convs = nn.ModuleList() self.conv_chans = [in_channels] + out_channels @@ -270,7 +270,7 @@ def forward( >>> print(alphas.shape) torch.Size([1, 1, 640, 960]) ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) diff --git a/src/transformers/models/vitpose/configuration_vitpose.py b/src/transformers/models/vitpose/configuration_vitpose.py index 411c4ccc45a2..d4a0191793ba 100644 --- a/src/transformers/models/vitpose/configuration_vitpose.py +++ b/src/transformers/models/vitpose/configuration_vitpose.py @@ -13,16 +13,16 @@ # limitations under the License. 
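In the VitMatte conversion above, `consolidate_backbone_kwargs_to_config` now runs inside `__post_init__`, so even an empty call yields a fully resolved VitDet backbone limited to stage4; the companion modeling change wraps the tuple default in `list(...)`. A sketch of what that looks like to a caller (illustrative; `out_features` is the usual BackboneConfigMixin property):

from transformers import VitMatteConfig

cfg = VitMatteConfig()
print(cfg.backbone_config.model_type)    # vitdet, the default_config_type above
print(cfg.backbone_config.out_features)  # ['stage4'], the default_config_kwargs above
print(cfg.convstream_hidden_sizes)       # (48, 96, 192), now a tuple default
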
"""VitPose model configuration""" +from huggingface_hub.dataclasses import strict + from ...backbone_utils import consolidate_backbone_kwargs_to_config from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging +from ...utils import auto_docstring from ..auto.configuration_auto import AutoConfig -logger = logging.get_logger(__name__) - - @auto_docstring(checkpoint="usyd-community/vitpose-base-simple") +@strict(accept_kwargs=True) class VitPoseConfig(PreTrainedConfig): r""" scale_factor (`int`, *optional*, defaults to 4): @@ -48,27 +48,20 @@ class VitPoseConfig(PreTrainedConfig): model_type = "vitpose" sub_configs = {"backbone_config": AutoConfig} - def __init__( - self, - backbone_config: PreTrainedConfig | None = None, - initializer_range: float = 0.02, - scale_factor: int = 4, - use_simple_decoder: bool = True, - **kwargs, - ): - backbone_config, kwargs = consolidate_backbone_kwargs_to_config( - backbone_config=backbone_config, + backbone_config: dict | PreTrainedConfig | None = None + initializer_range: float = 0.02 + scale_factor: int = 4 + use_simple_decoder: bool = True + + def __post_init__(self, **kwargs): + self.backbone_config, kwargs = consolidate_backbone_kwargs_to_config( + backbone_config=self.backbone_config, default_config_type="vitpose_backbone", default_config_kwargs={"out_indices": [4]}, **kwargs, ) - self.backbone_config = backbone_config - self.initializer_range = initializer_range - self.scale_factor = scale_factor - self.use_simple_decoder = use_simple_decoder - - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["VitPoseConfig"] diff --git a/src/transformers/models/vitpose_backbone/configuration_vitpose_backbone.py b/src/transformers/models/vitpose_backbone/configuration_vitpose_backbone.py index e741fd19606d..dc6edc19e93e 100644 --- a/src/transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +++ b/src/transformers/models/vitpose_backbone/configuration_vitpose_backbone.py @@ -13,15 +13,15 @@ # limitations under the License. """VitPose backbone configuration""" +from huggingface_hub.dataclasses import strict + from ...backbone_utils import BackboneConfigMixin from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - - -logger = logging.get_logger(__name__) +from ...utils import auto_docstring @auto_docstring(checkpoint="usyd-community/vitpose-base-simple") +@strict(accept_kwargs=True) class VitPoseBackboneConfig(BackboneConfigMixin, PreTrainedConfig): r""" part_features (`int`, *optional*): @@ -44,46 +44,30 @@ class VitPoseBackboneConfig(BackboneConfigMixin, PreTrainedConfig): model_type = "vitpose_backbone" - def __init__( - self, - image_size=[256, 192], - patch_size=[16, 16], - num_channels=3, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - mlp_ratio=4, - num_experts=1, - part_features=256, - hidden_act="gelu", - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - initializer_range=0.02, - layer_norm_eps=1e-12, - qkv_bias=True, - out_features=None, - out_indices=None, - **kwargs, - ): - super().__init__(**kwargs) + image_size: int | list[int] | tuple[int, ...] = (256, 192) + patch_size: int | list[int] | tuple[int, ...] 
= (16, 16) + num_channels: int = 3 + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + mlp_ratio: int = 4 + num_experts: int = 1 + part_features: int = 256 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.0 + attention_probs_dropout_prob: float = 0.0 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + qkv_bias: bool = True + _out_features: list[str] | None = None + _out_indices: list[int] | None = None - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.mlp_ratio = mlp_ratio - self.num_experts = num_experts - self.part_features = part_features - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.qkv_bias = qkv_bias - self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, num_hidden_layers + 1)] - self.set_output_features_output_indices(out_indices=out_indices, out_features=out_features) + def __post_init__(self, **kwargs): + self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, self.num_hidden_layers + 1)] + self.set_output_features_output_indices( + out_indices=kwargs.pop("out_indices", None), out_features=kwargs.pop("out_features", None) + ) + super().__post_init__(**kwargs) __all__ = ["VitPoseBackboneConfig"] diff --git a/src/transformers/models/vits/configuration_vits.py b/src/transformers/models/vits/configuration_vits.py index 533371672f3a..b0f8566c411d 100644 --- a/src/transformers/models/vits/configuration_vits.py +++ b/src/transformers/models/vits/configuration_vits.py @@ -13,14 +13,14 @@ # limitations under the License. 
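As with VitDet earlier in the patch, the backbone config above declares private `_out_features` / `_out_indices` fields while the public `out_features` / `out_indices` keywords travel through `**kwargs` into `__post_init__`, where `set_output_features_output_indices` resolves them. Sketch of the preserved call pattern (illustrative, assuming the patched library):

from transformers import VitPoseBackboneConfig

cfg = VitPoseBackboneConfig(out_indices=[4])    # popped from kwargs in __post_init__
print(cfg.stage_names[0], cfg.stage_names[-1])  # stem stage12
print(cfg.out_indices)                          # resolved via set_output_features_output_indices
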
"""VITS model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="facebook/mms-tts-eng") +@strict(accept_kwargs=True) class VitsConfig(PreTrainedConfig): r""" window_size (`int`, *optional*, defaults to 4): @@ -110,108 +110,59 @@ class VitsConfig(PreTrainedConfig): model_type = "vits" - def __init__( - self, - vocab_size=38, - hidden_size=192, - num_hidden_layers=6, - num_attention_heads=2, - window_size=4, - use_bias=True, - ffn_dim=768, - layerdrop=0.1, - ffn_kernel_size=3, - flow_size=192, - spectrogram_bins=513, - hidden_act="relu", - hidden_dropout=0.1, - attention_dropout=0.1, - activation_dropout=0.1, - initializer_range=0.02, - layer_norm_eps=1e-5, - use_stochastic_duration_prediction=True, - num_speakers=1, - speaker_embedding_size=0, - upsample_initial_channel=512, - upsample_rates=[8, 8, 2, 2], - upsample_kernel_sizes=[16, 16, 4, 4], - resblock_kernel_sizes=[3, 7, 11], - resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]], - leaky_relu_slope=0.1, - depth_separable_channels=2, - depth_separable_num_layers=3, - duration_predictor_flow_bins=10, - duration_predictor_tail_bound=5.0, - duration_predictor_kernel_size=3, - duration_predictor_dropout=0.5, - duration_predictor_num_flows=4, - duration_predictor_filter_channels=256, - prior_encoder_num_flows=4, - prior_encoder_num_wavenet_layers=4, - posterior_encoder_num_wavenet_layers=16, - wavenet_kernel_size=5, - wavenet_dilation_rate=1, - wavenet_dropout=0.0, - speaking_rate=1.0, - noise_scale=0.667, - noise_scale_duration=0.8, - sampling_rate=16_000, - pad_token_id=None, - **kwargs, - ): - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.window_size = window_size - self.use_bias = use_bias - self.ffn_dim = ffn_dim - self.layerdrop = layerdrop - self.ffn_kernel_size = ffn_kernel_size - self.flow_size = flow_size - self.spectrogram_bins = spectrogram_bins - self.hidden_act = hidden_act - self.hidden_dropout = hidden_dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.use_stochastic_duration_prediction = use_stochastic_duration_prediction - self.num_speakers = num_speakers - self.speaker_embedding_size = speaker_embedding_size - self.upsample_initial_channel = upsample_initial_channel - self.upsample_rates = upsample_rates - self.upsample_kernel_sizes = upsample_kernel_sizes - self.resblock_kernel_sizes = resblock_kernel_sizes - self.resblock_dilation_sizes = resblock_dilation_sizes - self.leaky_relu_slope = leaky_relu_slope - self.depth_separable_channels = depth_separable_channels - self.depth_separable_num_layers = depth_separable_num_layers - self.duration_predictor_flow_bins = duration_predictor_flow_bins - self.duration_predictor_tail_bound = duration_predictor_tail_bound - self.duration_predictor_kernel_size = duration_predictor_kernel_size - self.duration_predictor_dropout = duration_predictor_dropout - self.duration_predictor_num_flows = duration_predictor_num_flows - self.duration_predictor_filter_channels = duration_predictor_filter_channels - self.prior_encoder_num_flows = prior_encoder_num_flows - 
self.prior_encoder_num_wavenet_layers = prior_encoder_num_wavenet_layers - self.posterior_encoder_num_wavenet_layers = posterior_encoder_num_wavenet_layers - self.wavenet_kernel_size = wavenet_kernel_size - self.wavenet_dilation_rate = wavenet_dilation_rate - self.wavenet_dropout = wavenet_dropout - self.speaking_rate = speaking_rate - self.noise_scale = noise_scale - self.noise_scale_duration = noise_scale_duration - self.sampling_rate = sampling_rate - self.pad_token_id = pad_token_id - - if len(upsample_kernel_sizes) != len(upsample_rates): + vocab_size: int = 38 + hidden_size: int = 192 + num_hidden_layers: int = 6 + num_attention_heads: int = 2 + window_size: int = 4 + use_bias: bool = True + ffn_dim: int = 768 + layerdrop: float | int = 0.1 + ffn_kernel_size: int = 3 + flow_size: int = 192 + spectrogram_bins: int = 513 + hidden_act: str = "relu" + hidden_dropout: float | int = 0.1 + attention_dropout: float | int = 0.1 + activation_dropout: float | int = 0.1 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-5 + use_stochastic_duration_prediction: bool = True + num_speakers: int = 1 + speaker_embedding_size: int = 0 + upsample_initial_channel: int = 512 + upsample_rates: list[int] | tuple[int, ...] = (8, 8, 2, 2) + upsample_kernel_sizes: list[int] | tuple[int, ...] = (16, 16, 4, 4) + resblock_kernel_sizes: list[int] | tuple[int, ...] = (3, 7, 11) + resblock_dilation_sizes: list | tuple = ((1, 3, 5), (1, 3, 5), (1, 3, 5)) + leaky_relu_slope: float = 0.1 + depth_separable_channels: int = 2 + depth_separable_num_layers: int = 3 + duration_predictor_flow_bins: int = 10 + duration_predictor_tail_bound: float = 5.0 + duration_predictor_kernel_size: int = 3 + duration_predictor_dropout: float | int = 0.5 + duration_predictor_num_flows: int = 4 + duration_predictor_filter_channels: int = 256 + prior_encoder_num_flows: int = 4 + prior_encoder_num_wavenet_layers: int = 4 + posterior_encoder_num_wavenet_layers: int = 16 + wavenet_kernel_size: int = 5 + wavenet_dilation_rate: int = 1 + wavenet_dropout: float | int = 0.0 + speaking_rate: float = 1.0 + noise_scale: float = 0.667 + noise_scale_duration: float = 0.8 + sampling_rate: int = 16_000 + pad_token_id: int | None = None + + def validate_architecture(self): + """Part of `@strict`-powered validation. 
Validates the architecture of the config.""" + if len(self.upsample_kernel_sizes) != len(self.upsample_rates): raise ValueError( - f"The length of `upsample_kernel_sizes` ({len(upsample_kernel_sizes)}) must match the length of " - f"`upsample_rates` ({len(upsample_rates)})" + f"The length of `upsample_kernel_sizes` ({len(self.upsample_kernel_sizes)}) must match the length of " + f"`upsample_rates` ({len(self.upsample_rates)})" ) - super().__init__(**kwargs) - __all__ = ["VitsConfig"] diff --git a/src/transformers/models/vits/modeling_vits.py b/src/transformers/models/vits/modeling_vits.py index e3b5cd7f4c45..b8d318ca4e26 100644 --- a/src/transformers/models/vits/modeling_vits.py +++ b/src/transformers/models/vits/modeling_vits.py @@ -1312,7 +1312,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if labels is not None: raise NotImplementedError("Training of VITS is not supported yet.") diff --git a/src/transformers/models/vivit/configuration_vivit.py b/src/transformers/models/vivit/configuration_vivit.py index 23ac1835f80c..fc639fe73ffb 100644 --- a/src/transformers/models/vivit/configuration_vivit.py +++ b/src/transformers/models/vivit/configuration_vivit.py @@ -13,14 +13,14 @@ # limitations under the License. """ViViT model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="google/vivit-b-16x2-kinetics400") +@strict(accept_kwargs=True) class VivitConfig(PreTrainedConfig): r""" num_frames (`int`, *optional*, defaults to 32): @@ -45,41 +45,20 @@ class VivitConfig(PreTrainedConfig): model_type = "vivit" - def __init__( - self, - image_size=224, - num_frames=32, - tubelet_size=[2, 16, 16], - num_channels=3, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu_fast", - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - initializer_range=0.02, - layer_norm_eps=1e-06, - qkv_bias=True, - **kwargs, - ): - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - - self.image_size = image_size - self.num_frames = num_frames - self.tubelet_size = tubelet_size - self.num_channels = num_channels - self.qkv_bias = qkv_bias - - super().__init__(**kwargs) + image_size: int | list[int] | tuple[int, int] = 224 + num_frames: int = 32 + tubelet_size: list[int] | tuple[int, ...] 
= (2, 16, 16) + num_channels: int = 3 + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu_fast" + hidden_dropout_prob: float = 0.0 + attention_probs_dropout_prob: float = 0.0 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-06 + qkv_bias: bool = True __all__ = ["VivitConfig"] diff --git a/src/transformers/models/vjepa2/configuration_vjepa2.py b/src/transformers/models/vjepa2/configuration_vjepa2.py index cb35bedd7ed7..5da7552244e4 100644 --- a/src/transformers/models/vjepa2/configuration_vjepa2.py +++ b/src/transformers/models/vjepa2/configuration_vjepa2.py @@ -13,11 +13,14 @@ # limitations under the License. """VJEPA 2 model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring @auto_docstring(checkpoint="facebook/vjepa2-vitl-fpc64-256") +@strict(accept_kwargs=True) class VJEPA2Config(PreTrainedConfig): r""" crop_size (`int`, *optional*, defaults to 256): @@ -58,61 +61,29 @@ class VJEPA2Config(PreTrainedConfig): model_type = "vjepa2" - def __init__( - self, - patch_size=16, - crop_size=256, - frames_per_clip=64, - tubelet_size=2, - hidden_size=1024, - in_chans=3, - num_attention_heads=16, - num_hidden_layers=24, - drop_path_rate=0.0, - mlp_ratio=4.0, - layer_norm_eps=1e-6, - qkv_bias=True, - attention_probs_dropout_prob=0.0, - hidden_act="gelu", - initializer_range=0.02, - attention_dropout=0.0, - num_pooler_layers=3, - # predictor params - pred_hidden_size=384, - pred_num_attention_heads=12, - pred_num_hidden_layers=12, - pred_num_mask_tokens=10, - pred_zero_init_mask_tokens=True, - pred_mlp_ratio=4.0, - **kwargs, - ): - super().__init__(**kwargs) - - self.crop_size = crop_size - self.frames_per_clip = frames_per_clip - self.patch_size = patch_size - self.tubelet_size = tubelet_size - self.hidden_size = hidden_size - self.in_chans = in_chans - self.num_attention_heads = num_attention_heads - self.num_hidden_layers = num_hidden_layers - self.drop_path_rate = drop_path_rate - self.mlp_ratio = mlp_ratio - self.layer_norm_eps = layer_norm_eps - self.qkv_bias = qkv_bias - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.image_size = crop_size - self.attention_dropout = attention_dropout - self.num_pooler_layers = num_pooler_layers - # predictor params - self.pred_hidden_size = pred_hidden_size - self.pred_num_attention_heads = pred_num_attention_heads - self.pred_num_hidden_layers = pred_num_hidden_layers - self.pred_num_mask_tokens = pred_num_mask_tokens - self.pred_zero_init_mask_tokens = pred_zero_init_mask_tokens - self.pred_mlp_ratio = pred_mlp_ratio + patch_size: int | list[int] | tuple[int, int] = 16 + crop_size: int = 256 + frames_per_clip: int = 64 + tubelet_size: int = 2 + hidden_size: int = 1024 + in_chans: int = 3 + num_attention_heads: int = 16 + num_hidden_layers: int = 24 + drop_path_rate: float = 0.0 + mlp_ratio: int | float = 4.0 + layer_norm_eps: float = 1e-6 + qkv_bias: bool = True + attention_probs_dropout_prob: float = 0.0 + hidden_act: str = "gelu" + initializer_range: float = 0.02 + attention_dropout: float | int = 0.0 + num_pooler_layers: int = 3 + pred_hidden_size: int = 384 + pred_num_attention_heads: int = 12 + pred_num_hidden_layers: int = 12 + pred_num_mask_tokens: int = 10 + pred_zero_init_mask_tokens: bool = True + pred_mlp_ratio: int | float = 4.0 __all__ = 
["VJEPA2Config"] diff --git a/src/transformers/models/voxtral/configuration_voxtral.py b/src/transformers/models/voxtral/configuration_voxtral.py index 10dbe24c0186..ae08576db66d 100644 --- a/src/transformers/models/voxtral/configuration_voxtral.py +++ b/src/transformers/models/voxtral/configuration_voxtral.py @@ -12,12 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. + +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring from ..auto import CONFIG_MAPPING, AutoConfig @auto_docstring(checkpoint="mistralai/Voxtral-Mini-3B-2507") +@strict(accept_kwargs=True) class VoxtralEncoderConfig(PreTrainedConfig): r""" max_source_positions (`int`, *optional*, defaults to 1500): @@ -46,46 +50,28 @@ class VoxtralEncoderConfig(PreTrainedConfig): "encoder_layerdrop": "layerdrop", } - def __init__( - self, - vocab_size=51866, - hidden_size=1280, - intermediate_size=5120, - num_hidden_layers=32, - num_attention_heads=20, - scale_embedding=False, - activation_function="gelu", - num_mel_bins=128, - max_source_positions=1500, - initializer_range=0.02, - attention_dropout=0.0, - **kwargs, - ): - super().__init__(**kwargs) - - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - - self.num_attention_heads = num_attention_heads - self.scale_embedding = scale_embedding # scale factor will be sqrt(hidden_size) if True - self.activation_function = activation_function - self.num_mel_bins = num_mel_bins - self.max_source_positions = max_source_positions - self.initializer_range = initializer_range - - # TODO: @eustlb, we do not use dropout and layerdrop, yet we need to hardcode them - # to be able to use Whisper with modular (here actually from Qwen2-Audio and copied from). - # After a future Whisper refactor, we should remove this. - self.dropout = 0.0 - self.layerdrop = 0.0 - self.activation_dropout = 0.0 - - self.attention_dropout = attention_dropout + vocab_size: int = 51866 + hidden_size: int = 1280 + intermediate_size: int = 5120 + num_hidden_layers: int = 32 + num_attention_heads: int = 20 + scale_embedding: bool = False + activation_function: str = "gelu" + num_mel_bins: int = 128 + max_source_positions: int = 1500 + initializer_range: float = 0.02 + attention_dropout: float | int = 0.0 + + # TODO: @eustlb, we do not use dropout and layerdrop, yet we need to hardcode them + # to be able to use Whisper with modular (here actually from Qwen2-Audio and copied from). + # After a future Whisper refactor, we should remove this. 
+ dropout: float | int = 0.0 + layerdrop: float | int = 0.0 + activation_dropout: float | int = 0.0 @auto_docstring(checkpoint="mistralai/Voxtral-Mini-3B-2507") +@strict(accept_kwargs=True) class VoxtralConfig(PreTrainedConfig): r""" Example: @@ -119,35 +105,28 @@ class VoxtralConfig(PreTrainedConfig): "head_dim": 128, } - def __init__( - self, - audio_config=None, - text_config=None, - audio_token_id=None, - projector_hidden_act="gelu", - **kwargs, - ): - if isinstance(audio_config, dict): - audio_config["model_type"] = audio_config.get("model_type", "voxtral_encoder") - audio_config = CONFIG_MAPPING[audio_config["model_type"]](**audio_config) - elif audio_config is None: - audio_config = CONFIG_MAPPING["voxtral_encoder"]() - self.audio_config = audio_config - - if isinstance(text_config, dict): - text_config["model_type"] = text_config.get("model_type", "llama") - text_config = CONFIG_MAPPING[text_config["model_type"]]( - **{**self._default_text_config_kwargs, **text_config} + audio_config: dict | PreTrainedConfig | None = None + text_config: dict | PreTrainedConfig | None = None + audio_token_id: int | None = None + projector_hidden_act: str = "gelu" + + def __post_init__(self, **kwargs): + if isinstance(self.audio_config, dict): + self.audio_config["model_type"] = self.audio_config.get("model_type", "voxtral_encoder") + self.audio_config = CONFIG_MAPPING[self.audio_config["model_type"]](**self.audio_config) + elif self.audio_config is None: + self.audio_config = CONFIG_MAPPING["voxtral_encoder"]() + + if isinstance(self.text_config, dict): + self.text_config["model_type"] = self.text_config.get("model_type", "llama") + self.text_config = CONFIG_MAPPING[self.text_config["model_type"]]( + **{**self._default_text_config_kwargs, **self.text_config} ) - elif text_config is None: - text_config = CONFIG_MAPPING["llama"](**self._default_text_config_kwargs) - self.text_config = text_config - - self.hidden_size = text_config.hidden_size - self.audio_token_id = audio_token_id - self.projector_hidden_act = projector_hidden_act + elif self.text_config is None: + self.text_config = CONFIG_MAPPING["llama"](**self._default_text_config_kwargs) - super().__init__(**kwargs) + self.hidden_size = self.text_config.hidden_size + super().__post_init__(**kwargs) __all__ = ["VoxtralEncoderConfig", "VoxtralConfig"] diff --git a/src/transformers/models/voxtral_realtime/configuration_voxtral_realtime.py b/src/transformers/models/voxtral_realtime/configuration_voxtral_realtime.py index 59b6e95ee448..72d36e714f5a 100644 --- a/src/transformers/models/voxtral_realtime/configuration_voxtral_realtime.py +++ b/src/transformers/models/voxtral_realtime/configuration_voxtral_realtime.py @@ -12,8 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
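The Voxtral composite config above follows the same `__post_init__` recipe as VipLlava: dict sub-configs are coerced through CONFIG_MAPPING, `None` falls back to defaults, and `hidden_size` mirrors the resolved text config. Sketch (illustrative only):

from transformers import VoxtralConfig

cfg = VoxtralConfig(text_config={"model_type": "llama", "hidden_size": 2048})
print(type(cfg.audio_config).__name__)  # VoxtralEncoderConfig, built from the defaults
print(cfg.hidden_size)                  # 2048, copied from the resolved text_config
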
+ +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring from ..auto import CONFIG_MAPPING, AutoConfig from ..mistral.configuration_mistral import MistralConfig @@ -25,7 +28,8 @@ class VoxtralRealtimeTextConfig(MistralConfig): @auto_docstring(checkpoint="mistralai/Voxtral-Mini-4B-Realtime-2602") -class VoxtralRealtimeEncoderConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +@strict(accept_kwargs=True) +class VoxtralRealtimeEncoderConfig(PreTrainedConfig): r""" Example: @@ -50,49 +54,32 @@ class VoxtralRealtimeEncoderConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin) "encoder_attention_heads": "num_attention_heads", "encoder_ffn_dim": "intermediate_size", "encoder_layerdrop": "layerdrop", + "num_key_value_heads": "num_attention_heads", } - def __init__( - self, - vocab_size=131072, - hidden_size=1280, - intermediate_size=5120, - num_hidden_layers=32, - num_attention_heads=32, - activation_function="gelu", - num_mel_bins=128, - initializer_range=0.02, - attention_dropout=0.0, - hidden_act="silu", - max_position_embeddings=1500, - rms_norm_eps=1e-05, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - sliding_window=750, - head_dim=64, - **kwargs, - ): - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - - self.num_attention_heads = num_attention_heads - self.activation_function = activation_function - self.num_mel_bins = num_mel_bins - self.initializer_range = initializer_range - self.num_key_value_heads = num_attention_heads - self.rms_norm_eps = rms_norm_eps - self.max_position_embeddings = max_position_embeddings - self.rope_parameters = rope_parameters - self.hidden_act = hidden_act - self.sliding_window = sliding_window - self.head_dim = head_dim if head_dim is not None else hidden_size // num_attention_heads - self.attention_dropout = attention_dropout - - super().__init__(**kwargs) + vocab_size: int = 131072 + hidden_size: int = 1280 + intermediate_size: int = 5120 + num_hidden_layers: int = 32 + num_attention_heads: int = 32 + activation_function: str = "gelu" + num_mel_bins: int = 128 + initializer_range: float = 0.02 + attention_dropout: float | int = 0.0 + hidden_act: str = "silu" + max_position_embeddings: int = 1500 + rms_norm_eps: float = 1e-05 + rope_parameters: RopeParameters | dict | None = None + sliding_window: int = 750 + head_dim: int = 64 + + def __post_init__(self, **kwargs): + self.head_dim = self.head_dim if self.head_dim is not None else self.hidden_size // self.num_attention_heads + super().__post_init__(**kwargs) @auto_docstring(checkpoint="mistralai/Voxtral-Mini-4B-Realtime-2602") +@strict(accept_kwargs=True) class VoxtralRealtimeConfig(PreTrainedConfig): r""" audio_length_per_tok (`int`, *optional*, defaults to 8): @@ -134,39 +121,30 @@ class VoxtralRealtimeConfig(PreTrainedConfig): "sliding_window": 8192, } - def __init__( - self, - audio_config=None, - text_config=None, - projector_hidden_act="gelu", - audio_length_per_tok=8, - default_num_delay_tokens=6, - downsample_factor=4, - **kwargs, - ): - if isinstance(audio_config, dict): - audio_config["model_type"] = audio_config.get("model_type", "voxtral_realtime_encoder") - audio_config = CONFIG_MAPPING[audio_config["model_type"]](**audio_config) - elif audio_config is 
None: - audio_config = CONFIG_MAPPING["voxtral_realtime_encoder"]() - self.audio_config = audio_config - - if isinstance(text_config, dict): - text_config["model_type"] = text_config.get("model_type", "voxtral_realtime_text") - text_config = CONFIG_MAPPING[text_config["model_type"]]( - **{**self._default_text_config_kwargs, **text_config} + audio_config: dict | PreTrainedConfig | None = None + text_config: dict | PreTrainedConfig | None = None + projector_hidden_act: str = "gelu" + audio_length_per_tok: int = 8 + default_num_delay_tokens: int = 6 + downsample_factor: int = 4 + + def __post_init__(self, **kwargs): + if isinstance(self.audio_config, dict): + self.audio_config["model_type"] = self.audio_config.get("model_type", "voxtral_realtime_encoder") + self.audio_config = CONFIG_MAPPING[self.audio_config["model_type"]](**self.audio_config) + elif self.audio_config is None: + self.audio_config = CONFIG_MAPPING["voxtral_realtime_encoder"]() + + if isinstance(self.text_config, dict): + self.text_config["model_type"] = self.text_config.get("model_type", "voxtral_realtime_text") + self.text_config = CONFIG_MAPPING[self.text_config["model_type"]]( + **{**self._default_text_config_kwargs, **self.text_config} ) - elif text_config is None: - text_config = CONFIG_MAPPING["voxtral_realtime_text"](**self._default_text_config_kwargs) - self.text_config = text_config - - self.hidden_size = text_config.hidden_size - self.projector_hidden_act = projector_hidden_act - self.audio_length_per_tok = audio_length_per_tok - self.default_num_delay_tokens = default_num_delay_tokens - self.downsample_factor = downsample_factor + elif self.text_config is None: + self.text_config = CONFIG_MAPPING["voxtral_realtime_text"](**self._default_text_config_kwargs) - super().__init__(**kwargs) + self.hidden_size = self.text_config.hidden_size + super().__post_init__(**kwargs) __all__ = ["VoxtralRealtimeEncoderConfig", "VoxtralRealtimeConfig", "VoxtralRealtimeTextConfig"] diff --git a/src/transformers/models/wav2vec2/configuration_wav2vec2.py b/src/transformers/models/wav2vec2/configuration_wav2vec2.py index 79ab1b1f8222..ddbb63331f6d 100644 --- a/src/transformers/models/wav2vec2/configuration_wav2vec2.py +++ b/src/transformers/models/wav2vec2/configuration_wav2vec2.py @@ -16,14 +16,14 @@ import functools import operator -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="facebook/wav2vec2-base-960h") +@strict(accept_kwargs=True) class Wav2Vec2Config(PreTrainedConfig): r""" num_codevectors_per_group (`int`, *optional*, defaults to 320): @@ -164,95 +164,69 @@ class Wav2Vec2Config(PreTrainedConfig): model_type = "wav2vec2" - def __init__( - self, - vocab_size=32, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout=0.1, - activation_dropout=0.1, - attention_dropout=0.1, - feat_proj_dropout=0.0, - feat_quantizer_dropout=0.0, - final_dropout=0.1, - layerdrop=0.1, - initializer_range=0.02, - layer_norm_eps=1e-5, - feat_extract_norm="group", - feat_extract_activation="gelu", - conv_dim=(512, 512, 512, 512, 512, 512, 512), - conv_stride=(5, 2, 2, 2, 2, 2, 2), - conv_kernel=(10, 3, 3, 3, 3, 2, 2), - conv_bias=False, - num_conv_pos_embeddings=128, - num_conv_pos_embedding_groups=16, - 
do_stable_layer_norm=False, - apply_spec_augment=True, - mask_time_prob=0.05, - mask_time_length=10, - mask_time_min_masks=2, - mask_feature_prob=0.0, - mask_feature_length=10, - mask_feature_min_masks=0, - num_codevectors_per_group=320, - num_codevector_groups=2, - contrastive_logits_temperature=0.1, - num_negatives=100, - codevector_dim=256, - proj_codevector_dim=256, - diversity_loss_weight=0.1, - ctc_loss_reduction="sum", - ctc_zero_infinity=False, - use_weighted_layer_sum=False, - classifier_proj_size=256, - tdnn_dim=(512, 512, 512, 512, 1500), - tdnn_kernel=(5, 3, 3, 1, 1), - tdnn_dilation=(1, 2, 3, 1, 1), - xvector_output_dim=512, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - add_adapter=False, - adapter_kernel_size=3, - adapter_stride=2, - num_adapter_layers=3, - output_hidden_size=None, - adapter_attn_dim=None, - **kwargs, - ): - super().__init__(**kwargs) - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.hidden_size = hidden_size - self.feat_extract_norm = feat_extract_norm - self.feat_extract_activation = feat_extract_activation - self.conv_dim = list(conv_dim) - self.conv_stride = list(conv_stride) - self.conv_kernel = list(conv_kernel) - self.conv_bias = conv_bias - self.num_conv_pos_embeddings = num_conv_pos_embeddings - self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups + vocab_size: int = 32 + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout: float | int = 0.1 + activation_dropout: float | int = 0.1 + attention_dropout: float | int = 0.1 + feat_proj_dropout: float | int = 0.0 + feat_quantizer_dropout: float | int = 0.0 + final_dropout: float | int = 0.1 + layerdrop: float | int = 0.1 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-5 + feat_extract_norm: str = "group" + feat_extract_activation: str = "gelu" + conv_dim: list[int] | tuple[int, ...] = (512, 512, 512, 512, 512, 512, 512) + conv_stride: list[int] | tuple[int, ...] = (5, 2, 2, 2, 2, 2, 2) + conv_kernel: list[int] | tuple[int, ...] = (10, 3, 3, 3, 3, 2, 2) + conv_bias: bool = False + num_conv_pos_embeddings: int = 128 + num_conv_pos_embedding_groups: int = 16 + do_stable_layer_norm: bool = False + apply_spec_augment: bool = True + mask_time_prob: float = 0.05 + mask_time_length: int = 10 + mask_time_min_masks: int = 2 + mask_feature_prob: float = 0.0 + mask_feature_length: int = 10 + mask_feature_min_masks: int = 0 + num_codevectors_per_group: int = 320 + num_codevector_groups: int = 2 + contrastive_logits_temperature: float = 0.1 + num_negatives: int = 100 + codevector_dim: int = 256 + proj_codevector_dim: int = 256 + diversity_loss_weight: float = 0.1 + ctc_loss_reduction: str = "sum" + ctc_zero_infinity: bool = False + use_weighted_layer_sum: bool = False + classifier_proj_size: int = 256 + tdnn_dim: list[int] | tuple[int, ...] = (512, 512, 512, 512, 1500) + tdnn_kernel: list[int] | tuple[int, ...] = (5, 3, 3, 1, 1) + tdnn_dilation: list[int] | tuple[int, ...] 
= (1, 2, 3, 1, 1) + xvector_output_dim: int = 512 + pad_token_id: int | None = 0 + bos_token_id: int | None = 1 + eos_token_id: int | None = 2 + add_adapter: bool = False + adapter_kernel_size: int = 3 + adapter_stride: int = 2 + num_adapter_layers: int = 3 + output_hidden_size: int | None = None + adapter_attn_dim: int | None = None + + def __post_init__(self, **kwargs): self.num_feat_extract_layers = len(self.conv_dim) - self.num_hidden_layers = num_hidden_layers - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.num_attention_heads = num_attention_heads - self.hidden_dropout = hidden_dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.feat_proj_dropout = feat_proj_dropout - self.final_dropout = final_dropout - self.layerdrop = layerdrop - self.layer_norm_eps = layer_norm_eps - self.initializer_range = initializer_range - self.vocab_size = vocab_size - self.do_stable_layer_norm = do_stable_layer_norm - self.use_weighted_layer_sum = use_weighted_layer_sum + self.output_hidden_size = self.output_hidden_size or self.hidden_size + super().__post_init__(**kwargs) + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" if ( (len(self.conv_stride) != self.num_feat_extract_layers) or (len(self.conv_kernel) != self.num_feat_extract_layers) @@ -265,46 +239,6 @@ def __init__( f" `len(config.conv_kernel) = {len(self.conv_kernel)}`." ) - # fine-tuning config parameters for SpecAugment: https://huggingface.co/papers/1904.08779 - self.apply_spec_augment = apply_spec_augment - self.mask_time_prob = mask_time_prob - self.mask_time_length = mask_time_length - self.mask_time_min_masks = mask_time_min_masks - self.mask_feature_prob = mask_feature_prob - self.mask_feature_length = mask_feature_length - self.mask_feature_min_masks = mask_feature_min_masks - - # parameters for pretraining with codevector quantized representations - self.num_codevectors_per_group = num_codevectors_per_group - self.num_codevector_groups = num_codevector_groups - self.contrastive_logits_temperature = contrastive_logits_temperature - self.feat_quantizer_dropout = feat_quantizer_dropout - self.num_negatives = num_negatives - self.codevector_dim = codevector_dim - self.proj_codevector_dim = proj_codevector_dim - self.diversity_loss_weight = diversity_loss_weight - - # ctc loss - self.ctc_loss_reduction = ctc_loss_reduction - self.ctc_zero_infinity = ctc_zero_infinity - - # adapter - self.add_adapter = add_adapter - self.adapter_kernel_size = adapter_kernel_size - self.adapter_stride = adapter_stride - self.num_adapter_layers = num_adapter_layers - self.output_hidden_size = output_hidden_size or hidden_size - self.adapter_attn_dim = adapter_attn_dim - - # SequenceClassification-specific parameter. Feel free to ignore for other classes. - self.classifier_proj_size = classifier_proj_size - - # XVector-specific parameters. Feel free to ignore for other classes. 
- self.tdnn_dim = list(tdnn_dim) - self.tdnn_kernel = list(tdnn_kernel) - self.tdnn_dilation = list(tdnn_dilation) - self.xvector_output_dim = xvector_output_dim - @property def inputs_to_logits_ratio(self): return functools.reduce(operator.mul, self.conv_stride, 1) diff --git a/src/transformers/models/wav2vec2/modeling_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_wav2vec2.py index a947783d0217..561aa0ae3450 100755 --- a/src/transformers/models/wav2vec2/modeling_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_wav2vec2.py @@ -1343,7 +1343,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict extract_features = self.feature_extractor(input_values) extract_features = extract_features.transpose(1, 2) @@ -1503,7 +1503,7 @@ def forward( ... ).loss ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if mask_time_indices is not None: mask_time_indices = mask_time_indices.to(torch.bool) @@ -1689,7 +1689,7 @@ def forward( All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size - 1]`. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if labels is not None and labels.max() >= self.config.vocab_size: raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") @@ -1807,7 +1807,7 @@ def forward( `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states outputs = self.wav2vec2( @@ -1911,7 +1911,7 @@ def forward( `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states outputs = self.wav2vec2( @@ -2083,7 +2083,7 @@ def forward( `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states outputs = self.wav2vec2( diff --git a/src/transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py b/src/transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py index b63a769d252d..504122adb64f 100644 --- a/src/transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py +++ b/src/transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py @@ -13,14 +13,16 @@ # limitations under the License. 
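As in Wav2Vec2Config above (and the other speech configs below), the consistency check on the convolutional feature-extractor lists moves out of `__init__` into a dedicated `validate_architecture` hook, described in its docstring as part of the `@strict`-powered validation. A rough sketch of the behaviour this is meant to keep, assuming the hook is run automatically after `__post_init__`:

from transformers import Wav2Vec2Config

# Matching lengths for conv_dim / conv_stride / conv_kernel: accepted.
Wav2Vec2Config(conv_dim=(512, 512), conv_stride=(5, 2), conv_kernel=(10, 3))

# Mismatched lengths should still raise a ValueError, now from validate_architecture().
try:
    Wav2Vec2Config(conv_dim=(512, 512, 512), conv_stride=(5, 2), conv_kernel=(10, 3))
except ValueError as err:
    print(err)  # complains that len(conv_dim), len(conv_stride) and len(conv_kernel) disagree
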
"""Wav2Vec2Bert model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging +from typing import Literal +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="facebook/wav2vec2-bert-rel-pos-large") +@strict(accept_kwargs=True) class Wav2Vec2BertConfig(PreTrainedConfig): r""" feature_projection_input_dim (`int`, *optional*, defaults to 160): @@ -133,131 +135,62 @@ class Wav2Vec2BertConfig(PreTrainedConfig): model_type = "wav2vec2-bert" - def __init__( - self, - vocab_size=None, - hidden_size=1024, - num_hidden_layers=24, - num_attention_heads=16, - intermediate_size=4096, - feature_projection_input_dim=160, - hidden_act="swish", - hidden_dropout=0.0, - activation_dropout=0.0, - attention_dropout=0.0, - feat_proj_dropout=0.0, - final_dropout=0.1, - layerdrop=0.1, - initializer_range=0.02, - layer_norm_eps=1e-5, - apply_spec_augment=True, - mask_time_prob=0.05, - mask_time_length=10, - mask_time_min_masks=2, - mask_feature_prob=0.0, - mask_feature_length=10, - mask_feature_min_masks=0, - ctc_loss_reduction="sum", - ctc_zero_infinity=False, - use_weighted_layer_sum=False, - classifier_proj_size=768, - tdnn_dim=(512, 512, 512, 512, 1500), - tdnn_kernel=(5, 3, 3, 1, 1), - tdnn_dilation=(1, 2, 3, 1, 1), - xvector_output_dim=512, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - add_adapter=False, - adapter_kernel_size=3, - adapter_stride=2, - num_adapter_layers=1, - adapter_act="relu", - use_intermediate_ffn_before_adapter=False, - output_hidden_size=None, - position_embeddings_type="relative_key", - rotary_embedding_base=10000, - max_source_positions=5000, - left_max_position_embeddings=64, - right_max_position_embeddings=8, - conv_depthwise_kernel_size=31, - conformer_conv_dropout=0.1, - **kwargs, - ): - super().__init__(**kwargs) - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.num_attention_heads = num_attention_heads - self.feature_projection_input_dim = feature_projection_input_dim - self.hidden_dropout = hidden_dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.feat_proj_dropout = feat_proj_dropout - self.final_dropout = final_dropout - self.layerdrop = layerdrop - self.layer_norm_eps = layer_norm_eps - self.initializer_range = initializer_range - self.vocab_size = vocab_size - self.use_weighted_layer_sum = use_weighted_layer_sum - self.max_source_positions = max_source_positions - - if position_embeddings_type is not None and position_embeddings_type not in [ - "rotary", - "relative", - "relative_key", - ]: - raise ValueError( - """ - `position_embeddings_type` is not valid. It must be one of the following values: - `["rotary", "relative", "relative_key"]` or left as `None`. 
- """ - ) - self.position_embeddings_type = position_embeddings_type - self.rotary_embedding_base = rotary_embedding_base - self.left_max_position_embeddings = left_max_position_embeddings - self.right_max_position_embeddings = right_max_position_embeddings - - # Conformer-block related - self.conv_depthwise_kernel_size = conv_depthwise_kernel_size - self.conformer_conv_dropout = conformer_conv_dropout - - # fine-tuning config parameters for SpecAugment: https://huggingface.co/papers/1904.08779 - self.apply_spec_augment = apply_spec_augment - self.mask_time_prob = mask_time_prob - self.mask_time_length = mask_time_length - self.mask_time_min_masks = mask_time_min_masks - self.mask_feature_prob = mask_feature_prob - self.mask_feature_length = mask_feature_length - self.mask_feature_min_masks = mask_feature_min_masks - - # ctc loss - self.ctc_loss_reduction = ctc_loss_reduction - self.ctc_zero_infinity = ctc_zero_infinity - - # adapter - self.add_adapter = add_adapter - self.adapter_kernel_size = adapter_kernel_size - self.adapter_stride = adapter_stride - self.num_adapter_layers = num_adapter_layers - self.adapter_act = adapter_act - self.output_hidden_size = output_hidden_size if output_hidden_size is not None else hidden_size - if use_intermediate_ffn_before_adapter and not add_adapter: + vocab_size: int | None = None + hidden_size: int = 1024 + num_hidden_layers: int = 24 + num_attention_heads: int = 16 + intermediate_size: int = 4096 + feature_projection_input_dim: int = 160 + hidden_act: str = "swish" + hidden_dropout: float | int = 0.0 + activation_dropout: float | int = 0.0 + attention_dropout: float | int = 0.0 + feat_proj_dropout: float | int = 0.0 + final_dropout: float | int = 0.1 + layerdrop: float | int = 0.1 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-5 + apply_spec_augment: bool = True + mask_time_prob: float = 0.05 + mask_time_length: int = 10 + mask_time_min_masks: int = 2 + mask_feature_prob: float = 0.0 + mask_feature_length: int = 10 + mask_feature_min_masks: int = 0 + ctc_loss_reduction: str = "sum" + ctc_zero_infinity: bool = False + use_weighted_layer_sum: bool = False + classifier_proj_size: int = 768 + tdnn_dim: list[int] | tuple[int, ...] = (512, 512, 512, 512, 1500) + tdnn_kernel: list[int] | tuple[int, ...] = (5, 3, 3, 1, 1) + tdnn_dilation: list[int] | tuple[int, ...] = (1, 2, 3, 1, 1) + xvector_output_dim: int = 512 + pad_token_id: int | None = 0 + bos_token_id: int | None = 1 + eos_token_id: int | None = 2 + add_adapter: bool = False + adapter_kernel_size: int = 3 + adapter_stride: int = 2 + num_adapter_layers: int = 1 + adapter_act: str = "relu" + use_intermediate_ffn_before_adapter: bool = False + output_hidden_size: int | None = None + position_embeddings_type: Literal["rotary", "relative", "relative_key"] | None = "relative_key" + rotary_embedding_base: int = 10000 + max_source_positions: int = 5000 + left_max_position_embeddings: int = 64 + right_max_position_embeddings: int = 8 + conv_depthwise_kernel_size: int = 31 + conformer_conv_dropout: float | int = 0.1 + + def __post_init__(self, **kwargs): + self.output_hidden_size = self.output_hidden_size or self.hidden_size + super().__post_init__(**kwargs) + + def validate_architecture(self): + """Part of `@strict`-powered validation. 
Validates the architecture of the config.""" + if self.use_intermediate_ffn_before_adapter and not self.add_adapter: raise ValueError("`use_intermediate_ffn_before_adapter` is `True` but `add_adapter` is `False`.") - self.use_intermediate_ffn_before_adapter = use_intermediate_ffn_before_adapter - - # SequenceClassification-specific parameter. Feel free to ignore for other classes. - self.classifier_proj_size = classifier_proj_size - - # XVector-specific parameters. Feel free to ignore for other classes. - self.tdnn_dim = list(tdnn_dim) - self.tdnn_kernel = list(tdnn_kernel) - self.tdnn_dilation = list(tdnn_dilation) - self.xvector_output_dim = xvector_output_dim @property def inputs_to_logits_ratio(self): diff --git a/src/transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py b/src/transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py index 09d6ea608137..6023d856798b 100644 --- a/src/transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +++ b/src/transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py @@ -1011,7 +1011,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict hidden_states, extract_features = self.feature_projection(input_features) hidden_states = self._mask_hidden_states( @@ -1105,7 +1105,7 @@ def forward( if labels is not None and labels.max() >= self.config.vocab_size: raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.wav2vec2_bert( input_features, @@ -1209,7 +1209,7 @@ def forward( `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states outputs = self.wav2vec2_bert( @@ -1300,7 +1300,7 @@ def forward( `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states outputs = self.wav2vec2_bert( @@ -1459,7 +1459,7 @@ def forward( `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states outputs = self.wav2vec2_bert( diff --git a/src/transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py b/src/transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py index 993072e51b27..710e7a64cea2 100644 --- a/src/transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +++ b/src/transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py @@ -720,7 +720,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict hidden_states, extract_features = self.feature_projection(input_features) hidden_states = self._mask_hidden_states( @@ -788,7 +788,7 @@ def forward( if labels is not None and labels.max() >= self.config.vocab_size: raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.wav2vec2_bert( input_features, @@ -874,7 +874,7 @@ def forward( `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states outputs = self.wav2vec2_bert( @@ -945,7 +945,7 @@ def forward( `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states outputs = self.wav2vec2_bert( @@ -1007,7 +1007,7 @@ def forward( `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states outputs = self.wav2vec2_bert( diff --git a/src/transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py b/src/transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py index c6d6aaed18fd..91ca1deb9491 100644 --- a/src/transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py +++ b/src/transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py @@ -16,14 +16,14 @@ import functools import operator -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="facebook/wav2vec2-conformer-rel-pos-large") +@strict(accept_kwargs=True) class Wav2Vec2ConformerConfig(PreTrainedConfig): r""" num_codevectors_per_group (`int`, *optional*, defaults to 320): @@ -165,100 +165,72 @@ class Wav2Vec2ConformerConfig(PreTrainedConfig): model_type = "wav2vec2-conformer" - def __init__( - self, - vocab_size=None, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout=0.1, - activation_dropout=0.1, - attention_dropout=0.1, - feat_proj_dropout=0.0, - feat_quantizer_dropout=0.0, - final_dropout=0.1, - layerdrop=0.1, - initializer_range=0.02, - layer_norm_eps=1e-5, - feat_extract_norm="group", - feat_extract_activation="gelu", - conv_dim=(512, 512, 512, 512, 512, 512, 512), - conv_stride=(5, 2, 2, 2, 2, 2, 2), - conv_kernel=(10, 3, 3, 3, 3, 2, 2), - conv_bias=False, - num_conv_pos_embeddings=128, - num_conv_pos_embedding_groups=16, - apply_spec_augment=True, - mask_time_prob=0.05, - mask_time_length=10, - mask_time_min_masks=2, - mask_feature_prob=0.0, - mask_feature_length=10, - mask_feature_min_masks=0, - num_codevectors_per_group=320, - num_codevector_groups=2, - contrastive_logits_temperature=0.1, - num_negatives=100, - codevector_dim=256, - proj_codevector_dim=256, - diversity_loss_weight=0.1, - ctc_loss_reduction="sum", - ctc_zero_infinity=False, - use_weighted_layer_sum=False, - classifier_proj_size=256, - tdnn_dim=(512, 512, 512, 512, 1500), - tdnn_kernel=(5, 3, 3, 1, 1), - tdnn_dilation=(1, 2, 3, 1, 1), - xvector_output_dim=512, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - add_adapter=False, - adapter_kernel_size=3, - adapter_stride=2, - num_adapter_layers=3, - output_hidden_size=None, - position_embeddings_type="relative", - rotary_embedding_base=10000, - max_source_positions=5000, - conv_depthwise_kernel_size=31, - conformer_conv_dropout=0.1, - **kwargs, - ): - super().__init__(**kwargs) - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.hidden_size = hidden_size - self.feat_extract_norm = feat_extract_norm - self.feat_extract_activation = feat_extract_activation - self.conv_dim = list(conv_dim) - self.conv_stride = list(conv_stride) - self.conv_kernel = list(conv_kernel) - self.conv_bias = conv_bias - self.num_conv_pos_embeddings = num_conv_pos_embeddings - self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups + vocab_size: int | None = None + hidden_size: int = 768 + 
num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout: float | int = 0.1 + activation_dropout: float | int = 0.1 + attention_dropout: float | int = 0.1 + feat_proj_dropout: float | int = 0.0 + feat_quantizer_dropout: float | int = 0.0 + final_dropout: float | int = 0.1 + layerdrop: float | int = 0.1 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-5 + feat_extract_norm: str = "group" + feat_extract_activation: str = "gelu" + conv_dim: list[int] | tuple[int, ...] = (512, 512, 512, 512, 512, 512, 512) + conv_stride: list[int] | tuple[int, ...] = (5, 2, 2, 2, 2, 2, 2) + conv_kernel: list[int] | tuple[int, ...] = (10, 3, 3, 3, 3, 2, 2) + conv_bias: bool = False + num_conv_pos_embeddings: int = 128 + num_conv_pos_embedding_groups: int = 16 + apply_spec_augment: bool = True + mask_time_prob: float = 0.05 + mask_time_length: int = 10 + mask_time_min_masks: int = 2 + mask_feature_prob: float = 0.0 + mask_feature_length: int = 10 + mask_feature_min_masks: int = 0 + num_codevectors_per_group: int = 320 + num_codevector_groups: int = 2 + contrastive_logits_temperature: float = 0.1 + num_negatives: int = 100 + codevector_dim: int = 256 + proj_codevector_dim: int = 256 + diversity_loss_weight: float = 0.1 + ctc_loss_reduction: str = "sum" + ctc_zero_infinity: bool = False + use_weighted_layer_sum: bool = False + classifier_proj_size: int = 256 + tdnn_dim: list[int] | tuple[int, ...] = (512, 512, 512, 512, 1500) + tdnn_kernel: list[int] | tuple[int, ...] = (5, 3, 3, 1, 1) + tdnn_dilation: list[int] | tuple[int, ...] = (1, 2, 3, 1, 1) + xvector_output_dim: int = 512 + pad_token_id: int | None = 0 + bos_token_id: int | None = 1 + eos_token_id: int | None = 2 + add_adapter: bool = False + adapter_kernel_size: int = 3 + adapter_stride: int = 2 + num_adapter_layers: int = 3 + output_hidden_size: int | None = None + position_embeddings_type: str | None = "relative" + rotary_embedding_base: int = 10000 + max_source_positions: int = 5000 + conv_depthwise_kernel_size: int = 31 + conformer_conv_dropout: float | int = 0.1 + + def __post_init__(self, **kwargs): self.num_feat_extract_layers = len(self.conv_dim) - self.num_hidden_layers = num_hidden_layers - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.num_attention_heads = num_attention_heads - self.hidden_dropout = hidden_dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.feat_proj_dropout = feat_proj_dropout - self.final_dropout = final_dropout - self.layerdrop = layerdrop - self.layer_norm_eps = layer_norm_eps - self.initializer_range = initializer_range - self.vocab_size = vocab_size - self.use_weighted_layer_sum = use_weighted_layer_sum - self.max_source_positions = max_source_positions - self.position_embeddings_type = position_embeddings_type - self.rotary_embedding_base = rotary_embedding_base + self.output_hidden_size = self.output_hidden_size or self.hidden_size + super().__post_init__(**kwargs) + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" if ( (len(self.conv_stride) != self.num_feat_extract_layers) or (len(self.conv_kernel) != self.num_feat_extract_layers) @@ -271,49 +243,6 @@ def __init__( f" `len(config.conv_kernel) = {len(self.conv_kernel)}`." 
) - # Conformer-block related - self.conv_depthwise_kernel_size = conv_depthwise_kernel_size - self.conformer_conv_dropout = conformer_conv_dropout - - # fine-tuning config parameters for SpecAugment: https://huggingface.co/papers/1904.08779 - self.apply_spec_augment = apply_spec_augment - self.mask_time_prob = mask_time_prob - self.mask_time_length = mask_time_length - self.mask_time_min_masks = mask_time_min_masks - self.mask_feature_prob = mask_feature_prob - self.mask_feature_length = mask_feature_length - self.mask_feature_min_masks = mask_feature_min_masks - - # parameters for pretraining with codevector quantized representations - self.num_codevectors_per_group = num_codevectors_per_group - self.num_codevector_groups = num_codevector_groups - self.contrastive_logits_temperature = contrastive_logits_temperature - self.feat_quantizer_dropout = feat_quantizer_dropout - self.num_negatives = num_negatives - self.codevector_dim = codevector_dim - self.proj_codevector_dim = proj_codevector_dim - self.diversity_loss_weight = diversity_loss_weight - - # ctc loss - self.ctc_loss_reduction = ctc_loss_reduction - self.ctc_zero_infinity = ctc_zero_infinity - - # adapter - self.add_adapter = add_adapter - self.adapter_kernel_size = adapter_kernel_size - self.adapter_stride = adapter_stride - self.num_adapter_layers = num_adapter_layers - self.output_hidden_size = output_hidden_size or hidden_size - - # SequenceClassification-specific parameter. Feel free to ignore for other classes. - self.classifier_proj_size = classifier_proj_size - - # XVector-specific parameters. Feel free to ignore for other classes. - self.tdnn_dim = list(tdnn_dim) - self.tdnn_kernel = list(tdnn_kernel) - self.tdnn_dilation = list(tdnn_dilation) - self.xvector_output_dim = xvector_output_dim - @property def inputs_to_logits_ratio(self): return functools.reduce(operator.mul, self.conv_stride, 1) diff --git a/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py b/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py index f9097275b36f..354146cedb55 100644 --- a/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +++ b/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py @@ -1160,7 +1160,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict extract_features = self.feature_extractor(input_values) extract_features = extract_features.transpose(1, 2) @@ -1320,7 +1320,7 @@ def forward( ... ).loss ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if mask_time_indices is not None: mask_time_indices = mask_time_indices.to(torch.bool) @@ -1477,7 +1477,7 @@ def forward( All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size - 1]`. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if labels is not None and labels.max() >= self.config.vocab_size: raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") @@ -1595,7 +1595,7 @@ def forward( `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states outputs = self.wav2vec2_conformer( @@ -1699,7 +1699,7 @@ def forward( `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states outputs = self.wav2vec2_conformer( @@ -1871,7 +1871,7 @@ def forward( `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states outputs = self.wav2vec2_conformer( diff --git a/src/transformers/models/wavlm/configuration_wavlm.py b/src/transformers/models/wavlm/configuration_wavlm.py index fd3396ea7c32..124820082a6f 100644 --- a/src/transformers/models/wavlm/configuration_wavlm.py +++ b/src/transformers/models/wavlm/configuration_wavlm.py @@ -16,14 +16,14 @@ import functools import operator -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="microsoft/wavlm-base") +@strict(accept_kwargs=True) class WavLMConfig(PreTrainedConfig): r""" final_dropout (`float`, *optional*, defaults to 0.1): @@ -156,99 +156,69 @@ class WavLMConfig(PreTrainedConfig): model_type = "wavlm" - def __init__( - self, - vocab_size=32, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout=0.1, - activation_dropout=0.1, - attention_dropout=0.1, - feat_proj_dropout=0.0, - final_dropout=0.1, - layerdrop=0.1, - initializer_range=0.02, - layer_norm_eps=1e-5, - feat_extract_norm="group", - feat_extract_activation="gelu", - conv_dim=(512, 512, 512, 512, 512, 512, 512), - conv_stride=(5, 2, 2, 2, 2, 2, 2), - conv_kernel=(10, 3, 3, 3, 3, 2, 2), - conv_bias=False, - num_conv_pos_embeddings=128, - num_conv_pos_embedding_groups=16, - num_buckets=320, - max_bucket_distance=800, - do_stable_layer_norm=False, - apply_spec_augment=True, - mask_time_prob=0.05, - mask_time_length=10, - mask_time_min_masks=2, - mask_feature_prob=0.0, - mask_feature_length=10, - num_codevectors_per_group=320, - num_codevector_groups=2, - contrastive_logits_temperature=0.1, - num_negatives=100, - codevector_dim=256, - proj_codevector_dim=256, - diversity_loss_weight=0.1, - ctc_loss_reduction="mean", - ctc_zero_infinity=False, - use_weighted_layer_sum=False, - 
classifier_proj_size=256, - tdnn_dim=(512, 512, 512, 512, 1500), - tdnn_kernel=(5, 3, 3, 1, 1), - tdnn_dilation=(1, 2, 3, 1, 1), - xvector_output_dim=512, - num_ctc_classes=80, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - add_adapter=False, - adapter_kernel_size=3, - adapter_stride=2, - num_adapter_layers=3, - output_hidden_size=None, - **kwargs, - ): - super().__init__(**kwargs) - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.hidden_size = hidden_size - self.feat_extract_norm = feat_extract_norm - self.feat_extract_activation = feat_extract_activation - self.conv_dim = list(conv_dim) - self.conv_stride = list(conv_stride) - self.conv_kernel = list(conv_kernel) - self.conv_bias = conv_bias - self.num_buckets = num_buckets - self.max_bucket_distance = max_bucket_distance - self.num_conv_pos_embeddings = num_conv_pos_embeddings - self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups + vocab_size: int = 32 + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout: float | int = 0.1 + activation_dropout: float | int = 0.1 + attention_dropout: float | int = 0.1 + feat_proj_dropout: float | int = 0.0 + final_dropout: float | int = 0.1 + layerdrop: float | int = 0.1 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-5 + feat_extract_norm: str = "group" + feat_extract_activation: str = "gelu" + conv_dim: list[int] | tuple[int, ...] = (512, 512, 512, 512, 512, 512, 512) + conv_stride: list[int] | tuple[int, ...] = (5, 2, 2, 2, 2, 2, 2) + conv_kernel: list[int] | tuple[int, ...] = (10, 3, 3, 3, 3, 2, 2) + conv_bias: bool = False + num_conv_pos_embeddings: int = 128 + num_conv_pos_embedding_groups: int = 16 + num_buckets: int = 320 + max_bucket_distance: int = 800 + do_stable_layer_norm: bool = False + apply_spec_augment: bool = True + mask_time_prob: float = 0.05 + mask_time_length: int = 10 + mask_time_min_masks: int = 2 + mask_feature_prob: float = 0.0 + mask_feature_length: int = 10 + num_codevectors_per_group: int = 320 + num_codevector_groups: int = 2 + contrastive_logits_temperature: float = 0.1 + num_negatives: int = 100 + codevector_dim: int = 256 + proj_codevector_dim: int = 256 + diversity_loss_weight: float = 0.1 + ctc_loss_reduction: str = "mean" + ctc_zero_infinity: bool = False + use_weighted_layer_sum: bool = False + classifier_proj_size: int = 256 + tdnn_dim: list[int] | tuple[int, ...] = (512, 512, 512, 512, 1500) + tdnn_kernel: list[int] | tuple[int, ...] = (5, 3, 3, 1, 1) + tdnn_dilation: list[int] | tuple[int, ...] 
= (1, 2, 3, 1, 1) + xvector_output_dim: int = 512 + num_ctc_classes: int = 80 + pad_token_id: int | None = 0 + bos_token_id: int | None = 1 + eos_token_id: int | None = 2 + add_adapter: bool = False + adapter_kernel_size: int = 3 + adapter_stride: int = 2 + num_adapter_layers: int = 3 + output_hidden_size: int | None = None + + def __post_init__(self, **kwargs): self.num_feat_extract_layers = len(self.conv_dim) - self.num_hidden_layers = num_hidden_layers - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.num_attention_heads = num_attention_heads - self.hidden_dropout = hidden_dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.feat_proj_dropout = feat_proj_dropout - self.final_dropout = final_dropout - self.layerdrop = layerdrop - self.layer_norm_eps = layer_norm_eps - self.initializer_range = initializer_range - self.num_ctc_classes = num_ctc_classes - self.vocab_size = vocab_size - self.do_stable_layer_norm = do_stable_layer_norm - self.use_weighted_layer_sum = use_weighted_layer_sum - self.classifier_proj_size = classifier_proj_size + self.output_hidden_size = self.output_hidden_size or self.hidden_size + super().__post_init__(**kwargs) + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" if ( (len(self.conv_stride) != self.num_feat_extract_layers) or (len(self.conv_kernel) != self.num_feat_extract_layers) @@ -261,43 +231,6 @@ def __init__( f" `len(config.conv_kernel) = {len(self.conv_kernel)}`." ) - # fine-tuning config parameters for SpecAugment: https://huggingface.co/papers/1904.08779 - self.apply_spec_augment = apply_spec_augment - self.mask_time_prob = mask_time_prob - self.mask_time_length = mask_time_length - self.mask_time_min_masks = mask_time_min_masks - self.mask_feature_prob = mask_feature_prob - self.mask_feature_length = mask_feature_length - - # parameters for pretraining with codevector quantized representations - self.num_codevectors_per_group = num_codevectors_per_group - self.num_codevector_groups = num_codevector_groups - self.contrastive_logits_temperature = contrastive_logits_temperature - self.num_negatives = num_negatives - self.codevector_dim = codevector_dim - self.proj_codevector_dim = proj_codevector_dim - self.diversity_loss_weight = diversity_loss_weight - - # ctc loss - self.ctc_loss_reduction = ctc_loss_reduction - self.ctc_zero_infinity = ctc_zero_infinity - - # adapter - self.add_adapter = add_adapter - self.adapter_kernel_size = adapter_kernel_size - self.adapter_stride = adapter_stride - self.num_adapter_layers = num_adapter_layers - self.output_hidden_size = output_hidden_size or hidden_size - - # SequenceClassification-specific parameter. Feel free to ignore for other classes. - self.classifier_proj_size = classifier_proj_size - - # XVector-specific parameters. Feel free to ignore for other classes. 
- self.tdnn_dim = list(tdnn_dim) - self.tdnn_kernel = list(tdnn_kernel) - self.tdnn_dilation = list(tdnn_dilation) - self.xvector_output_dim = xvector_output_dim - @property def inputs_to_logits_ratio(self): return functools.reduce(operator.mul, self.conv_stride, 1) diff --git a/src/transformers/models/wavlm/modeling_wavlm.py b/src/transformers/models/wavlm/modeling_wavlm.py index 63aab3ef41e1..18440ebf7d25 100755 --- a/src/transformers/models/wavlm/modeling_wavlm.py +++ b/src/transformers/models/wavlm/modeling_wavlm.py @@ -112,7 +112,7 @@ def __init__( self, embed_dim: int, num_heads: int, - dropout: float = 0.0, + dropout: float | int = 0.0, num_buckets: int = 320, max_distance: int = 800, has_relative_position_bias: bool = True, @@ -1055,7 +1055,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict extract_features = self.feature_extractor(input_values) extract_features = extract_features.transpose(1, 2) @@ -1190,7 +1190,7 @@ def forward( All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size - 1]`. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if labels is not None and labels.max() >= self.config.vocab_size: raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") @@ -1308,7 +1308,7 @@ def forward( `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states outputs = self.wavlm( @@ -1412,7 +1412,7 @@ def forward( `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states outputs = self.wavlm( @@ -1584,7 +1584,7 @@ def forward( `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states outputs = self.wavlm( diff --git a/src/transformers/models/wavlm/modular_wavlm.py b/src/transformers/models/wavlm/modular_wavlm.py index 1b17adbaef81..b3329e64913d 100644 --- a/src/transformers/models/wavlm/modular_wavlm.py +++ b/src/transformers/models/wavlm/modular_wavlm.py @@ -43,7 +43,7 @@ def __init__( self, embed_dim: int, num_heads: int, - dropout: float = 0.0, + dropout: float | int = 0.0, num_buckets: int = 320, max_distance: int = 800, has_relative_position_bias: bool = True, diff --git a/src/transformers/models/whisper/configuration_whisper.py b/src/transformers/models/whisper/configuration_whisper.py index b393e94fd70d..dd205ac36bec 100644 --- a/src/transformers/models/whisper/configuration_whisper.py +++ b/src/transformers/models/whisper/configuration_whisper.py @@ -13,11 +13,10 @@ # limitations under the License. """Whisper model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring # fmt: off @@ -47,6 +46,7 @@ @auto_docstring(checkpoint="openai/whisper-tiny") +@strict(accept_kwargs=True) class WhisperConfig(PreTrainedConfig): r""" max_source_positions (`int`, *optional*, defaults to 1500): @@ -121,98 +121,47 @@ class WhisperConfig(PreTrainedConfig): "num_key_value_heads": "encoder_attention_heads", "num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model", + "num_hidden_layers": "encoder_layers", } - def __init__( - self, - vocab_size=51865, - num_mel_bins=80, - encoder_layers=4, - encoder_attention_heads=6, - decoder_layers=4, - decoder_attention_heads=6, - decoder_ffn_dim=1536, - encoder_ffn_dim=1536, - encoder_layerdrop=0.0, - decoder_layerdrop=0.0, - decoder_start_token_id=50257, - use_cache=True, - is_encoder_decoder=True, - activation_function="gelu", - d_model=384, - dropout=0.0, - attention_dropout=0.0, - activation_dropout=0.0, - init_std=0.02, - scale_embedding=False, - max_source_positions=1500, - max_target_positions=448, - pad_token_id=50256, - bos_token_id=50256, - eos_token_id=50256, - suppress_tokens=None, - begin_suppress_tokens=[220, 50256], - use_weighted_layer_sum=False, - classifier_proj_size=256, - apply_spec_augment=False, - mask_time_prob=0.05, - mask_time_length=10, - mask_time_min_masks=2, - mask_feature_prob=0.0, - mask_feature_length=10, - mask_feature_min_masks=0, - median_filter_width=7, - tie_word_embeddings=True, - **kwargs, - ): - self.vocab_size = vocab_size - self.num_mel_bins = num_mel_bins - self.d_model = d_model - self.encoder_layers = encoder_layers - self.encoder_attention_heads = encoder_attention_heads - self.decoder_layers = decoder_layers - self.decoder_attention_heads = decoder_attention_heads - self.decoder_ffn_dim = decoder_ffn_dim - self.encoder_ffn_dim = encoder_ffn_dim - self.dropout = dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.activation_function = activation_function - self.init_std = init_std - self.encoder_layerdrop = encoder_layerdrop - self.decoder_layerdrop = decoder_layerdrop - self.use_cache = use_cache - self.num_hidden_layers = 
encoder_layers - self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True - self.max_source_positions = max_source_positions - self.max_target_positions = max_target_positions - - # Audio Classification-specific parameters. Feel free to ignore for other classes. - self.classifier_proj_size = classifier_proj_size - self.use_weighted_layer_sum = use_weighted_layer_sum - - # fine-tuning config parameters for SpecAugment: https://huggingface.co/papers/1904.08779 - self.apply_spec_augment = apply_spec_augment - self.mask_time_prob = mask_time_prob - self.mask_time_length = mask_time_length - self.mask_time_min_masks = mask_time_min_masks - self.mask_feature_prob = mask_feature_prob - self.mask_feature_length = mask_feature_length - self.mask_feature_min_masks = mask_feature_min_masks - - self.median_filter_width = median_filter_width - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.decoder_start_token_id = decoder_start_token_id - self.tie_word_embeddings = tie_word_embeddings - - super().__init__( - is_encoder_decoder=is_encoder_decoder, - suppress_tokens=suppress_tokens, - begin_suppress_tokens=begin_suppress_tokens, - **kwargs, - ) + vocab_size: int = 51865 + num_mel_bins: int = 80 + encoder_layers: int = 4 + encoder_attention_heads: int = 6 + decoder_layers: int = 4 + decoder_attention_heads: int = 6 + decoder_ffn_dim: int = 1536 + encoder_ffn_dim: int = 1536 + encoder_layerdrop: float | int = 0.0 + decoder_layerdrop: float | int = 0.0 + decoder_start_token_id: int = 50257 + use_cache: bool = True + is_encoder_decoder: bool = True + activation_function: str = "gelu" + d_model: int = 384 + dropout: float | int = 0.0 + attention_dropout: float | int = 0.0 + activation_dropout: float | int = 0.0 + init_std: float = 0.02 + scale_embedding: bool = False + max_source_positions: int = 1500 + max_target_positions: int = 448 + pad_token_id: int | None = 50256 + bos_token_id: int | None = 50256 + eos_token_id: int | None = 50256 + suppress_tokens: list | None = None + begin_suppress_tokens: list[int] | tuple[int, ...] | None = (220, 50256) + use_weighted_layer_sum: bool = False + classifier_proj_size: int = 256 + apply_spec_augment: bool = False + mask_time_prob: float = 0.05 + mask_time_length: int = 10 + mask_time_min_masks: int = 2 + mask_feature_prob: float = 0.0 + mask_feature_length: int = 10 + mask_feature_min_masks: int = 0 + median_filter_width: int = 7 + tie_word_embeddings: bool = True __all__ = ["WhisperConfig"] diff --git a/src/transformers/models/x_clip/configuration_x_clip.py b/src/transformers/models/x_clip/configuration_x_clip.py index 53ded305becc..6a25787a1000 100644 --- a/src/transformers/models/x_clip/configuration_x_clip.py +++ b/src/transformers/models/x_clip/configuration_x_clip.py @@ -13,6 +13,8 @@ # limitations under the License. 
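For WhisperConfig above, the removed `self.num_hidden_layers = encoder_layers` assignment is replaced by the new `"num_hidden_layers": "encoder_layers"` entry in `attribute_map`, so the value is aliased rather than stored twice. A small sketch of the intended effect, assuming the attribute-map indirection of the base class:

from transformers import WhisperConfig

config = WhisperConfig(encoder_layers=6)
print(config.num_hidden_layers)  # 6, read through the attribute_map alias
print(config.hidden_size)        # 384, since "hidden_size" was already aliased to "d_model"
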
"""X-CLIP model configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring, logging @@ -21,6 +23,7 @@ @auto_docstring(checkpoint="microsoft/xclip-base-patch32") +@strict(accept_kwargs=True) class XCLIPTextConfig(PreTrainedConfig): r""" Example: @@ -41,43 +44,24 @@ class XCLIPTextConfig(PreTrainedConfig): model_type = "xclip_text_model" base_config_key = "text_config" - def __init__( - self, - vocab_size=49408, - hidden_size=512, - intermediate_size=2048, - num_hidden_layers=12, - num_attention_heads=8, - max_position_embeddings=77, - hidden_act="quick_gelu", - layer_norm_eps=1e-5, - attention_dropout=0.0, - initializer_range=0.02, - initializer_factor=1.0, - pad_token_id=1, - bos_token_id=0, - eos_token_id=2, - **kwargs, - ): - super().__init__(**kwargs) - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.max_position_embeddings = max_position_embeddings - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.initializer_factor = initializer_factor - self.attention_dropout = attention_dropout + vocab_size: int = 49408 + hidden_size: int = 512 + intermediate_size: int = 2048 + num_hidden_layers: int = 12 + num_attention_heads: int = 8 + max_position_embeddings: int = 77 + hidden_act: str = "quick_gelu" + layer_norm_eps: float = 1e-5 + attention_dropout: float | int = 0.0 + initializer_range: float = 0.02 + initializer_factor: float = 1.0 + pad_token_id: int | None = 1 + bos_token_id: int | None = 0 + eos_token_id: int | None = 2 @auto_docstring(checkpoint="microsoft/xclip-base-patch32") +@strict(accept_kwargs=True) class XCLIPVisionConfig(PreTrainedConfig): r""" num_frames (`int`, *optional*, defaults to 8): @@ -110,51 +94,28 @@ class XCLIPVisionConfig(PreTrainedConfig): model_type = "xclip_vision_model" base_config_key = "vision_config" - def __init__( - self, - hidden_size=768, - intermediate_size=3072, - num_hidden_layers=12, - num_attention_heads=12, - mit_hidden_size=512, - mit_intermediate_size=2048, - mit_num_hidden_layers=1, - mit_num_attention_heads=8, - num_channels=3, - image_size=224, - patch_size=32, - num_frames=8, - hidden_act="quick_gelu", - layer_norm_eps=1e-5, - attention_dropout=0.0, - initializer_range=0.02, - initializer_factor=1.0, - drop_path_rate=0.0, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.mit_hidden_size = mit_hidden_size - self.mit_intermediate_size = mit_intermediate_size - self.mit_num_hidden_layers = mit_num_hidden_layers - self.mit_num_attention_heads = mit_num_attention_heads - self.num_channels = num_channels - self.patch_size = patch_size - self.num_frames = num_frames - self.image_size = image_size - self.initializer_range = initializer_range - self.initializer_factor = initializer_factor - self.attention_dropout = attention_dropout - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act - self.drop_path_rate = drop_path_rate + hidden_size: int = 768 + intermediate_size: int = 3072 + num_hidden_layers: int = 12 + num_attention_heads: int = 
12 + mit_hidden_size: int = 512 + mit_intermediate_size: int = 2048 + mit_num_hidden_layers: int = 1 + mit_num_attention_heads: int = 8 + num_channels: int = 3 + image_size: int | list[int] | tuple[int, int] = 224 + patch_size: int | list[int] | tuple[int, int] = 32 + num_frames: int = 8 + hidden_act: str = "quick_gelu" + layer_norm_eps: float = 1e-5 + attention_dropout: float | int = 0.0 + initializer_range: float = 0.02 + initializer_factor: float = 1.0 + drop_path_rate: float = 0.0 @auto_docstring(checkpoint="microsoft/xclip-base-patch32") +@strict(accept_kwargs=True) class XCLIPConfig(PreTrainedConfig): r""" prompt_layers (`int`, *optional*, defaults to 2): @@ -175,33 +136,43 @@ class XCLIPConfig(PreTrainedConfig): model_type = "xclip" sub_configs = {"text_config": XCLIPTextConfig, "vision_config": XCLIPVisionConfig} - def __init__( - self, - text_config=None, - vision_config=None, - projection_dim=512, - prompt_layers=2, - prompt_alpha=0.1, - prompt_hidden_act="quick_gelu", - prompt_num_attention_heads=8, - prompt_attention_dropout=0.0, - prompt_projection_dropout=0.0, - logit_scale_init_value=2.6592, - **kwargs, - ): - # If `_config_dict` exist, we use them for the backward compatibility. - # We pop out these 2 attributes before calling `super().__init__` to avoid them being saved (which causes a lot - # of confusion!). - text_config_dict = kwargs.pop("text_config_dict", None) - vision_config_dict = kwargs.pop("vision_config_dict", None) + text_config: dict | PreTrainedConfig | None = None + vision_config: dict | PreTrainedConfig | None = None + projection_dim: int = 512 + prompt_layers: int = 2 + prompt_alpha: float = 0.1 + prompt_hidden_act: str = "quick_gelu" + prompt_num_attention_heads: int = 8 + prompt_attention_dropout: float | int = 0.0 + prompt_projection_dropout: float | int = 0.0 + logit_scale_init_value: float = 2.6592 + initializer_factor: float = 1.0 + + def __post_init__(self, **kwargs): + if self.text_config is None: + text_config = {} + logger.info("`text_config` is `None`. Initializing the `XCLIPTextConfig` with default values.") + elif isinstance(self.text_config, XCLIPTextConfig): + text_config = self.text_config.to_dict() + else: + text_config = self.text_config + + if self.vision_config is None: + vision_config = {} + logger.info("`vision_config` is `None`. initializing the `XCLIPVisionConfig` with default values.") + elif isinstance(self.vision_config, XCLIPVisionConfig): + vision_config = self.vision_config.to_dict() + else: + vision_config = self.vision_config + # For backward compatibility check keyword args # Instead of simply assigning `[text|vision]_config_dict` to `[text|vision]_config`, we use the values in # `[text|vision]_config_dict` to update the values in `[text|vision]_config`. The values should be same in most # cases, but we don't want to break anything regarding `_config_dict` that existed before commit `8827e1b2`. - if text_config_dict is not None: - if text_config is None: - text_config = {} + text_config_dict = kwargs.pop("text_config_dict", None) + vision_config_dict = kwargs.pop("vision_config_dict", None) + if text_config_dict is not None: # This is the complete result when using `text_config_dict`. _text_config_dict = XCLIPTextConfig(**text_config_dict).to_dict() @@ -226,9 +197,6 @@ def __init__( text_config.update(_text_config_dict) if vision_config_dict is not None: - if vision_config is None: - vision_config = {} - # This is the complete result when using `vision_config_dict`. 
_vision_config_dict = XCLIPVisionConfig(**vision_config_dict).to_dict() # convert keys to string instead of integer @@ -257,32 +225,11 @@ def __init__( # Update all values in `vision_config` with the ones in `_vision_config_dict`. vision_config.update(_vision_config_dict) - if text_config is None: - text_config = XCLIPTextConfig() - logger.info("`text_config` is `None`. initializing the `XCLIPTextConfig` with default values.") - elif isinstance(text_config, dict): - text_config = XCLIPTextConfig(**text_config) + # Finally we can convert back our unified text/vision configs to `PretrainedConfig` + self.text_config = XCLIPTextConfig(**text_config) + self.vision_config = XCLIPVisionConfig(**vision_config) - if vision_config is None: - vision_config = XCLIPVisionConfig() - logger.info("`vision_config` is `None`. initializing the `XCLIPVisionConfig` with default values.") - elif isinstance(vision_config, dict): - vision_config = XCLIPVisionConfig(**vision_config) - - self.text_config = text_config - self.vision_config = vision_config - - self.projection_dim = projection_dim - self.prompt_layers = prompt_layers - self.prompt_alpha = prompt_alpha - self.prompt_hidden_act = prompt_hidden_act - self.prompt_num_attention_heads = prompt_num_attention_heads - self.prompt_attention_dropout = prompt_attention_dropout - self.prompt_projection_dropout = prompt_projection_dropout - self.logit_scale_init_value = logit_scale_init_value - self.initializer_factor = 1.0 - - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["XCLIPConfig", "XCLIPTextConfig", "XCLIPVisionConfig"] diff --git a/src/transformers/models/xcodec/configuration_xcodec.py b/src/transformers/models/xcodec/configuration_xcodec.py index 6c0479d425a6..1308d1c6efb7 100644 --- a/src/transformers/models/xcodec/configuration_xcodec.py +++ b/src/transformers/models/xcodec/configuration_xcodec.py @@ -16,6 +16,7 @@ import math import numpy as np +from huggingface_hub.dataclasses import strict from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring @@ -23,6 +24,7 @@ @auto_docstring(checkpoint="Manel/X-Codec") +@strict(accept_kwargs=True) class XcodecConfig(PreTrainedConfig): r""" target_bandwidths (`List[float]`, *optional*, defaults to `[0.5, 1, 1.5, 2, 4]`): @@ -74,57 +76,48 @@ class XcodecConfig(PreTrainedConfig): _default_semantic_model_config_kwargs = {} - def __init__( - self, - target_bandwidths: list[float] | None = None, - sample_rate: int = 16000, - kernel_size: int = 3, - channel_ratios: list[float] = [1, 1], - strides: list[int] = [1, 1], - block_dilations: list[int] = [1, 1], - unit_kernel_size: int = 3, - codebook_size: int = 1024, - codebook_dim: int | None = None, - initializer_range: float = 0.02, - acoustic_model_config=None, - semantic_model_config=None, - **kwargs, - ): - if isinstance(acoustic_model_config, dict): - acoustic_model_config["model_type"] = acoustic_model_config.get("model_type", "dac") - acoustic_model_config = CONFIG_MAPPING[acoustic_model_config["model_type"]]( - **{**self._default_acoustic_model_config_kwargs, **acoustic_model_config} + target_bandwidths: list[int | float] | tuple[int | float, ...] = (0.5, 1, 1.5, 2, 4) + sample_rate: int = 16000 + kernel_size: int = 3 + channel_ratios: list[int] | tuple[int, ...] = (1, 1) + strides: list[int] | tuple[int, ...] = (1, 1) + block_dilations: list[int] | tuple[int, ...] 
= (1, 1) + unit_kernel_size: int = 3 + codebook_size: int = 1024 + codebook_dim: int | None = None + initializer_range: float = 0.02 + acoustic_model_config: dict | PreTrainedConfig | None = None + semantic_model_config: dict | PreTrainedConfig | None = None + + def __post_init__(self, **kwargs): + if self.acoustic_model_config is None: + self.acoustic_model_config = CONFIG_MAPPING["dac"]( + encoder_hidden_size=64, + # NOTE: original DAC uses [2, 4, 8, 8] `downsampling ratios`, namely reverse of `upsampling_ratios` + # (not sure if intentional by Xcodec but we keep it) + downsampling_ratios=[8, 5, 4, 2], + decoder_hidden_size=1024, + upsampling_ratios=[8, 5, 4, 2], + hidden_size=256, ) - elif acoustic_model_config is None: - acoustic_model_config = CONFIG_MAPPING["dac"](**self._default_acoustic_model_config_kwargs) - self.acoustic_model_config = acoustic_model_config - - if isinstance(semantic_model_config, dict): - semantic_model_config["model_type"] = semantic_model_config.get("model_type", "hubert") - semantic_model_config = CONFIG_MAPPING[semantic_model_config["model_type"]]( - **{**self._default_semantic_model_config_kwargs, **semantic_model_config} + elif isinstance(self.acoustic_model_config, dict): + self.acoustic_model_config["model_type"] = self.acoustic_model_config.get("model_type", "dac") + self.acoustic_model_config = CONFIG_MAPPING[self.acoustic_model_config["model_type"]]( + **{**self._default_acoustic_model_config_kwargs, **self.acoustic_model_config} ) - elif semantic_model_config is None: - semantic_model_config = CONFIG_MAPPING["hubert"](**self._default_semantic_model_config_kwargs) - self.semantic_model_config = semantic_model_config - - if target_bandwidths is None: - target_bandwidths = [0.5, 1, 1.5, 2, 4] - - self.target_bandwidths = target_bandwidths - self.sample_rate = sample_rate - self.kernel_size = kernel_size - self.channel_ratios = channel_ratios - self.strides = strides - self.block_dilations = block_dilations - self.unit_kernel_size = unit_kernel_size - self.codebook_size = codebook_size - self.initializer_range = initializer_range - if codebook_dim is None: - codebook_dim = self.acoustic_model_config.hidden_size + self.semantic_model_config.hidden_size - self.codebook_dim = codebook_dim - - super().__init__(**kwargs) + + if self.semantic_model_config is None: + self.semantic_model_config = CONFIG_MAPPING["hubert"]() + elif isinstance(self.semantic_model_config, dict): + self.semantic_model_config["model_type"] = self.semantic_model_config.get("model_type", "hubert") + self.semantic_model_config = CONFIG_MAPPING[self.semantic_model_config["model_type"]]( + **{**self._default_semantic_model_config_kwargs, **self.semantic_model_config} + ) + + if self.codebook_dim is None: + self.codebook_dim = self.acoustic_model_config.hidden_size + self.semantic_model_config.hidden_size + + super().__post_init__(**kwargs) @property def frame_rate(self) -> int: diff --git a/src/transformers/models/xglm/configuration_xglm.py b/src/transformers/models/xglm/configuration_xglm.py index 1f29ba5b990e..26fa0bfd0c8c 100644 --- a/src/transformers/models/xglm/configuration_xglm.py +++ b/src/transformers/models/xglm/configuration_xglm.py @@ -13,14 +13,14 @@ # limitations under the License. 
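# --- illustrative sketch (not part of the patch) ------------------------------------
# Rough usage example for the Xcodec change above, assuming a branch where
# `XcodecConfig` already carries the dataclass-style fields and `__post_init__`
# from this diff. Field names come from the patch; the concrete values are made up.
from transformers import XcodecConfig

# Plain dicts are still accepted: __post_init__ routes them through CONFIG_MAPPING
# using the (defaulted) "model_type" key, just as the old __init__ did.
config = XcodecConfig(
    acoustic_model_config={"model_type": "dac", "hidden_size": 256},
    semantic_model_config={"model_type": "hubert", "hidden_size": 768},
)

# `codebook_dim` defaults to None, so __post_init__ derives it from the sub-configs.
assert config.codebook_dim == (
    config.acoustic_model_config.hidden_size + config.semantic_model_config.hidden_size
)
# -------------------------------------------------------------------------------------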
"""XGLM model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="facebook/xglm-564M") +@strict(accept_kwargs=True) class XGLMConfig(PreTrainedConfig): r""" Example: @@ -47,52 +47,26 @@ class XGLMConfig(PreTrainedConfig): "num_hidden_layers": "num_layers", } - def __init__( - self, - vocab_size=256008, - max_position_embeddings=2048, - d_model=1024, - ffn_dim=4096, - num_layers=24, - attention_heads=16, - activation_function="gelu", - dropout=0.1, - attention_dropout=0.1, - activation_dropout=0.0, - layerdrop=0.0, - init_std=0.02, - scale_embedding=True, - use_cache=True, - decoder_start_token_id=2, - pad_token_id=1, - bos_token_id=0, - eos_token_id=2, - add_cross_attention=False, - tie_word_embeddings=True, - **kwargs, - ): - self.add_cross_attention = add_cross_attention - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.d_model = d_model - self.ffn_dim = ffn_dim - self.num_layers = num_layers - self.attention_heads = attention_heads - self.activation_function = activation_function - self.dropout = dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.layerdrop = layerdrop - self.init_std = init_std - self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True - self.use_cache = use_cache - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - self.decoder_start_token_id = decoder_start_token_id - - super().__init__(**kwargs) + vocab_size: int = 256008 + max_position_embeddings: int = 2048 + d_model: int = 1024 + ffn_dim: int = 4096 + num_layers: int = 24 + attention_heads: int = 16 + activation_function: str = "gelu" + dropout: float | int = 0.1 + attention_dropout: float | int = 0.1 + activation_dropout: float | int = 0.0 + layerdrop: float | int = 0.0 + init_std: float = 0.02 + scale_embedding: bool = True + use_cache: bool = True + decoder_start_token_id: int = 2 + pad_token_id: int | None = 1 + bos_token_id: int | None = 0 + eos_token_id: int | None = 2 + add_cross_attention: bool = False + tie_word_embeddings: bool = True __all__ = ["XGLMConfig"] diff --git a/src/transformers/models/xlm/configuration_xlm.py b/src/transformers/models/xlm/configuration_xlm.py index f9247ca6f7ec..c9a6a4ba59a6 100644 --- a/src/transformers/models/xlm/configuration_xlm.py +++ b/src/transformers/models/xlm/configuration_xlm.py @@ -13,14 +13,14 @@ # limitations under the License. 
"""XLM configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="FacebookAI/xlm-mlm-en-2048") +@strict(accept_kwargs=True) class XLMConfig(PreTrainedConfig): r""" gelu_activation (`bool`, *optional*, defaults to `True`): @@ -108,79 +108,38 @@ class XLMConfig(PreTrainedConfig): "pad_index": "pad_token_id", } - def __init__( - self, - vocab_size=30145, - emb_dim=2048, - n_layers=12, - n_heads=16, - dropout=0.1, - attention_dropout=0.1, - gelu_activation=True, - sinusoidal_embeddings=False, - causal=False, - asm=False, - n_langs=1, - use_lang_emb=True, - max_position_embeddings=512, - embed_init_std=2048**-0.5, - layer_norm_eps=1e-12, - init_std=0.02, - unk_index=3, - mask_index=5, - is_encoder=True, - summary_type="first", - summary_use_proj=True, - summary_activation=None, - summary_proj_to_labels=True, - summary_first_dropout=0.1, - start_n_top=5, - end_n_top=5, - mask_token_id=0, - lang_id=0, - pad_token_id=2, - bos_token_id=0, - eos_token_id=1, - tie_word_embeddings=True, - **kwargs, - ): - self.vocab_size = vocab_size - self.emb_dim = emb_dim - self.n_layers = n_layers - self.n_heads = n_heads - self.dropout = dropout - self.attention_dropout = attention_dropout - self.gelu_activation = gelu_activation - self.sinusoidal_embeddings = sinusoidal_embeddings - self.causal = causal - self.asm = asm - self.n_langs = n_langs - self.use_lang_emb = use_lang_emb - self.layer_norm_eps = layer_norm_eps - self.unk_index = unk_index - self.mask_index = mask_index - self.is_encoder = is_encoder - self.max_position_embeddings = max_position_embeddings - self.embed_init_std = embed_init_std - self.init_std = init_std - self.summary_type = summary_type - self.summary_use_proj = summary_use_proj - self.summary_activation = summary_activation - self.summary_proj_to_labels = summary_proj_to_labels - self.summary_first_dropout = summary_first_dropout - self.start_n_top = start_n_top - self.end_n_top = end_n_top - self.mask_token_id = mask_token_id - self.lang_id = lang_id - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - - if "n_words" in kwargs: - self.n_words = kwargs["n_words"] - - super().__init__(**kwargs) + vocab_size: int = 30145 + emb_dim: int = 2048 + n_layers: int = 12 + n_heads: int = 16 + dropout: float | int = 0.1 + attention_dropout: float | int = 0.1 + gelu_activation: bool = True + sinusoidal_embeddings: bool = False + causal: bool = False + asm: bool = False + n_langs: int = 1 + use_lang_emb: bool = True + max_position_embeddings: int = 512 + embed_init_std: float = 2048**-0.5 + layer_norm_eps: float = 1e-12 + init_std: float = 0.02 + unk_index: int = 3 + mask_index: int = 5 + is_encoder: bool = True + summary_type: str = "first" + summary_use_proj: bool = True + summary_activation: str | None = None + summary_proj_to_labels: bool = True + summary_first_dropout: float | int = 0.1 + start_n_top: int = 5 + end_n_top: int = 5 + mask_token_id: int | None = 0 + lang_id: int = 0 + pad_token_id: int | None = 2 + bos_token_id: int | None = 0 + eos_token_id: int | None = 1 + tie_word_embeddings: bool = True __all__ = ["XLMConfig"] diff --git a/src/transformers/models/xlm/modeling_xlm.py b/src/transformers/models/xlm/modeling_xlm.py 
index 8b45a56a1b57..c300a0b2837c 100755 --- a/src/transformers/models/xlm/modeling_xlm.py +++ b/src/transformers/models/xlm/modeling_xlm.py @@ -782,7 +782,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if input_ids is not None: bs, slen = input_ids.size() @@ -1008,7 +1008,7 @@ def forward( `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict transformer_outputs = self.transformer( input_ids, @@ -1101,7 +1101,7 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict transformer_outputs = self.transformer( input_ids, @@ -1206,7 +1206,7 @@ def forward( Instance of `EncoderDecoderCache` that contains precomputed KV states. Can be used to speed up sequential decoding. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict transformer_outputs = self.transformer( input_ids, @@ -1334,7 +1334,7 @@ def forward( >>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions) >>> loss = outputs.loss ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict transformer_outputs = self.transformer( input_ids, @@ -1426,7 +1426,7 @@ def forward( labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.transformer( input_ids, @@ -1538,7 +1538,7 @@ def forward( num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above) """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None diff --git a/src/transformers/models/xlm_roberta/configuration_xlm_roberta.py b/src/transformers/models/xlm_roberta/configuration_xlm_roberta.py index eb37a543a794..c79bc9c6659f 100644 --- a/src/transformers/models/xlm_roberta/configuration_xlm_roberta.py +++ b/src/transformers/models/xlm_roberta/configuration_xlm_roberta.py @@ -14,14 +14,14 @@ # limitations under the License. 
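# --- illustrative sketch (not part of the patch) ------------------------------------
# The modeling edits above only rename the attribute that forward() falls back to:
# `return_dict` is now read directly from the config (a field defaulting to True on
# the base class) instead of the old `use_return_dict` property. A minimal sketch of
# the resolution pattern, assuming any config class from this branch:
from transformers import XLMConfig

config = XLMConfig()

def resolve_return_dict(return_dict=None):
    # mirrors `return_dict if return_dict is not None else self.config.return_dict`
    return return_dict if return_dict is not None else config.return_dict

assert resolve_return_dict() is True        # falls back to the config default
assert resolve_return_dict(False) is False  # an explicit argument still wins
# -------------------------------------------------------------------------------------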
"""XLM-RoBERTa configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="FacebookAI/xlm-mlm-en-2048") +@strict(accept_kwargs=True) class XLMRobertaConfig(PreTrainedConfig): r""" Examples: @@ -41,52 +41,26 @@ class XLMRobertaConfig(PreTrainedConfig): model_type = "xlm-roberta" - def __init__( - self, - vocab_size=30522, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=2, - initializer_range=0.02, - layer_norm_eps=1e-12, - pad_token_id=1, - bos_token_id=0, - eos_token_id=2, - use_cache=True, - classifier_dropout=None, - is_decoder=False, - add_cross_attention=False, - tie_word_embeddings=True, - **kwargs, - ): - super().__init__(**kwargs) - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - - self.is_decoder = is_decoder - self.add_cross_attention = add_cross_attention - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.use_cache = use_cache - self.classifier_dropout = classifier_dropout + vocab_size: int = 30522 + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.1 + attention_probs_dropout_prob: float = 0.1 + max_position_embeddings: int = 512 + type_vocab_size: int = 2 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + pad_token_id: int | None = 1 + bos_token_id: int | None = 0 + eos_token_id: int | None = 2 + use_cache: bool = True + classifier_dropout: float | int | None = None + is_decoder: bool = False + add_cross_attention: bool = False + tie_word_embeddings: bool = True __all__ = ["XLMRobertaConfig"] diff --git a/src/transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py b/src/transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py index 3b44f1f48bc7..c10ea187db5f 100644 --- a/src/transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py +++ b/src/transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py @@ -13,14 +13,14 @@ # limitations under the License. 
"""XLM_ROBERTa_XL configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="FacebookAI/xlm-roberta-xl") +@strict(accept_kwargs=True) class XLMRobertaXLConfig(PreTrainedConfig): r""" Examples: @@ -40,51 +40,26 @@ class XLMRobertaXLConfig(PreTrainedConfig): model_type = "xlm-roberta-xl" - def __init__( - self, - vocab_size=250880, - hidden_size=2560, - num_hidden_layers=36, - num_attention_heads=32, - intermediate_size=10240, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=514, - type_vocab_size=1, - initializer_range=0.02, - layer_norm_eps=1e-05, - pad_token_id=1, - bos_token_id=0, - eos_token_id=2, - use_cache=True, - classifier_dropout=None, - is_decoder=False, - add_cross_attention=False, - tie_word_embeddings=True, - **kwargs, - ): - super().__init__(**kwargs) - self.is_decoder = is_decoder - self.add_cross_attention = add_cross_attention - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.use_cache = use_cache - self.classifier_dropout = classifier_dropout + vocab_size: int = 250880 + hidden_size: int = 2560 + num_hidden_layers: int = 36 + num_attention_heads: int = 32 + intermediate_size: int = 10240 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.1 + attention_probs_dropout_prob: float = 0.1 + max_position_embeddings: int = 514 + type_vocab_size: int = 1 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-05 + pad_token_id: int | None = 1 + bos_token_id: int | None = 0 + eos_token_id: int | None = 2 + use_cache: bool = True + classifier_dropout: float | int | None = None + is_decoder: bool = False + add_cross_attention: bool = False + tie_word_embeddings: bool = True __all__ = ["XLMRobertaXLConfig"] diff --git a/src/transformers/models/xlnet/configuration_xlnet.py b/src/transformers/models/xlnet/configuration_xlnet.py index 07ae42633b23..a48ed19f6031 100644 --- a/src/transformers/models/xlnet/configuration_xlnet.py +++ b/src/transformers/models/xlnet/configuration_xlnet.py @@ -14,6 +14,8 @@ # limitations under the License. 
"""XLNet configuration""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring, logging @@ -22,6 +24,7 @@ @auto_docstring(checkpoint="xlnet/xlnet-large-cased") +@strict(accept_kwargs=True) class XLNetConfig(PreTrainedConfig): r""" ff_activation (`str` or `Callable`, *optional*, defaults to `"gelu"`): @@ -103,78 +106,47 @@ class XLNetConfig(PreTrainedConfig): "num_hidden_layers": "n_layer", } - def __init__( - self, - vocab_size=32000, - d_model=1024, - n_layer=24, - n_head=16, - d_inner=4096, - ff_activation="gelu", - attn_type="bi", - initializer_range=0.02, - layer_norm_eps=1e-12, - dropout=0.1, - mem_len=512, - reuse_len=None, - use_mems_eval=True, - use_mems_train=False, - bi_data=False, - clamp_len=-1, - same_length=False, - summary_type="last", - summary_use_proj=True, - summary_activation="tanh", - summary_last_dropout=0.1, - start_n_top=5, - end_n_top=5, - pad_token_id=5, - bos_token_id=1, - eos_token_id=2, - tie_word_embeddings=True, - **kwargs, - ): - self.vocab_size = vocab_size - self.d_model = d_model - self.n_layer = n_layer - self.n_head = n_head - if d_model % n_head != 0: - raise ValueError(f"'d_model % n_head' ({d_model % n_head}) should be equal to 0") - if "d_head" in kwargs: - if kwargs["d_head"] != d_model // n_head: - raise ValueError( - f"`d_head` ({kwargs['d_head']}) should be equal to `d_model // n_head` ({d_model // n_head})" - ) - self.d_head = d_model // n_head - self.ff_activation = ff_activation - self.d_inner = d_inner - self.attn_type = attn_type - - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - - self.dropout = dropout - self.mem_len = mem_len - self.reuse_len = reuse_len - self.bi_data = bi_data - self.clamp_len = clamp_len - self.same_length = same_length - - self.summary_type = summary_type - self.summary_use_proj = summary_use_proj - self.summary_activation = summary_activation - self.summary_last_dropout = summary_last_dropout - self.start_n_top = start_n_top - self.end_n_top = end_n_top - - self.bos_token_id = bos_token_id - self.pad_token_id = pad_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - - self.use_mems_eval = use_mems_eval - self.use_mems_train = use_mems_train - super().__init__(**kwargs) + vocab_size: int = 32000 + d_model: int = 1024 + n_layer: int = 24 + n_head: int = 16 + d_inner: int = 4096 + d_head: int | None = None + ff_activation: str = "gelu" + attn_type: str = "bi" + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + dropout: float | int = 0.1 + mem_len: int | None = 512 + reuse_len: int | None = None + use_mems_eval: bool = True + use_mems_train: bool = False + bi_data: bool = False + clamp_len: int = -1 + same_length: bool = False + summary_type: str = "last" + summary_use_proj: bool = True + summary_activation: str = "tanh" + summary_last_dropout: float | int = 0.1 + start_n_top: int = 5 + end_n_top: int = 5 + pad_token_id: int | None = 5 + bos_token_id: int | None = 1 + eos_token_id: int | None = 2 + tie_word_embeddings: bool = True + + def __post_init__(self, **kwargs): + self.d_head = self.d_head or self.d_model // self.n_head + super().__post_init__(**kwargs) + + def validate_architecture(self): + """Part of `@strict`-powered validation. 
Validates the architecture of the config.""" + if self.d_model % self.n_head != 0: + raise ValueError(f"'d_model % n_head' ({self.d_model % self.n_head}) should be equal to 0") + if self.d_head != self.d_model // self.n_head: + raise ValueError( + f"`d_head` ({self.d_head}) should be equal to `d_model // n_head` ({self.d_model // self.n_head})" + ) @property def max_position_embeddings(self): diff --git a/src/transformers/models/xlnet/modeling_xlnet.py b/src/transformers/models/xlnet/modeling_xlnet.py index c5a2446a4bb9..01486934ac37 100755 --- a/src/transformers/models/xlnet/modeling_xlnet.py +++ b/src/transformers/models/xlnet/modeling_xlnet.py @@ -1030,7 +1030,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if self.training: use_mems = use_mems if use_mems is not None else self.config.use_mems_train @@ -1403,7 +1403,7 @@ def forward( ... outputs.logits ... ) # Logits have shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size] ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict transformer_outputs = self.transformer( input_ids, @@ -1529,7 +1529,7 @@ def forward( states from previous forward passes to compute attention, which can significantly improve performance for sequential decoding tasks. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict transformer_outputs = self.transformer( input_ids, @@ -1656,7 +1656,7 @@ def forward( states from previous forward passes to compute attention, which can significantly improve performance for sequential decoding tasks. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.transformer( input_ids, @@ -1780,7 +1780,7 @@ def forward( states from previous forward passes to compute attention, which can significantly improve performance for sequential decoding tasks. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] @@ -1904,7 +1904,7 @@ def forward( states from previous forward passes to compute attention, which can significantly improve performance for sequential decoding tasks. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.transformer( input_ids, @@ -2057,7 +2057,7 @@ def forward( >>> loss = outputs.loss ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict transformer_outputs = self.transformer( input_ids, diff --git a/src/transformers/models/xlstm/configuration_xlstm.py b/src/transformers/models/xlstm/configuration_xlstm.py index f4d32e523b05..fd8901ed40a2 100644 --- a/src/transformers/models/xlstm/configuration_xlstm.py +++ b/src/transformers/models/xlstm/configuration_xlstm.py @@ -12,11 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. - """xLSTM configuration.""" +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, is_xlstm_available, logging +from ...utils import auto_docstring, is_xlstm_available if is_xlstm_available(): @@ -52,10 +53,8 @@ def round_up_to_next_multiple_of(x: int, multiple_of: int) -> int: external_xlstm = False -logger = logging.get_logger(__name__) - - @auto_docstring(checkpoint="NX-AI/xLSTM-7b") +@strict(accept_kwargs=True) class xLSTMConfig(PreTrainedConfig): """ num_blocks (int, optional, *optional*, defaults to 32): @@ -118,93 +117,45 @@ class xLSTMConfig(PreTrainedConfig): model_type = "xlstm" - def __init__( - self, - vocab_size: int = 50304, - hidden_size: int = 4096, - embedding_dim: int | None = None, - num_hidden_layers: int | None = 32, - num_blocks: int | None = None, - num_heads: int = 8, - use_bias: bool = False, - norm_reduction_force_float32: bool = True, - tie_word_embeddings: bool = False, - add_out_norm: bool = True, - norm_eps: float = 1e-6, - # mlstm_layer - qk_dim_factor: float = 0.5, - v_dim_factor: float = 1.0, - # mlstm backend - chunkwise_kernel: ChunkwiseKernelType = "chunkwise--native_autograd", - sequence_kernel: SequenceKernelType = "native_sequence__native", - step_kernel: StepKernelType = "native", - # needed to enable generation - mode: BackendModeType = "inference", - chunk_size: int = 64, - # needed to be true for generation - return_last_states: bool = True, - autocast_kernel_dtype: DtypeType = "bfloat16", - eps: float = 1e-6, - inference_state_dtype: DtypeType = "float32", - # feedforward - ffn_proj_factor: float = 2.667, - ffn_round_up_to_multiple_of: int = 64, - # capping - gate_soft_cap: float = 15.0, - output_logit_soft_cap: float = 30.0, - # weights - weight_mode: WeightModeType = "single", - # HF interface - use_cache: bool = True, - pad_token_id: int = 1, - bos_token_id: int = 0, - eos_token_id: int = 2, - max_inference_chunksize: int = 16384, - **kwargs, - ): - self.vocab_size = vocab_size - self.hidden_size = hidden_size if hidden_size is not None else embedding_dim - self.embedding_dim = embedding_dim if embedding_dim is not None else hidden_size - self.num_hidden_layers = num_hidden_layers if num_hidden_layers is not None else num_blocks - self.num_blocks = num_blocks if num_blocks is not None else num_hidden_layers - self.num_heads = num_heads - self.use_bias = use_bias - self.tie_word_embeddings = tie_word_embeddings - self.add_out_norm = add_out_norm - self.norm_eps = norm_eps - self.norm_reduction_force_float32 = norm_reduction_force_float32 - # mlstm_layer - 
self.qk_dim_factor = qk_dim_factor - self.v_dim_factor = v_dim_factor - # mlstm backend - self.chunkwise_kernel = chunkwise_kernel - self.sequence_kernel = sequence_kernel - self.step_kernel = step_kernel - self.mode = mode - self.chunk_size = chunk_size - self.return_last_states = return_last_states - self.autocast_kernel_dtype = autocast_kernel_dtype - self.eps = eps - self.inference_state_dtype = inference_state_dtype - # feedforward - self.ffn_proj_factor = ffn_proj_factor - self.ffn_round_up_to_multiple_of = ffn_round_up_to_multiple_of - # capping - self.gate_soft_cap = gate_soft_cap - self.output_logit_soft_cap = output_logit_soft_cap - self.weight_mode = weight_mode - - self.use_cache = use_cache - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.max_inference_chunksize = max_inference_chunksize - - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.tie_word_embeddings = tie_word_embeddings - super().__init__(**kwargs) + vocab_size: int = 50304 + hidden_size: int = 4096 + embedding_dim: int | None = None + num_hidden_layers: int = 32 + num_blocks: int | None = None + num_heads: int = 8 + use_bias: bool = False + norm_reduction_force_float32: bool = True + tie_word_embeddings: bool = False + add_out_norm: bool = True + norm_eps: float = 1e-6 + qk_dim_factor: float = 0.5 + v_dim_factor: float = 1.0 + chunkwise_kernel: ChunkwiseKernelType = "chunkwise--native_autograd" + sequence_kernel: SequenceKernelType = "native_sequence__native" + step_kernel: StepKernelType = "native" + mode: BackendModeType = "inference" + chunk_size: int = 64 + return_last_states: bool = True + autocast_kernel_dtype: DtypeType = "bfloat16" + eps: float = 1e-6 + inference_state_dtype: DtypeType = "float32" + ffn_proj_factor: float = 2.667 + ffn_round_up_to_multiple_of: int = 64 + gate_soft_cap: float = 15.0 + output_logit_soft_cap: float = 30.0 + weight_mode: WeightModeType = "single" + use_cache: bool = True + pad_token_id: int | None = 1 + bos_token_id: int | None = 0 + eos_token_id: int | list[int] | None = 2 + max_inference_chunksize: int = 16384 + + def __post_init__(self, **kwargs): + self.hidden_size = self.hidden_size if self.hidden_size is not None else self.embedding_dim + self.embedding_dim = self.embedding_dim if self.embedding_dim is not None else self.hidden_size + self.num_hidden_layers = self.num_hidden_layers if self.num_hidden_layers is not None else self.num_blocks + self.num_blocks = self.num_blocks if self.num_blocks is not None else self.num_hidden_layers + super().__post_init__(**kwargs) @property def qk_dim(self): diff --git a/src/transformers/models/xmod/configuration_xmod.py b/src/transformers/models/xmod/configuration_xmod.py index 6f67c80d5155..885540e2a17d 100644 --- a/src/transformers/models/xmod/configuration_xmod.py +++ b/src/transformers/models/xmod/configuration_xmod.py @@ -14,14 +14,14 @@ # limitations under the License. 
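# --- illustrative sketch (not part of the patch) ------------------------------------
# For xLSTM, __post_init__ keeps the two naming schemes in sync: hidden_size /
# embedding_dim and num_hidden_layers / num_blocks mirror whichever member of the
# pair was provided. A rough sketch, assuming the dataclass-based branch:
from transformers import xLSTMConfig

config = xLSTMConfig(hidden_size=2048, num_hidden_layers=16)
assert config.embedding_dim == 2048  # mirrored from hidden_size
assert config.num_blocks == 16       # mirrored from num_hidden_layers
# -------------------------------------------------------------------------------------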
"""X-MOD configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="facebook/xmod-base") +@strict(accept_kwargs=True) class XmodConfig(PreTrainedConfig): r""" pre_norm (`bool`, *optional*, defaults to `False`): @@ -57,66 +57,33 @@ class XmodConfig(PreTrainedConfig): model_type = "xmod" - def __init__( - self, - vocab_size=30522, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=2, - initializer_range=0.02, - layer_norm_eps=1e-12, - pad_token_id=1, - bos_token_id=0, - eos_token_id=2, - use_cache=True, - classifier_dropout=None, - pre_norm=False, - adapter_reduction_factor=2, - adapter_layer_norm=False, - adapter_reuse_layer_norm=True, - ln_before_adapter=True, - languages=("en_XX",), - default_language=None, - is_decoder=False, - add_cross_attention=False, - tie_word_embeddings=True, - **kwargs, - ): - super().__init__(**kwargs) - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - - self.is_decoder = is_decoder - self.add_cross_attention = add_cross_attention - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.use_cache = use_cache - self.classifier_dropout = classifier_dropout - self.pre_norm = pre_norm - self.adapter_reduction_factor = adapter_reduction_factor - self.adapter_layer_norm = adapter_layer_norm - self.adapter_reuse_layer_norm = adapter_reuse_layer_norm - self.ln_before_adapter = ln_before_adapter - self.languages = list(languages) - self.default_language = default_language + vocab_size: int = 30522 + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.1 + attention_probs_dropout_prob: float = 0.1 + max_position_embeddings: int = 512 + type_vocab_size: int = 2 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + pad_token_id: int | None = 1 + bos_token_id: int | None = 0 + eos_token_id: int | None = 2 + use_cache: bool = True + classifier_dropout: float | int | None = None + pre_norm: bool = False + adapter_reduction_factor: int = 2 + adapter_layer_norm: bool = False + adapter_reuse_layer_norm: bool = True + ln_before_adapter: bool = True + languages: list[str] | tuple[str, ...] 
= ("en_XX",) + default_language: str | None = None + is_decoder: bool = False + add_cross_attention: bool = False + tie_word_embeddings: bool = True __all__ = ["XmodConfig"] diff --git a/src/transformers/models/yolos/configuration_yolos.py b/src/transformers/models/yolos/configuration_yolos.py index 5149cbe3194b..966632e9c47d 100644 --- a/src/transformers/models/yolos/configuration_yolos.py +++ b/src/transformers/models/yolos/configuration_yolos.py @@ -13,14 +13,14 @@ # limitations under the License. """YOLOS model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="hustvl/yolos-base") +@strict(accept_kwargs=True) class YolosConfig(PreTrainedConfig): r""" num_detection_tokens (`int`, *optional*, defaults to 100): @@ -45,58 +45,28 @@ class YolosConfig(PreTrainedConfig): model_type = "yolos" - def __init__( - self, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - initializer_range=0.02, - layer_norm_eps=1e-12, - image_size=[512, 864], - patch_size=16, - num_channels=3, - qkv_bias=True, - num_detection_tokens=100, - use_mid_position_embeddings=True, - auxiliary_loss=False, - class_cost=1, - bbox_cost=5, - giou_cost=2, - bbox_loss_coefficient=5, - giou_loss_coefficient=2, - eos_coefficient=0.1, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.qkv_bias = qkv_bias - self.num_detection_tokens = num_detection_tokens - self.use_mid_position_embeddings = use_mid_position_embeddings - self.auxiliary_loss = auxiliary_loss - # Hungarian matcher - self.class_cost = class_cost - self.bbox_cost = bbox_cost - self.giou_cost = giou_cost - # Loss coefficients - self.bbox_loss_coefficient = bbox_loss_coefficient - self.giou_loss_coefficient = giou_loss_coefficient - self.eos_coefficient = eos_coefficient + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.0 + attention_probs_dropout_prob: float = 0.0 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + image_size: list[int] | tuple[int, ...] 
= (512, 864) + patch_size: int | list[int] | tuple[int, int] = 16 + num_channels: int = 3 + qkv_bias: bool = True + num_detection_tokens: int = 100 + use_mid_position_embeddings: bool = True + auxiliary_loss: bool = False + class_cost: int = 1 + bbox_cost: int = 5 + giou_cost: int = 2 + bbox_loss_coefficient: int = 5 + giou_loss_coefficient: int = 2 + eos_coefficient: float = 0.1 __all__ = ["YolosConfig"] diff --git a/src/transformers/models/yoso/configuration_yoso.py b/src/transformers/models/yoso/configuration_yoso.py index 2de830a9e69b..f66e26783322 100644 --- a/src/transformers/models/yoso/configuration_yoso.py +++ b/src/transformers/models/yoso/configuration_yoso.py @@ -13,14 +13,14 @@ # limitations under the License. """YOSO model configuration""" -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="uw-madison/yoso-4096") +@strict(accept_kwargs=True) class YosoConfig(PreTrainedConfig): r""" use_expectation (`bool`, *optional*, defaults to `True`): @@ -53,58 +53,29 @@ class YosoConfig(PreTrainedConfig): model_type = "yoso" - def __init__( - self, - vocab_size=50265, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=4096, - type_vocab_size=1, - initializer_range=0.02, - layer_norm_eps=1e-12, - use_expectation=True, - hash_code_len=9, - num_hash=64, - conv_window=None, - use_fast_hash=True, - lsh_backward=True, - pad_token_id=1, - bos_token_id=0, - eos_token_id=2, - add_cross_attention=False, - tie_word_embeddings=True, - **kwargs, - ): - super().__init__(**kwargs) - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - - self.add_cross_attention = add_cross_attention - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.initializer_range = initializer_range - self.type_vocab_size = type_vocab_size - self.layer_norm_eps = layer_norm_eps - self.use_expectation = use_expectation - self.hash_code_len = hash_code_len - self.num_hash = num_hash - self.conv_window = conv_window - self.use_fast_hash = use_fast_hash - self.lsh_backward = lsh_backward + vocab_size: int = 50265 + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.1 + attention_probs_dropout_prob: float = 0.1 + max_position_embeddings: int = 4096 + type_vocab_size: int = 1 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + use_expectation: bool = True + hash_code_len: int = 9 + num_hash: int = 64 + conv_window: int | None = None + use_fast_hash: bool = True + lsh_backward: bool = True + pad_token_id: int | None = 1 + bos_token_id: int | None = 0 + eos_token_id: int | None = 2 + add_cross_attention: bool = False + tie_word_embeddings: bool 
= True __all__ = ["YosoConfig"] diff --git a/src/transformers/models/yoso/modeling_yoso.py b/src/transformers/models/yoso/modeling_yoso.py index 32105a05337f..b8c2c6dc6c41 100644 --- a/src/transformers/models/yoso/modeling_yoso.py +++ b/src/transformers/models/yoso/modeling_yoso.py @@ -649,7 +649,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -744,7 +744,7 @@ def forward( config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.yoso( input_ids, @@ -834,7 +834,7 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.yoso( input_ids, @@ -940,7 +940,7 @@ def forward( num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above) """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None @@ -1020,7 +1020,7 @@ def forward( labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.yoso( input_ids, @@ -1093,7 +1093,7 @@ def forward( return_dict: bool | None = None, **kwargs, ) -> tuple | QuestionAnsweringModelOutput: - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.yoso( input_ids, diff --git a/src/transformers/models/youtu/configuration_youtu.py b/src/transformers/models/youtu/configuration_youtu.py index 1c1a3b69f1f7..e9f2efb79697 100644 --- a/src/transformers/models/youtu/configuration_youtu.py +++ b/src/transformers/models/youtu/configuration_youtu.py @@ -23,12 +23,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
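# --- illustrative sketch (not part of the patch) ------------------------------------
# The field-style declaration above is not meant to change serialization: a YOSO
# config should still round-trip through to_dict()/from_dict(), so checkpoints saved
# with the old __init__-based class stay loadable. A quick check one might run,
# assuming this branch and the usual to_dict/from_dict behaviour:
from transformers import YosoConfig

config = YosoConfig(hash_code_len=8, num_hash=32)
restored = YosoConfig.from_dict(config.to_dict())
assert restored.hash_code_len == 8 and restored.num_hash == 32
# -------------------------------------------------------------------------------------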
+ +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring @auto_docstring(checkpoint="tencent/Youtu-LLM-2B") +@strict(accept_kwargs=True) class YoutuConfig(PreTrainedConfig): r""" embedding_initializer_range (`float`, *optional*): @@ -58,81 +62,46 @@ class YoutuConfig(PreTrainedConfig): } attribute_map = {} - def __init__( - self, - vocab_size: int | None = 128256, - hidden_size: int | None = 2048, - intermediate_size: int | None = 6144, - num_hidden_layers: int | None = 32, - num_attention_heads: int | None = 16, - num_key_value_heads: int | None = 16, - kv_lora_rank: int | None = 512, - q_lora_rank: int | None = 1536, - qk_rope_head_dim: int | None = 64, - v_head_dim: int | None = 128, - qk_nope_head_dim: int | None = 128, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 131072, - initializer_range: float | None = None, - embedding_initializer_range: float | None = None, - rms_norm_eps: int | None = 1e-6, - use_cache: bool | None = True, - pad_token_id: int | None = None, - bos_token_id: int | None = 128000, - eos_token_id: int | None = 128001, - tie_word_embeddings: bool | None = True, - rope_parameters: RopeParameters | dict[str, RopeParameters] = None, - rope_interleave: bool | None = True, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.kv_lora_rank = kv_lora_rank - self.q_lora_rank = q_lora_rank - self.qk_rope_head_dim = qk_rope_head_dim - self.v_head_dim = v_head_dim - self.qk_nope_head_dim = qk_nope_head_dim - self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim - self.head_dim = qk_rope_head_dim - self.rope_interleave = rope_interleave - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads + vocab_size: int = 128256 + hidden_size: int = 2048 + intermediate_size: int = 6144 + num_hidden_layers: int = 32 + num_attention_heads: int = 16 + num_key_value_heads: int = 16 + kv_lora_rank: int = 512 + q_lora_rank: int = 1536 + qk_rope_head_dim: int = 64 + v_head_dim: int | None = 128 + qk_nope_head_dim: int = 128 + hidden_act: str = "silu" + max_position_embeddings: int = 131072 + initializer_range: float | None = None + rms_norm_eps: float = 1e-6 + use_cache: bool = True + pad_token_id: int | None = None + bos_token_id: int | None = 128000 + eos_token_id: int | list[int] | None = 128001 + tie_word_embeddings: bool = True + rope_parameters: RopeParameters | dict | None = None + rope_interleave: bool | None = True + attention_bias: bool = False + attention_dropout: float | int | None = 0.0 + embedding_initializer_range: float | None = None - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.rope_parameters = rope_parameters - - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - super().__init__(**kwargs) - - # if initializer_range is None, set 
it to 2.0 / (5.0 * self.hidden_size) ** 0.5 (if hidden size is valid) + def __post_init__(self, **kwargs): if self.initializer_range is None: if self.hidden_size != 0: self.initializer_range = 2.0 / (5.0 * self.hidden_size) ** 0.5 else: self.initializer_range = 0.02 - # if embedding_initializer_range is None, set it to 2.0 * self.initializer_range - if embedding_initializer_range is None: - self.embedding_initializer_range = 2.0 * self.initializer_range - else: - self.embedding_initializer_range = embedding_initializer_range + self.embedding_initializer_range = self.embedding_initializer_range or 2.0 * self.initializer_range + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads + + self.qk_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim + self.head_dim = self.qk_rope_head_dim + super().__post_init__(**kwargs) __all__ = ["YoutuConfig"] diff --git a/src/transformers/models/youtu/modeling_youtu.py b/src/transformers/models/youtu/modeling_youtu.py index f0b4981fe01f..d40bef358da6 100644 --- a/src/transformers/models/youtu/modeling_youtu.py +++ b/src/transformers/models/youtu/modeling_youtu.py @@ -23,6 +23,7 @@ # See the License for the specific language governing permissions and # limitations under the License. + import math from collections.abc import Callable from typing import Optional diff --git a/src/transformers/models/youtu/modular_youtu.py b/src/transformers/models/youtu/modular_youtu.py index a77af6d4a4a2..5bcafd026414 100644 --- a/src/transformers/models/youtu/modular_youtu.py +++ b/src/transformers/models/youtu/modular_youtu.py @@ -17,11 +17,12 @@ # See the License for the specific language governing permissions and # limitations under the License. + import torch +from huggingface_hub.dataclasses import strict from torch import nn from ... 
import initialization as init -from ...modeling_rope_utils import RopeParameters from ...modeling_utils import PreTrainedModel from ...utils import auto_docstring, logging from ..deepseek_v3.configuration_deepseek_v3 import DeepseekV3Config @@ -41,6 +42,7 @@ @auto_docstring(checkpoint="tencent/Youtu-LLM-2B") +@strict(accept_kwargs=True) class YoutuConfig(DeepseekV3Config): r""" embedding_initializer_range (`float`, *optional*): @@ -64,88 +66,43 @@ class YoutuConfig(DeepseekV3Config): } attribute_map = {} - def __init__( - self, - vocab_size: int | None = 128256, - hidden_size: int | None = 2048, - intermediate_size: int | None = 6144, - num_hidden_layers: int | None = 32, - num_attention_heads: int | None = 16, - num_key_value_heads: int | None = 16, - kv_lora_rank: int | None = 512, - q_lora_rank: int | None = 1536, - qk_rope_head_dim: int | None = 64, - v_head_dim: int | None = 128, - qk_nope_head_dim: int | None = 128, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 131072, - initializer_range: float | None = None, - embedding_initializer_range: float | None = None, - rms_norm_eps: int | None = 1e-6, - use_cache: bool | None = True, - pad_token_id: int | None = None, - bos_token_id: int | None = 128000, - eos_token_id: int | None = 128001, - tie_word_embeddings: bool | None = True, - rope_parameters: RopeParameters | dict[str, RopeParameters] = None, - rope_interleave: bool | None = True, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - **kwargs, - ): - super().__init__( - vocab_size=vocab_size, - hidden_size=hidden_size, - intermediate_size=intermediate_size, - num_hidden_layers=num_hidden_layers, - num_attention_heads=num_attention_heads, - num_key_value_heads=num_key_value_heads, - kv_lora_rank=kv_lora_rank, - q_lora_rank=q_lora_rank, - qk_rope_head_dim=qk_rope_head_dim, - v_head_dim=v_head_dim, - qk_nope_head_dim=qk_nope_head_dim, - hidden_act=hidden_act, - max_position_embeddings=max_position_embeddings, - rms_norm_eps=rms_norm_eps, - use_cache=use_cache, - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - rope_parameters=rope_parameters, - rope_interleave=rope_interleave, - attention_bias=attention_bias, - attention_dropout=attention_dropout, - **kwargs, - ) - - # remove unused attribute - del self.n_shared_experts - del self.n_routed_experts - del self.routed_scaling_factor - del self.n_group - del self.topk_group - del self.num_experts_per_tok - del self.first_k_dense_replace - del self.norm_topk_prob - del self.pretraining_tp - del self.moe_intermediate_size - - # if initializer_range is None, set it to 2.0 / (5.0 * self.hidden_size) ** 0.5 (if hidden size is valid) + vocab_size: int = 128256 + hidden_size: int = 2048 + intermediate_size: int = 6144 + num_hidden_layers: int = 32 + num_attention_heads: int = 16 + num_key_value_heads: int = 16 + max_position_embeddings: int = 131072 + initializer_range: float | None = None + embedding_initializer_range: float | None = None + pad_token_id: int | None = None + bos_token_id: int | None = 128000 + eos_token_id: int | list[int] | None = 128001 + tie_word_embeddings: bool = True + + # remove unused attribute + n_shared_experts = AttributeError() + n_routed_experts = AttributeError() + routed_scaling_factor = AttributeError() + n_group = AttributeError() + topk_group = AttributeError() + num_experts_per_tok = AttributeError() + first_k_dense_replace = AttributeError() + norm_topk_prob = 
AttributeError() + pretraining_tp = AttributeError() + moe_intermediate_size = AttributeError() + + def __post_init__(self, **kwargs): if self.initializer_range is None: if self.hidden_size != 0: self.initializer_range = 2.0 / (5.0 * self.hidden_size) ** 0.5 else: self.initializer_range = 0.02 - # if embedding_initializer_range is None, set it to 2.0 * self.initializer_range - if embedding_initializer_range is None: - self.embedding_initializer_range = 2.0 * self.initializer_range - else: - self.embedding_initializer_range = embedding_initializer_range + self.embedding_initializer_range = self.embedding_initializer_range or 2.0 * self.initializer_range + super().__post_init__(**kwargs) - def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation: set | None = None, **kwargs): + def convert_rope_params_to_dict(self, **kwargs): raise AttributeError("Not overwritten for the Youtu model!") diff --git a/src/transformers/models/zamba/configuration_zamba.py b/src/transformers/models/zamba/configuration_zamba.py index eeb652d47c70..a1862cf4a39f 100644 --- a/src/transformers/models/zamba/configuration_zamba.py +++ b/src/transformers/models/zamba/configuration_zamba.py @@ -15,14 +15,14 @@ import math -from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging - +from huggingface_hub.dataclasses import strict -logger = logging.get_logger(__name__) +from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring @auto_docstring(checkpoint="Zyphra/Zamba-7B-v1") +@strict(accept_kwargs=True) class ZambaConfig(PreTrainedConfig): r""" attention_hidden_size (`int`, *optional*): @@ -54,95 +54,53 @@ class ZambaConfig(PreTrainedConfig): model_type = "zamba" keys_to_ignore_at_inference = ["past_key_values"] - def __init__( - self, - vocab_size=32000, - tie_word_embeddings=True, - hidden_size=3712, - attention_hidden_size=None, - intermediate_size=14848, - num_hidden_layers=76, - num_attention_heads=16, - attention_head_dim=None, - num_key_value_heads=16, - n_mamba_heads=2, - hidden_act="gelu", - hidden_mamba_act="silu", - initializer_range=0.02, - rms_norm_eps=1e-5, - use_cache=True, - num_logits_to_keep=1, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - max_position_embeddings=4096, - attention_dropout=0.0, - attn_layer_period=6, - attn_layer_offset=4, - use_mamba_kernels=True, - mamba_d_state=16, - mamba_d_conv=4, - mamba_expand=2, - mamba_dt_rank="auto", - time_step_min=0.001, - time_step_max=0.1, - time_step_floor=1e-4, - mamba_conv_bias=True, - mamba_proj_bias=False, - **kwargs, - ): - self.vocab_size = vocab_size - self.tie_word_embeddings = tie_word_embeddings - self.hidden_size = hidden_size - if attention_hidden_size is None: - self.attention_hidden_size = 2 * hidden_size - else: - self.attention_hidden_size = attention_hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - if attention_head_dim is None: - self.attention_head_dim = 2 * self.hidden_size // self.num_attention_heads - else: - self.attention_head_dim = attention_head_dim - self.max_position_embeddings = max_position_embeddings - self.attention_dropout = attention_dropout - - self.num_key_value_heads = num_key_value_heads - self.n_mamba_heads = n_mamba_heads - self.hidden_act = hidden_act - self.hidden_mamba_act = hidden_mamba_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - - self.use_cache = use_cache - 
self.num_logits_to_keep = num_logits_to_keep - - self.attn_layer_period = attn_layer_period - self.attn_layer_offset = attn_layer_offset - - self.use_mamba_kernels = use_mamba_kernels - self.mamba_d_state = mamba_d_state - self.mamba_d_conv = mamba_d_conv - self.mamba_expand = mamba_expand - self.mamba_dt_rank = math.ceil(self.hidden_size / 16) if mamba_dt_rank == "auto" else mamba_dt_rank - self.time_step_min = time_step_min - self.time_step_max = time_step_max - self.time_step_floor = time_step_floor - self.mamba_conv_bias = mamba_conv_bias - self.mamba_proj_bias = mamba_proj_bias - - self.layers_block_type = self._layers_block_type(num_hidden_layers, attn_layer_period, attn_layer_offset) - - assert (self.mamba_expand * self.hidden_size) % self.n_mamba_heads == 0, ( - "`intermediate_size` should be divisible by `n_mamba_heads`." + vocab_size: int = 32000 + tie_word_embeddings: bool = True + hidden_size: int = 3712 + attention_hidden_size: int | None = None + intermediate_size: int = 14848 + num_hidden_layers: int = 76 + num_attention_heads: int = 16 + attention_head_dim: int | None = None + num_key_value_heads: int = 16 + n_mamba_heads: int = 2 + hidden_act: str = "gelu" + hidden_mamba_act: str = "silu" + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-5 + use_cache: bool = True + num_logits_to_keep: int = 1 + pad_token_id: int | None = 0 + bos_token_id: int | None = 1 + eos_token_id: int | None = 2 + max_position_embeddings: int = 4096 + attention_dropout: float | int = 0.0 + attn_layer_period: int = 6 + attn_layer_offset: int = 4 + use_mamba_kernels: bool = True + mamba_d_state: int = 16 + mamba_d_conv: int = 4 + mamba_expand: int = 2 + mamba_dt_rank: str | int = "auto" + time_step_min: float = 0.001 + time_step_max: float = 0.1 + time_step_floor: float = 1e-4 + mamba_conv_bias: bool = True + mamba_proj_bias: bool = False + + def __post_init__(self, **kwargs): + self.attention_hidden_size = self.attention_hidden_size or 2 * self.hidden_size + self.attention_head_dim = self.attention_head_dim or 2 * self.hidden_size // self.num_attention_heads + self.mamba_dt_rank = math.ceil(self.hidden_size / 16) if self.mamba_dt_rank == "auto" else self.mamba_dt_rank + self.layers_block_type = self._layers_block_type( + self.num_hidden_layers, self.attn_layer_period, self.attn_layer_offset ) + super().__post_init__(**kwargs) - self.tie_word_embeddings = tie_word_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - super().__init__(**kwargs) + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + if (self.mamba_expand * self.hidden_size) % self.n_mamba_heads != 0: + raise ValueError("`intermediate_size` should be divisible by `n_mamba_heads`.") def _layers_block_type(self, num_hidden_layers, attn_layer_period, attn_layer_offset): layers = [ diff --git a/src/transformers/models/zamba2/configuration_zamba2.py b/src/transformers/models/zamba2/configuration_zamba2.py index 4c6d28af5706..d5f2673dc08d 100644 --- a/src/transformers/models/zamba2/configuration_zamba2.py +++ b/src/transformers/models/zamba2/configuration_zamba2.py @@ -14,12 +14,15 @@ # limitations under the License. 
+from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters from ...utils import auto_docstring @auto_docstring(checkpoint="Zyphra/Zamba2-2.7B") +@strict(accept_kwargs=True) class Zamba2Config(PreTrainedConfig): r""" mamba_ngroups (`int`, *optional*, defaults to 1): @@ -66,94 +69,60 @@ class Zamba2Config(PreTrainedConfig): attribute_map = {"head_dim": "attention_head_dim"} keys_to_ignore_at_inference = ["past_key_values"] - def __init__( - self, - vocab_size: int | None = 32000, - max_position_embeddings: int | None = 4096, - hidden_size: int | None = 2560, - num_hidden_layers: int | None = 54, - layers_block_type: list[str] | None = None, - mamba_d_state: int | None = 64, - mamba_d_conv: int | None = 4, - mamba_expand: int | None = 2, - mamba_ngroups: int | None = 1, - time_step_min: float | None = 0.001, - time_step_max: float | None = 0.1, - time_step_floor: int | None = 1e-4, - time_step_limit: int | None = None, - n_mamba_heads: int | None = 8, - use_conv_bias: bool | None = True, - chunk_size: int | None = 256, - use_mem_eff_path: bool | None = False, - add_bias_linear: bool | None = False, - intermediate_size: int | None = None, - hidden_act: str | None = "gelu", - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = None, - attention_dropout: float | None = 0.0, - num_mem_blocks: int | None = 1, - use_shared_attention_adapter: bool | None = False, - adapter_rank: int | None = 128, - use_mem_rope: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-5, - use_cache: bool | None = True, - num_logits_to_keep: int | None = 1, - pad_token_id: int | None = 0, - bos_token_id: int | None = 1, - eos_token_id: int | None = 2, - use_long_context: bool | None = False, - tie_word_embeddings: bool | None = True, - **kwargs, - ): - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.tie_word_embeddings = tie_word_embeddings - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - if intermediate_size is None: - self.intermediate_size = 4 * hidden_size - else: - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_mem_blocks = num_mem_blocks - self.attention_hidden_size = 2 * hidden_size - self.attention_head_dim = 2 * self.hidden_size // self.num_attention_heads - self.attention_dropout = attention_dropout - self.use_mem_rope = use_mem_rope - self.use_long_context = use_long_context - self.rope_parameters = rope_parameters + vocab_size: int = 32000 + max_position_embeddings: int = 4096 + hidden_size: int = 2560 + num_hidden_layers: int = 54 + layers_block_type: list[str] | None = None + mamba_d_state: int = 64 + mamba_d_conv: int = 4 + mamba_expand: int = 2 + mamba_ngroups: int = 1 + time_step_min: float = 0.001 + time_step_max: float = 0.1 + time_step_floor: float = 1e-4 + time_step_limit: list[float] | tuple[float, ...] 
| None = None + n_mamba_heads: int = 8 + use_conv_bias: bool = True + chunk_size: int = 256 + use_mem_eff_path: bool = False + add_bias_linear: bool = False + intermediate_size: int | None = None + hidden_act: str = "gelu" + num_attention_heads: int = 32 + num_key_value_heads: int | None = None + attention_dropout: float | int = 0.0 + num_mem_blocks: int = 1 + use_shared_attention_adapter: bool = False + adapter_rank: int = 128 + use_mem_rope: bool = False + rope_parameters: RopeParameters | dict | None = None + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-5 + use_cache: bool = True + num_logits_to_keep: int = 1 + pad_token_id: int | None = 0 + bos_token_id: int | None = 1 + eos_token_id: int | list[int] | None = 2 + use_long_context: bool = False + tie_word_embeddings: bool = True - self.mamba_d_state = mamba_d_state - self.mamba_d_conv = mamba_d_conv - self.mamba_expand = mamba_expand - self.add_bias_linear = add_bias_linear - self.mamba_ngroups = mamba_ngroups - self.n_mamba_heads = n_mamba_heads - self.mamba_headdim = int(mamba_expand * hidden_size) // n_mamba_heads - self.use_conv_bias = use_conv_bias - self.chunk_size = chunk_size - self.time_step_limit = time_step_limit - self.use_shared_attention_adapter = use_shared_attention_adapter - self.adapter_rank = adapter_rank - self.time_step_min = time_step_min - self.time_step_max = time_step_max - self.time_step_floor = time_step_floor - if use_long_context: + def __post_init__(self, **kwargs): + self.intermediate_size = self.intermediate_size or 4 * self.hidden_size + self.attention_hidden_size = 2 * self.hidden_size + self.attention_head_dim = 2 * self.hidden_size // self.num_attention_heads + self.mamba_headdim = int(self.mamba_expand * self.hidden_size) // self.n_mamba_heads + if self.use_long_context: self.max_position_embeddings = 16384 - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads - self.num_attention_heads = num_attention_heads + + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads + self.kv_channels = self.hidden_size // self.num_attention_heads self.num_query_groups = self.num_attention_heads + # Below, "mamba" stands for mamba layer, "hybrid" stands for hybrid layer (composed by a shared transformer followed by mamba layer) - if layers_block_type is None: + if self.layers_block_type is None: self.layers_block_type = ( ["mamba"] + (["mamba"] * 5 + ["hybrid"]) * 7 @@ -163,15 +132,8 @@ def __init__( + ["hybrid"] + ["mamba"] * 2 ) - else: - self.layers_block_type = layers_block_type - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.num_logits_to_keep = num_logits_to_keep self.hybrid_layer_ids = [index for index, type in enumerate(self.layers_block_type) if type == "hybrid"] - self.use_mem_eff_path = use_mem_eff_path - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["Zamba2Config"] diff --git a/src/transformers/models/zoedepth/configuration_zoedepth.py b/src/transformers/models/zoedepth/configuration_zoedepth.py index ff425aa200a8..852885d6d536 100644 --- a/src/transformers/models/zoedepth/configuration_zoedepth.py +++ b/src/transformers/models/zoedepth/configuration_zoedepth.py @@ -13,20 +13,23 @@ # limitations under the License. 
"""ZoeDepth model configuration""" +from typing import Literal + +from huggingface_hub.dataclasses import strict + from ...backbone_utils import consolidate_backbone_kwargs_to_config from ...configuration_utils import PreTrainedConfig -from ...utils import auto_docstring, logging +from ...utils import auto_docstring from ..auto.configuration_auto import AutoConfig -logger = logging.get_logger(__name__) - ZOEDEPTH_PRETRAINED_CONFIG_ARCHIVE_MAP = { "Intel/zoedepth-nyu": "https://huggingface.co/Intel/zoedepth-nyu/resolve/main/config.json", } @auto_docstring(checkpoint="Intel/zoedepth-nyu") +@strict(accept_kwargs=True) class ZoeDepthConfig(PreTrainedConfig): r""" batch_norm_eps (`float`, *optional*, defaults to 1e-05): @@ -110,45 +113,37 @@ class ZoeDepthConfig(PreTrainedConfig): model_type = "zoedepth" sub_configs = {"backbone_config": AutoConfig} - def __init__( - self, - backbone_config=None, - hidden_act="gelu", - initializer_range=0.02, - batch_norm_eps=1e-05, - readout_type="project", - reassemble_factors=[4, 2, 1, 0.5], - neck_hidden_sizes=[96, 192, 384, 768], - fusion_hidden_size=256, - head_in_index=-1, - use_batch_norm_in_fusion_residual=False, - use_bias_in_fusion_residual=None, - num_relative_features=32, - add_projection=False, - bottleneck_features=256, - num_attractors=[16, 8, 4, 1], - bin_embedding_dim=128, - attractor_alpha=1000, - attractor_gamma=2, - attractor_kind="mean", - min_temp=0.0212, - max_temp=50.0, - bin_centers_type="softplus", - bin_configurations=[{"n_bins": 64, "min_depth": 0.001, "max_depth": 10.0}], - num_patch_transformer_layers=None, - patch_transformer_hidden_size=None, - patch_transformer_intermediate_size=None, - patch_transformer_num_attention_heads=None, - **kwargs, - ): - if readout_type not in ["ignore", "add", "project"]: - raise ValueError("Readout_type must be one of ['ignore', 'add', 'project']") - - if attractor_kind not in ["mean", "sum"]: - raise ValueError("Attractor_kind must be one of ['mean', 'sum']") - - backbone_config, kwargs = consolidate_backbone_kwargs_to_config( - backbone_config=backbone_config, + backbone_config: dict | PreTrainedConfig | None = None + hidden_act: str = "gelu" + initializer_range: float = 0.02 + batch_norm_eps: float = 1e-05 + readout_type: Literal["ignore", "add", "project"] = "project" + reassemble_factors: list[int | float] | tuple[int | float, ...] = (4, 2, 1, 0.5) + neck_hidden_sizes: list[int] | tuple[int, ...] = (96, 192, 384, 768) + fusion_hidden_size: int = 256 + head_in_index: int = -1 + use_batch_norm_in_fusion_residual: bool = False + use_bias_in_fusion_residual: bool | None = None + num_relative_features: int = 32 + add_projection: bool = False + bottleneck_features: int = 256 + num_attractors: list[int] | tuple[int, ...] 
= (16, 8, 4, 1) + bin_embedding_dim: int = 128 + attractor_alpha: int = 1000 + attractor_gamma: int = 2 + attractor_kind: Literal["mean", "sum"] = "mean" + min_temp: float = 0.0212 + max_temp: float = 50.0 + bin_centers_type: str = "softplus" + bin_configurations: list[dict] | None = None + num_patch_transformer_layers: int | None = None + patch_transformer_hidden_size: int | None = None + patch_transformer_intermediate_size: int | None = None + patch_transformer_num_attention_heads: int | None = None + + def __post_init__(self, **kwargs): + self.backbone_config, kwargs = consolidate_backbone_kwargs_to_config( + backbone_config=self.backbone_config, default_config_type="beit", default_config_kwargs={ "image_size": 384, @@ -162,37 +157,9 @@ def __init__( }, **kwargs, ) + self.bin_configurations = self.bin_configurations or [{"n_bins": 64, "min_depth": 0.001, "max_depth": 10.0}] - self.backbone_config = backbone_config - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.batch_norm_eps = batch_norm_eps - self.readout_type = readout_type - self.reassemble_factors = reassemble_factors - self.neck_hidden_sizes = neck_hidden_sizes - self.fusion_hidden_size = fusion_hidden_size - self.head_in_index = head_in_index - self.use_batch_norm_in_fusion_residual = use_batch_norm_in_fusion_residual - self.use_bias_in_fusion_residual = use_bias_in_fusion_residual - self.num_relative_features = num_relative_features - self.add_projection = add_projection - - self.bottleneck_features = bottleneck_features - self.num_attractors = num_attractors - self.bin_embedding_dim = bin_embedding_dim - self.attractor_alpha = attractor_alpha - self.attractor_gamma = attractor_gamma - self.attractor_kind = attractor_kind - self.min_temp = min_temp - self.max_temp = max_temp - self.bin_centers_type = bin_centers_type - self.bin_configurations = bin_configurations - self.num_patch_transformer_layers = num_patch_transformer_layers - self.patch_transformer_hidden_size = patch_transformer_hidden_size - self.patch_transformer_intermediate_size = patch_transformer_intermediate_size - self.patch_transformer_num_attention_heads = patch_transformer_num_attention_heads - - super().__init__(**kwargs) + super().__post_init__(**kwargs) __all__ = ["ZOEDEPTH_PRETRAINED_CONFIG_ARCHIVE_MAP", "ZoeDepthConfig"]
diff --git a/src/transformers/models/zoedepth/modeling_zoedepth.py b/src/transformers/models/zoedepth/modeling_zoedepth.py index d385ca4080c2..7c58ad9901cc 100644 --- a/src/transformers/models/zoedepth/modeling_zoedepth.py +++ b/src/transformers/models/zoedepth/modeling_zoedepth.py @@ -1300,7 +1300,7 @@ def forward( if labels is not None: raise NotImplementedError("Training is not implemented yet") - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states )
diff --git a/src/transformers/utils/auto_docstring.py b/src/transformers/utils/auto_docstring.py index 73f83e3a7f5c..525998d4618d 100644 --- a/src/transformers/utils/auto_docstring.py +++ b/src/transformers/utils/auto_docstring.py @@ -561,6 +561,55 @@ class ProcessorArgs: class ConfigArgs: + output_hidden_states = { + "description": """ + Whether or not the model should return all hidden-states. + """, + } + + chunk_size_feed_forward = { + "description": """ + The chunk size of all feed forward layers in the residual attention blocks. A chunk size of `0` means that + the feed forward layer is not chunked. A chunk size of n means that the feed forward layer processes `n` < + sequence_length embeddings at a time. For more information on feed forward chunking, see [How does Feed + Forward Chunking work?](../glossary.html#feed-forward-chunking). + """, + } + + dtype = { + "description": """ + The `dtype` of the weights. This attribute can be used to initialize the model to a non-default `dtype` + (which is normally `float32`) and thus allow for optimal storage allocation. For example, if the saved + model is `float16`, ideally we want to load it back using the minimal amount of memory needed to load + `float16` weights. + """, + } + + id2label = { + "description": """ + A map from index (for instance prediction index, or target index) to label. + """, + } + + label2id = { + "description": """ + A map from label to index for the model. + """, + } + + problem_type = { + "description": """ + Problem type for `XxxForSequenceClassification` models. Can be one of `"regression"`, + `"single_label_classification"` or `"multi_label_classification"`. + """, + } + + tokenizer_class = { + "description": """ + The class name of model's tokenizer. + """, + } + vocab_size = { "description": """ Vocabulary size of the model. Defines the number of different tokens that can be represented by the `input_ids`. @@ -2530,6 +2579,7 @@ class ClassAttrs: ARGS_TO_IGNORE = {"self", "kwargs", "args", "deprecated_arguments"} +ARGS_TO_RENAME = {"_out_features": "out_features", "_out_indices": "out_indices"} def get_indent_level(func): @@ -3241,6 +3291,8 @@ def _process_regular_parameters( ): continue + param_name = ARGS_TO_RENAME.get(param_name, param_name) + # Process parameter type and optional status param_type, optional = _process_parameter_type(param) @@ -4021,10 +4073,6 @@ def auto_method_docstring( model_name_lowercase, class_name, config_class = _get_model_info(func, parent_class) func_documentation = func.__doc__ - # Temporary workaround for config classes until #41250 is merged. We usually add docs at class-lvl - if func_documentation is None and parent_class and parent_class.__name__.endswith("Config"): - func_documentation = parent_class.__doc__ - if custom_args is not None and func_documentation is not None: func_documentation = "\n" + set_min_indent(custom_args.strip("\n"), 0) + "\n" + func_documentation elif custom_args is not None: @@ -4079,6 +4127,7 @@ def auto_class_docstring(cls, custom_intro=None, custom_args=None, checkpoint=No is_dataclass = False is_processor = False + is_config = False is_image_processor = False docstring_init = "" docstring_args = "" @@ -4118,6 +4167,10 @@ def auto_class_docstring(cls, custom_intro=None, custom_args=None, checkpoint=No source_args_dict=get_args_doc_from_source(ImageProcessorArgs), ).__doc__ elif "PreTrainedConfig" in (x.__name__ for x in cls.__mro__): + is_config = True + doc_class = cls.__doc__ + if custom_args is None and doc_class: + custom_args = doc_class docstring_init = auto_method_docstring( cls.__init__, parent_class=cls, @@ -4152,7 +4205,7 @@ def auto_class_docstring(cls, custom_intro=None, custom_args=None, checkpoint=No f"`{cls.__name__}` is not registered in the auto doc. Here are the available classes: {ClassDocstring.__dict__.keys()}.\n" "Add a `custom_intro` to the decorator if you want to use `auto_docstring` on a class not registered in the auto doc."
) - if name != [] or custom_intro is not None or is_dataclass or is_processor or is_image_processor: + if name != [] or custom_intro is not None or is_config or is_dataclass or is_processor or is_image_processor: name = name[0] if name else None formatting_kwargs = {"model_name": model_name_title} if name == "Config": @@ -4183,7 +4236,7 @@ def auto_class_docstring(cls, custom_intro=None, custom_args=None, checkpoint=No # Add the __init__ docstring if docstring_init: docstring += set_min_indent(f"\n{docstring_init}", indent_level) - elif is_dataclass: + elif is_dataclass or is_config: # No init function, we have a data class docstring += docstring_args if docstring_args else "\nArgs:\n" source_args_dict = get_args_doc_from_source(ModelOutputArgs) diff --git a/src/transformers/utils/generic.py b/src/transformers/utils/generic.py index 379b23b58de6..d77aa0c156eb 100644 --- a/src/transformers/utils/generic.py +++ b/src/transformers/utils/generic.py @@ -840,7 +840,7 @@ def del_attribute_from_modules(module: nn.Module, key: str): def can_return_tuple(func): """ Decorator to wrap model method, to call output.to_tuple() if return_dict=False passed as a kwarg or - use_return_dict=False is set in the config. + return_dict=False is set in the config. Note: output.to_tuple() convert output to tuple skipping all `None` values. diff --git a/src/transformers/utils/type_validators.py b/src/transformers/utils/type_validators.py index 2600998277a6..08d4697683b2 100644 --- a/src/transformers/utils/type_validators.py +++ b/src/transformers/utils/type_validators.py @@ -1,6 +1,9 @@ -from collections.abc import Sequence +from collections.abc import Callable, Sequence +from functools import partial from typing import Any, Union, cast +from huggingface_hub.dataclasses import as_validated_field + from ..tokenization_utils_base import PaddingStrategy, TruncationStrategy from ..video_utils import VideoMetadataType from .generic import TensorType @@ -13,6 +16,10 @@ if is_torch_available(): import torch + from ..activations import ACT2FN +else: + ACT2FN = {} + def positive_any_number(value: int | float | None = None): if value is not None and (not isinstance(value, (int, float)) or not value >= 0): @@ -123,3 +130,123 @@ def tensor_type_validator(value: str | TensorType | None = None): pass elif not isinstance(value, str) or value not in possible_names: raise ValueError(f"The tensor type should be one of {possible_names} but got tensor_type={value}") + + +@as_validated_field +def label_to_id_validation(value: str | TensorType | None = None): + possible_names = ["pt", "np", "mlx"] + if value is None: + pass + elif not isinstance(value, str) or value not in possible_names: + raise ValueError(f"The tensor type should be one of {possible_names} but got tensor_type={value}") + + +def interval( + min: int | float | None = None, + max: int | float | None = None, + exclude_min: bool = False, + exclude_max: bool = False, +) -> Callable: + """ + Parameterized validator that ensures that `value` is within the defined interval. Optionally, the interval can be + open on either side. Expected usage: `interval(min=0)(default=8)` + + Args: + min (`int` or `float`, *optional*): + Minimum value of the interval. + max (`int` or `float`, *optional*): + Maximum value of the interval. + exclude_min (`bool`, *optional*, defaults to `False`): + If True, the minimum value is excluded from the interval. + exclude_max (`bool`, *optional*, defaults to `False`): + If True, the maximum value is excluded from the interval. 
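+
+    Example (illustrative; the field name here is arbitrary, the call mirrors the expected usage above):
+
+    ```python
+    # declares a validated dataclass field that must be >= 0 and defaults to 8
+    num_items = interval(min=0)(default=8)
+    ```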
+ """ + error_message = "Value must be" + if min is not None: + if exclude_min: + error_message += f" greater than {min}" + else: + error_message += f" greater or equal to {min}" + if min is not None and max is not None: + error_message += " and" + if max is not None: + if exclude_max: + error_message += f" smaller than {max}" + else: + error_message += f" smaller or equal to {max}" + error_message += ", got {value}." + + min = min or float("-inf") + max = max or float("inf") + + @as_validated_field + def _inner(value: int | float): + min_valid = min <= value if not exclude_min else min < value + max_valid = value <= max if not exclude_max else value < max + if not (min_valid and max_valid): + raise ValueError(error_message.format(value=value)) + + return _inner + + +@as_validated_field +def probability(value: float): + """Ensures that `value` is a valid probability number, i.e. [0,1].""" + if not 0 <= value <= 1: + raise ValueError(f"Value must be a probability between 0.0 and 1.0, got {value}.") + + +def is_divisible_by(divisor: int | float): + @as_validated_field + def _inner(value: int | float): + if value % divisor != 0: + raise ValueError(f"Value has to be divisble by {divisor} but got value={value}") + + return _inner + + +@as_validated_field +def activation_fn_key(value: str): + """Ensures that `value` is a string corresponding to an activation function.""" + # TODO (joao): in python 3.11+, we can build a Literal type from the keys of ACT2FN + if len(ACT2FN) > 0: # don't validate if we can't import ACT2FN + if value not in ACT2FN: + raise ValueError( + f"Value must be one of {list(ACT2FN.keys())}, got {value}. " + "Make sure to use a string that corresponds to an activation function." + ) + + +def tensor_shape(shape: tuple[int | str], length: int | None = None): + @as_validated_field + def validator(value: Union[Sequence["torch.Tensor"], "torch.Tensor"]): + if value is None: + return + elif not isinstance(value, (list, tuple)): + value = [value] + elif isinstance(length, int) and len(value) != length: + raise ValueError(f"Value has to be a list of length={length} but got {len(value)}") + + dimensions = {} + for tensor in value: + # Ensures that `value` is a floating point tensor in any device (cpu, cuda, xpu, ...). + # Using `torch.FloatTensor` as a type hint is discouraged if the dataclass has a `strict` + # decorator, because it enforces floating tensors only on CPU. + if not (isinstance(tensor, torch.Tensor) and tensor.is_floating_point()): + raise ValueError(f"Value has to be a floating point tensor but got value={tensor}") + + if len(tensor.shape) != len(shape): + raise ValueError(f"Expected shape {shape}, but got {tensor.shape}") + for dim, expected in zip(tensor.shape, shape): + if isinstance(expected, int) and dim != expected: + raise ValueError(f"Expected dimension {expected}, but got {dim}") + elif isinstance(expected, str): + if expected not in dimensions: + dimensions[expected] = dim + elif dimensions[expected] != dim: + raise ValueError( + f"Dimension '{expected}' takes different values: {dimensions[expected]} and {dim}." + " Please check your tensors shapes." 
+ ) + + return partial(validator, metadata={"shape": shape, "length": length}) diff --git a/tests/models/aimv2/test_modeling_aimv2.py b/tests/models/aimv2/test_modeling_aimv2.py index 7fe42bcda872..085e307512fe 100644 --- a/tests/models/aimv2/test_modeling_aimv2.py +++ b/tests/models/aimv2/test_modeling_aimv2.py @@ -184,7 +184,7 @@ class Aimv2VisionModelTest(Aimv2ModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = Aimv2VisionModelTester(self) self.config_tester = ConfigTester( - self, config_class=Aimv2VisionConfig, has_text_modality=False, hidden_size=37 + self, config_class=Aimv2VisionConfig, has_text_modality=False, hidden_size=32 ) def test_config(self): @@ -311,7 +311,7 @@ class Aimv2TextModelTest(Aimv2ModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = Aimv2TextModelTester(self) - self.config_tester = ConfigTester(self, config_class=Aimv2TextConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=Aimv2TextConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/albert/test_modeling_albert.py b/tests/models/albert/test_modeling_albert.py index a57ed59f1040..69a819ccf4ff 100644 --- a/tests/models/albert/test_modeling_albert.py +++ b/tests/models/albert/test_modeling_albert.py @@ -273,7 +273,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): def setUp(self): self.model_tester = AlbertModelTester(self) - self.config_tester = ConfigTester(self, config_class=AlbertConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=AlbertConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/align/test_modeling_align.py b/tests/models/align/test_modeling_align.py index 16429a31b029..66fc4566598b 100644 --- a/tests/models/align/test_modeling_align.py +++ b/tests/models/align/test_modeling_align.py @@ -144,7 +144,7 @@ def setUp(self): self, config_class=AlignVisionConfig, has_text_modality=False, - hidden_size=37, + hidden_size=32, common_properties=["num_channels", "image_size"], ) @@ -339,7 +339,7 @@ class AlignTextModelTest(ModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = AlignTextModelTester(self) - self.config_tester = ConfigTester(self, config_class=AlignTextConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=AlignTextConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/altclip/test_modeling_altclip.py b/tests/models/altclip/test_modeling_altclip.py index c1f46dc5cdc5..6f23ba55d214 100755 --- a/tests/models/altclip/test_modeling_altclip.py +++ b/tests/models/altclip/test_modeling_altclip.py @@ -137,7 +137,7 @@ class AltCLIPVisionModelTest(ModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = AltCLIPVisionModelTester(self) self.config_tester = ConfigTester( - self, config_class=AltCLIPVisionConfig, has_text_modality=False, hidden_size=37 + self, config_class=AltCLIPVisionConfig, has_text_modality=False, hidden_size=32 ) def test_config(self): @@ -296,7 +296,7 @@ def test_resize_tokens_embeddings(self): def setUp(self): self.model_tester = AltCLIPTextModelTester(self) - self.config_tester = ConfigTester(self, config_class=AltCLIPTextConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=AltCLIPTextConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git 
a/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py b/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py index f8560bd47f0f..fa4f25a2cec4 100644 --- a/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py +++ b/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py @@ -181,7 +181,7 @@ def is_pipeline_test_to_skip( def setUp(self): self.model_tester = ASTModelTester(self) - self.config_tester = ConfigTester(self, config_class=ASTConfig, has_text_modality=False, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=ASTConfig, has_text_modality=False, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/beit/test_modeling_beit.py b/tests/models/beit/test_modeling_beit.py index 2ca6648059e9..2400c35de262 100644 --- a/tests/models/beit/test_modeling_beit.py +++ b/tests/models/beit/test_modeling_beit.py @@ -264,7 +264,7 @@ class BeitModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = BeitModelTester(self) - self.config_tester = ConfigTester(self, config_class=BeitConfig, has_text_modality=False, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=BeitConfig, has_text_modality=False, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/bert/test_modeling_bert.py b/tests/models/bert/test_modeling_bert.py index 06ce9f9241d6..26883e37ce7b 100644 --- a/tests/models/bert/test_modeling_bert.py +++ b/tests/models/bert/test_modeling_bert.py @@ -479,7 +479,7 @@ def prepare_config_and_inputs_for_generate(self, batch_size=2): def setUp(self): self.model_tester = BertModelTester(self) - self.config_tester = ConfigTester(self, config_class=BertConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=BertConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/bert_generation/test_modeling_bert_generation.py b/tests/models/bert_generation/test_modeling_bert_generation.py index fe55e374b164..35d30a40a976 100644 --- a/tests/models/bert_generation/test_modeling_bert_generation.py +++ b/tests/models/bert_generation/test_modeling_bert_generation.py @@ -256,7 +256,7 @@ def prepare_config_and_inputs_for_generate(self, batch_size=2): def setUp(self): self.model_tester = BertGenerationEncoderTester(self) - self.config_tester = ConfigTester(self, config_class=BertGenerationConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=BertGenerationConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/big_bird/test_modeling_big_bird.py b/tests/models/big_bird/test_modeling_big_bird.py index dde30e1501ce..18049fbd2885 100644 --- a/tests/models/big_bird/test_modeling_big_bird.py +++ b/tests/models/big_bird/test_modeling_big_bird.py @@ -456,7 +456,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): def setUp(self): self.model_tester = BigBirdModelTester(self) - self.config_tester = ConfigTester(self, config_class=BigBirdConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=BigBirdConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/biogpt/test_modeling_biogpt.py b/tests/models/biogpt/test_modeling_biogpt.py index 
da5706353c8a..c757cb37c9db 100644 --- a/tests/models/biogpt/test_modeling_biogpt.py +++ b/tests/models/biogpt/test_modeling_biogpt.py @@ -273,7 +273,7 @@ class BioGptModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix def setUp(self): self.model_tester = BioGptModelTester(self) - self.config_tester = ConfigTester(self, config_class=BioGptConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=BioGptConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/bitnet/test_modeling_bitnet.py b/tests/models/bitnet/test_modeling_bitnet.py index f55597eddb43..73dea122e63f 100644 --- a/tests/models/bitnet/test_modeling_bitnet.py +++ b/tests/models/bitnet/test_modeling_bitnet.py @@ -157,7 +157,7 @@ def is_pipeline_test_to_skip( def setUp(self): self.model_tester = BitNetModelTester(self) - self.config_tester = ConfigTester(self, config_class=BitNetConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=BitNetConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/blip/test_modeling_blip.py b/tests/models/blip/test_modeling_blip.py index 7635fe1e18ee..47a30b88db83 100644 --- a/tests/models/blip/test_modeling_blip.py +++ b/tests/models/blip/test_modeling_blip.py @@ -154,7 +154,7 @@ class BlipVisionModelTest(ModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = BlipVisionModelTester(self) - self.config_tester = ConfigTester(self, config_class=BlipVisionConfig, has_text_modality=False, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=BlipVisionConfig, has_text_modality=False, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() @@ -308,7 +308,7 @@ class BlipTextModelTest(ModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = BlipTextModelTester(self) - self.config_tester = ConfigTester(self, config_class=BlipTextConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=BlipTextConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/blip/test_modeling_blip_text.py b/tests/models/blip/test_modeling_blip_text.py index 129a8b233b00..31bd340e2553 100644 --- a/tests/models/blip/test_modeling_blip_text.py +++ b/tests/models/blip/test_modeling_blip_text.py @@ -128,7 +128,7 @@ class BlipTextModelTest(ModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = BlipTextModelTester(self) - self.config_tester = ConfigTester(self, config_class=BlipTextConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=BlipTextConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/blip_2/test_modeling_blip_2.py b/tests/models/blip_2/test_modeling_blip_2.py index 4901748271d4..bbfaa25b36ff 100644 --- a/tests/models/blip_2/test_modeling_blip_2.py +++ b/tests/models/blip_2/test_modeling_blip_2.py @@ -162,7 +162,7 @@ class Blip2VisionModelTest(ModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = Blip2VisionModelTester(self) self.config_tester = ConfigTester( - self, config_class=Blip2VisionConfig, has_text_modality=False, hidden_size=37 + self, config_class=Blip2VisionConfig, has_text_modality=False, hidden_size=32 ) def test_config(self): @@ -1078,7 +1078,7 @@ def create_and_check_model(self, config, input_ids, attention_mask): result2 = model( input_ids, attention_mask=attention_mask, - 
return_dict=not config.use_return_dict, + return_dict=not config.return_dict, output_attentions=True, output_hidden_states=True, ) @@ -1230,7 +1230,7 @@ def create_and_check_model(self, config, pixel_values): with torch.no_grad(): result2 = model( pixel_values, - return_dict=not config.use_return_dict, + return_dict=not config.return_dict, output_attentions=True, output_hidden_states=True, ) diff --git a/tests/models/bridgetower/test_modeling_bridgetower.py b/tests/models/bridgetower/test_modeling_bridgetower.py index 74268c81c034..432a7a6278bf 100644 --- a/tests/models/bridgetower/test_modeling_bridgetower.py +++ b/tests/models/bridgetower/test_modeling_bridgetower.py @@ -57,7 +57,7 @@ def __init__( parent, hidden_act="gelu", hidden_size=64, - initializer_factor=1, + initializer_factor=1e-10, layer_norm_eps=1e-05, num_attention_heads=4, num_hidden_layers=2, @@ -108,7 +108,7 @@ def __init__( self, parent, hidden_size=64, - initializer_factor=1, + initializer_factor=1e-10, layer_norm_eps=1e-05, num_hidden_layers=2, init_layernorm_from_vision_encoder=False, @@ -329,7 +329,7 @@ def extract_output(self, outputs, model_class): def setUp(self): self.model_tester = BridgeTowerModelTester(self) - self.config_tester = ConfigTester(self, config_class=BridgeTowerConfig, hidden_size=37, vocab_size=99) + self.config_tester = ConfigTester(self, config_class=BridgeTowerConfig, hidden_size=32, vocab_size=99) def test_config(self): self.config_tester.run_common_tests() @@ -549,7 +549,7 @@ class BridgeTowerModelTrainingTest(unittest.TestCase): def setUp(self): self.model_tester = BridgeTowerModelTester(self) - self.config_tester = ConfigTester(self, config_class=BridgeTowerConfig, hidden_size=37, vocab_size=99) + self.config_tester = ConfigTester(self, config_class=BridgeTowerConfig, hidden_size=32, vocab_size=99) def _prepare_inputs_for_training(self, model_class): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/bros/test_modeling_bros.py b/tests/models/bros/test_modeling_bros.py index 5df0240b41ee..2761e99e4a2a 100644 --- a/tests/models/bros/test_modeling_bros.py +++ b/tests/models/bros/test_modeling_bros.py @@ -304,7 +304,7 @@ def is_pipeline_test_to_skip( def setUp(self): self.model_tester = BrosModelTester(self) - self.config_tester = ConfigTester(self, config_class=BrosConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=BrosConfig, hidden_size=32) def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): inputs_dict = copy.deepcopy(inputs_dict) diff --git a/tests/models/canine/test_modeling_canine.py b/tests/models/canine/test_modeling_canine.py index 9ed600ca3f09..bb14efdb7473 100644 --- a/tests/models/canine/test_modeling_canine.py +++ b/tests/models/canine/test_modeling_canine.py @@ -239,7 +239,7 @@ class CanineModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = CanineModelTester(self) # we set has_text_modality to False as the config has no vocab_size attribute - self.config_tester = ConfigTester(self, config_class=CanineConfig, has_text_modality=False, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=CanineConfig, has_text_modality=False, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/chameleon/test_modeling_chameleon.py b/tests/models/chameleon/test_modeling_chameleon.py index 2c3dc2457e8a..fcc1315d0e89 100644 --- a/tests/models/chameleon/test_modeling_chameleon.py 
+++ b/tests/models/chameleon/test_modeling_chameleon.py @@ -219,7 +219,7 @@ class ChameleonModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTester def setUp(self): self.model_tester = ChameleonModelTester(self) - self.config_tester = ConfigTester(self, config_class=ChameleonConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=ChameleonConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() @@ -284,7 +284,7 @@ class ChameleonVision2SeqModelTest(ModelTesterMixin, GenerationTesterMixin, Pipe def setUp(self): self.model_tester = ChameleonVision2SeqModelTester(self) - self.config_tester = ConfigTester(self, config_class=ChameleonConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=ChameleonConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/chinese_clip/test_modeling_chinese_clip.py b/tests/models/chinese_clip/test_modeling_chinese_clip.py index 739352bf2832..593dd6492a64 100644 --- a/tests/models/chinese_clip/test_modeling_chinese_clip.py +++ b/tests/models/chinese_clip/test_modeling_chinese_clip.py @@ -329,7 +329,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): def setUp(self): self.model_tester = ChineseCLIPTextModelTester(self) - self.config_tester = ConfigTester(self, config_class=ChineseCLIPTextConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=ChineseCLIPTextConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() @@ -406,7 +406,7 @@ class ChineseCLIPVisionModelTest(ModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = ChineseCLIPVisionModelTester(self) self.config_tester = ConfigTester( - self, config_class=ChineseCLIPVisionConfig, has_text_modality=False, hidden_size=37 + self, config_class=ChineseCLIPVisionConfig, has_text_modality=False, hidden_size=32 ) def test_config(self): diff --git a/tests/models/clap/test_modeling_clap.py b/tests/models/clap/test_modeling_clap.py index fa6a197d7c31..fc61124b63bf 100644 --- a/tests/models/clap/test_modeling_clap.py +++ b/tests/models/clap/test_modeling_clap.py @@ -163,7 +163,7 @@ class ClapAudioModelTest(ModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = ClapAudioModelTester(self) - self.config_tester = ConfigTester(self, config_class=ClapAudioConfig, has_text_modality=False, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=ClapAudioConfig, has_text_modality=False, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() @@ -363,7 +363,7 @@ class ClapTextModelTest(ModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = ClapTextModelTester(self) - self.config_tester = ConfigTester(self, config_class=ClapTextConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=ClapTextConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/clip/test_modeling_clip.py b/tests/models/clip/test_modeling_clip.py index 9ae8cb3842d7..4dbc12f1a0f6 100644 --- a/tests/models/clip/test_modeling_clip.py +++ b/tests/models/clip/test_modeling_clip.py @@ -213,7 +213,7 @@ class CLIPVisionModelTest(CLIPModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = CLIPVisionModelTester(self) - self.config_tester = ConfigTester(self, config_class=CLIPVisionConfig, has_text_modality=False, hidden_size=37) + self.config_tester = ConfigTester(self, 
config_class=CLIPVisionConfig, has_text_modality=False, hidden_size=48) def test_config(self): self.config_tester.run_common_tests() @@ -396,7 +396,7 @@ class CLIPTextModelTest(CLIPModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = CLIPTextModelTester(self) - self.config_tester = ConfigTester(self, config_class=CLIPTextConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=CLIPTextConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/clipseg/test_modeling_clipseg.py b/tests/models/clipseg/test_modeling_clipseg.py index 7fdeb0d817bf..435e5b0e291f 100644 --- a/tests/models/clipseg/test_modeling_clipseg.py +++ b/tests/models/clipseg/test_modeling_clipseg.py @@ -143,7 +143,7 @@ class CLIPSegVisionModelTest(ModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = CLIPSegVisionModelTester(self) self.config_tester = ConfigTester( - self, config_class=CLIPSegVisionConfig, has_text_modality=False, hidden_size=37 + self, config_class=CLIPSegVisionConfig, has_text_modality=False, hidden_size=32 ) def test_config(self): @@ -294,7 +294,7 @@ class CLIPSegTextModelTest(ModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = CLIPSegTextModelTester(self) - self.config_tester = ConfigTester(self, config_class=CLIPSegTextConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=CLIPSegTextConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/cohere/test_modeling_cohere.py b/tests/models/cohere/test_modeling_cohere.py index e76d0aa8c6a1..2504108f353a 100644 --- a/tests/models/cohere/test_modeling_cohere.py +++ b/tests/models/cohere/test_modeling_cohere.py @@ -177,7 +177,7 @@ class CohereModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix def setUp(self): self.model_tester = CohereModelTester(self) - self.config_tester = ConfigTester(self, config_class=CohereConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=CohereConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/cohere2/test_modeling_cohere2.py b/tests/models/cohere2/test_modeling_cohere2.py index d5a8460637a4..84b5cad6ae2c 100644 --- a/tests/models/cohere2/test_modeling_cohere2.py +++ b/tests/models/cohere2/test_modeling_cohere2.py @@ -78,7 +78,7 @@ class Cohere2ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi def setUp(self): self.model_tester = Cohere2ModelTester(self) - self.config_tester = ConfigTester(self, config_class=Cohere2Config, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=Cohere2Config, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/convbert/test_modeling_convbert.py b/tests/models/convbert/test_modeling_convbert.py index 5eb433c132a1..4e6a77e626b0 100644 --- a/tests/models/convbert/test_modeling_convbert.py +++ b/tests/models/convbert/test_modeling_convbert.py @@ -270,7 +270,7 @@ class ConvBertModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase def setUp(self): self.model_tester = ConvBertModelTester(self) - self.config_tester = ConfigTester(self, config_class=ConvBertConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=ConvBertConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/convnext/test_modeling_convnext.py 
b/tests/models/convnext/test_modeling_convnext.py index 5ffc7e10ad7c..db6d3c09b7fd 100644 --- a/tests/models/convnext/test_modeling_convnext.py +++ b/tests/models/convnext/test_modeling_convnext.py @@ -184,7 +184,7 @@ def setUp(self): self, config_class=ConvNextConfig, has_text_modality=False, - hidden_size=37, + hidden_size=32, common_properties=["num_channels", "hidden_sizes"], ) diff --git a/tests/models/convnextv2/test_modeling_convnextv2.py b/tests/models/convnextv2/test_modeling_convnextv2.py index cdd046de1f59..f9126e5d838d 100644 --- a/tests/models/convnextv2/test_modeling_convnextv2.py +++ b/tests/models/convnextv2/test_modeling_convnextv2.py @@ -164,7 +164,7 @@ def setUp(self): self, config_class=ConvNextV2Config, has_text_modality=False, - hidden_size=37, + hidden_size=32, common_properties=["hidden_sizes", "num_channels"], ) diff --git a/tests/models/cvt/test_modeling_cvt.py b/tests/models/cvt/test_modeling_cvt.py index 898b536aeee3..2117c2fa21fb 100644 --- a/tests/models/cvt/test_modeling_cvt.py +++ b/tests/models/cvt/test_modeling_cvt.py @@ -163,7 +163,7 @@ def setUp(self): self, config_class=CvtConfig, has_text_modality=False, - hidden_size=37, + hidden_size=32, common_properties=["hidden_size", "num_channels"], ) diff --git a/tests/models/cwm/test_configuration_cwm.py b/tests/models/cwm/test_configuration_cwm.py index 344a653dd8ab..56476d122900 100644 --- a/tests/models/cwm/test_configuration_cwm.py +++ b/tests/models/cwm/test_configuration_cwm.py @@ -14,6 +14,8 @@ import unittest +import huggingface_hub + from transformers.models.cwm import CwmConfig from transformers.testing_utils import require_torch @@ -47,14 +49,14 @@ def test_custom_layer_types_config(self): self.assertEqual(len(config.layer_types), config.num_hidden_layers) def test_invalid_layer_types_length(self): - with self.assertRaises(ValueError): + with self.assertRaises(huggingface_hub.errors.StrictDataclassClassValidationError): CwmConfig( num_hidden_layers=4, layer_types=["full_attention", "sliding_attention"], # Only 2 types for 4 layers ) def test_invalid_layer_type_value(self): - with self.assertRaises(ValueError): + with self.assertRaises(huggingface_hub.errors.StrictDataclassClassValidationError): CwmConfig(num_hidden_layers=2, layer_types=["full_attention", "invalid_attention"]) def test_automatic_layer_types_generation(self): diff --git a/tests/models/dac/test_modeling_dac.py b/tests/models/dac/test_modeling_dac.py index 0a765826ae28..8b4de999a7d4 100644 --- a/tests/models/dac/test_modeling_dac.py +++ b/tests/models/dac/test_modeling_dac.py @@ -138,7 +138,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): def setUp(self): self.model_tester = DacModelTester(self) self.config_tester = ConfigTester( - self, config_class=DacConfig, hidden_size=37, common_properties=[], has_text_modality=False + self, config_class=DacConfig, hidden_size=32, common_properties=[], has_text_modality=False ) def test_config(self): diff --git a/tests/models/data2vec/test_modeling_data2vec_audio.py b/tests/models/data2vec/test_modeling_data2vec_audio.py index 57aa199415bf..60a03fca2952 100644 --- a/tests/models/data2vec/test_modeling_data2vec_audio.py +++ b/tests/models/data2vec/test_modeling_data2vec_audio.py @@ -356,7 +356,7 @@ class Data2VecAudioModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.Tes def setUp(self): self.model_tester = Data2VecAudioModelTester(self) - self.config_tester = ConfigTester(self, config_class=Data2VecAudioConfig, hidden_size=37) + self.config_tester = 
ConfigTester(self, config_class=Data2VecAudioConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/data2vec/test_modeling_data2vec_text.py b/tests/models/data2vec/test_modeling_data2vec_text.py index d06d696f4344..70c4995e209b 100644 --- a/tests/models/data2vec/test_modeling_data2vec_text.py +++ b/tests/models/data2vec/test_modeling_data2vec_text.py @@ -391,7 +391,7 @@ def prepare_config_and_inputs_for_generate(self, batch_size=2): def setUp(self): self.model_tester = Data2VecTextModelTester(self) - self.config_tester = ConfigTester(self, config_class=Data2VecTextConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=Data2VecTextConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/data2vec/test_modeling_data2vec_vision.py b/tests/models/data2vec/test_modeling_data2vec_vision.py index c4825356e74d..a1e4cd8cb0d7 100644 --- a/tests/models/data2vec/test_modeling_data2vec_vision.py +++ b/tests/models/data2vec/test_modeling_data2vec_vision.py @@ -205,7 +205,7 @@ class Data2VecVisionModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.Te def setUp(self): self.model_tester = Data2VecVisionModelTester(self) self.config_tester = ConfigTester( - self, config_class=Data2VecVisionConfig, has_text_modality=False, hidden_size=37 + self, config_class=Data2VecVisionConfig, has_text_modality=False, hidden_size=32 ) def test_config(self): diff --git a/tests/models/dbrx/test_modeling_dbrx.py b/tests/models/dbrx/test_modeling_dbrx.py index de41fc067aa3..489092e13de0 100644 --- a/tests/models/dbrx/test_modeling_dbrx.py +++ b/tests/models/dbrx/test_modeling_dbrx.py @@ -37,7 +37,7 @@ def __init__( clip_qkv=8, rope_theta=500000, attn_config_model_type="", - moe_jitter_eps=0, + moe_jitter_eps=0.0, moe_loss_weight=0.05, moe_num_experts=8, moe_top_k=4, diff --git a/tests/models/deberta/test_modeling_deberta.py b/tests/models/deberta/test_modeling_deberta.py index adc9659afcd4..35dd648c1707 100644 --- a/tests/models/deberta/test_modeling_deberta.py +++ b/tests/models/deberta/test_modeling_deberta.py @@ -240,7 +240,7 @@ class DebertaModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase) def setUp(self): self.model_tester = DebertaModelTester(self) - self.config_tester = ConfigTester(self, config_class=DebertaConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=DebertaConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/deberta_v2/test_modeling_deberta_v2.py b/tests/models/deberta_v2/test_modeling_deberta_v2.py index 42d5645e4091..4c0bb4fb4fde 100644 --- a/tests/models/deberta_v2/test_modeling_deberta_v2.py +++ b/tests/models/deberta_v2/test_modeling_deberta_v2.py @@ -254,7 +254,7 @@ class DebertaV2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCas def setUp(self): self.model_tester = DebertaV2ModelTester(self) - self.config_tester = ConfigTester(self, config_class=DebertaV2Config, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=DebertaV2Config, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/decision_transformer/test_modeling_decision_transformer.py b/tests/models/decision_transformer/test_modeling_decision_transformer.py index 7b1d8690c0e3..d26b540e707e 100644 --- a/tests/models/decision_transformer/test_modeling_decision_transformer.py +++ 
b/tests/models/decision_transformer/test_modeling_decision_transformer.py @@ -140,7 +140,7 @@ class DecisionTransformerModelTest(ModelTesterMixin, PipelineTesterMixin, unitte def setUp(self): self.model_tester = DecisionTransformerModelTester(self) - self.config_tester = ConfigTester(self, config_class=DecisionTransformerConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=DecisionTransformerConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/deepseek_v3/test_modeling_deepseek_v3.py b/tests/models/deepseek_v3/test_modeling_deepseek_v3.py index 5cbaef4b57ae..63b02953b309 100644 --- a/tests/models/deepseek_v3/test_modeling_deepseek_v3.py +++ b/tests/models/deepseek_v3/test_modeling_deepseek_v3.py @@ -249,7 +249,7 @@ class DeepseekV3ModelTest( def setUp(self): self.model_tester = DeepseekV3ModelTester(self) - self.config_tester = ConfigTester(self, config_class=DeepseekV3Config, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=DeepseekV3Config, hidden_size=32) def _check_past_key_values_for_generate(self, batch_size, past_key_values, seq_length, config): """Needs to be overridden as deepseek has special MLA cache format (though we don't really use the MLA)""" diff --git a/tests/models/deit/test_modeling_deit.py b/tests/models/deit/test_modeling_deit.py index ab7a947467a2..6e119963df1f 100644 --- a/tests/models/deit/test_modeling_deit.py +++ b/tests/models/deit/test_modeling_deit.py @@ -223,7 +223,7 @@ class DeiTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = DeiTModelTester(self) - self.config_tester = ConfigTester(self, config_class=DeiTConfig, has_text_modality=False, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=DeiTConfig, has_text_modality=False, hidden_size=32) @unittest.skip( "Since `torch==2.3+cu121`, although this test passes, many subsequent tests have `CUDA error: misaligned address`." 
diff --git a/tests/models/depth_anything/test_modeling_depth_anything.py b/tests/models/depth_anything/test_modeling_depth_anything.py index 500245c8fc07..1673e49b2a90 100644 --- a/tests/models/depth_anything/test_modeling_depth_anything.py +++ b/tests/models/depth_anything/test_modeling_depth_anything.py @@ -149,7 +149,7 @@ def setUp(self): self, config_class=DepthAnythingConfig, has_text_modality=False, - hidden_size=37, + hidden_size=32, common_properties=["patch_size"], ) diff --git a/tests/models/depth_pro/test_modeling_depth_pro.py b/tests/models/depth_pro/test_modeling_depth_pro.py index 967707dbbed5..97157264b4ba 100644 --- a/tests/models/depth_pro/test_modeling_depth_pro.py +++ b/tests/models/depth_pro/test_modeling_depth_pro.py @@ -214,7 +214,7 @@ class DepthProModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase def setUp(self): self.model_tester = DepthProModelTester(self) - self.config_tester = ConfigTester(self, config_class=DepthProConfig, has_text_modality=False, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=DepthProConfig, has_text_modality=False, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/diffllama/test_modeling_diffllama.py b/tests/models/diffllama/test_modeling_diffllama.py index 38af14223dde..759e288a43e5 100644 --- a/tests/models/diffllama/test_modeling_diffllama.py +++ b/tests/models/diffllama/test_modeling_diffllama.py @@ -199,7 +199,7 @@ class DiffLlamaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTester def setUp(self): self.model_tester = DiffLlamaModelTester(self) - self.config_tester = ConfigTester(self, config_class=DiffLlamaConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=DiffLlamaConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/dinov2/test_modeling_dinov2.py b/tests/models/dinov2/test_modeling_dinov2.py index 8fd4203af535..46a78d6049cb 100644 --- a/tests/models/dinov2/test_modeling_dinov2.py +++ b/tests/models/dinov2/test_modeling_dinov2.py @@ -232,7 +232,7 @@ class Dinov2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = Dinov2ModelTester(self) - self.config_tester = ConfigTester(self, config_class=Dinov2Config, has_text_modality=False, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=Dinov2Config, has_text_modality=False, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/dinov2_with_registers/test_modeling_dinov2_with_registers.py b/tests/models/dinov2_with_registers/test_modeling_dinov2_with_registers.py index f589493b55c1..e055c06395c2 100644 --- a/tests/models/dinov2_with_registers/test_modeling_dinov2_with_registers.py +++ b/tests/models/dinov2_with_registers/test_modeling_dinov2_with_registers.py @@ -240,7 +240,7 @@ class Dinov2WithRegistersModelTest(ModelTesterMixin, PipelineTesterMixin, unitte def setUp(self): self.model_tester = Dinov2WithRegistersModelTester(self) self.config_tester = ConfigTester( - self, config_class=Dinov2WithRegistersConfig, has_text_modality=False, hidden_size=37 + self, config_class=Dinov2WithRegistersConfig, has_text_modality=False, hidden_size=32 ) def test_config(self): diff --git a/tests/models/dinov3_convnext/test_modeling_dinov3_convnext.py b/tests/models/dinov3_convnext/test_modeling_dinov3_convnext.py index c5947ea88b19..7b9566d75141 100644 --- 
a/tests/models/dinov3_convnext/test_modeling_dinov3_convnext.py +++ b/tests/models/dinov3_convnext/test_modeling_dinov3_convnext.py @@ -174,7 +174,7 @@ def setUp(self): self, config_class=DINOv3ConvNextConfig, has_text_modality=False, - hidden_size=37, + hidden_size=32, common_properties=["num_channels", "hidden_sizes"], ) diff --git a/tests/models/dinov3_vit/test_modeling_dinov3_vit.py b/tests/models/dinov3_vit/test_modeling_dinov3_vit.py index 0a696581b7fe..672fea3c61b1 100644 --- a/tests/models/dinov3_vit/test_modeling_dinov3_vit.py +++ b/tests/models/dinov3_vit/test_modeling_dinov3_vit.py @@ -112,7 +112,7 @@ def get_config(self): is_decoder=False, initializer_range=self.initializer_range, num_register_tokens=self.num_register_tokens, - stage_names=["embeddings"] + [f"stage{i}" for i in range(1, self.num_hidden_layers + 1)], + stage_names=["stem"] + [f"stage{i}" for i in range(1, self.num_hidden_layers + 1)], out_indices=[0, 1], reshape_hidden_states=True, ) @@ -202,7 +202,7 @@ class Dinov3ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = DINOv3ViTModelTester(self) - self.config_tester = ConfigTester(self, config_class=DINOv3ViTConfig, has_text_modality=False, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=DINOv3ViTConfig, has_text_modality=False, hidden_size=32) def test_backbone(self): config, pixel_values, labels = self.model_tester.prepare_config_and_inputs() diff --git a/tests/models/dpr/test_modeling_dpr.py b/tests/models/dpr/test_modeling_dpr.py index 22db4fd48f2d..d1e5c57fc22d 100644 --- a/tests/models/dpr/test_modeling_dpr.py +++ b/tests/models/dpr/test_modeling_dpr.py @@ -190,7 +190,7 @@ class DPRModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = DPRModelTester(self) - self.config_tester = ConfigTester(self, config_class=DPRConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=DPRConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/dpt/test_modeling_dpt.py b/tests/models/dpt/test_modeling_dpt.py index 3e3114be80ad..a0ba2063ea6c 100644 --- a/tests/models/dpt/test_modeling_dpt.py +++ b/tests/models/dpt/test_modeling_dpt.py @@ -173,7 +173,7 @@ class DPTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = DPTModelTester(self) - self.config_tester = ConfigTester(self, config_class=DPTConfig, has_text_modality=False, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=DPTConfig, has_text_modality=False, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/dpt/test_modeling_dpt_auto_backbone.py b/tests/models/dpt/test_modeling_dpt_auto_backbone.py index bee78e73370a..a4009caa6d2f 100644 --- a/tests/models/dpt/test_modeling_dpt_auto_backbone.py +++ b/tests/models/dpt/test_modeling_dpt_auto_backbone.py @@ -140,7 +140,7 @@ class DPTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = DPTModelTester(self) - self.config_tester = ConfigTester(self, config_class=DPTConfig, has_text_modality=False, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=DPTConfig, has_text_modality=False, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/dpt/test_modeling_dpt_hybrid.py b/tests/models/dpt/test_modeling_dpt_hybrid.py index 
a17c9a6793e3..cfe03c5e2c57 100644 --- a/tests/models/dpt/test_modeling_dpt_hybrid.py +++ b/tests/models/dpt/test_modeling_dpt_hybrid.py @@ -186,7 +186,7 @@ class DPTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = DPTModelTester(self) - self.config_tester = ConfigTester(self, config_class=DPTConfig, has_text_modality=False, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=DPTConfig, has_text_modality=False, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/efficientnet/test_modeling_efficientnet.py b/tests/models/efficientnet/test_modeling_efficientnet.py index 27acb6461f2b..260874bde4a8 100644 --- a/tests/models/efficientnet/test_modeling_efficientnet.py +++ b/tests/models/efficientnet/test_modeling_efficientnet.py @@ -143,7 +143,7 @@ def setUp(self): self, config_class=EfficientNetConfig, has_text_modality=False, - hidden_size=37, + hidden_size=32, common_properties=["num_channels", "image_size", "hidden_dim"], ) diff --git a/tests/models/electra/test_modeling_electra.py b/tests/models/electra/test_modeling_electra.py index 72f0a5e6bba6..6c8cf3a22036 100644 --- a/tests/models/electra/test_modeling_electra.py +++ b/tests/models/electra/test_modeling_electra.py @@ -421,7 +421,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): def setUp(self): self.model_tester = ElectraModelTester(self) - self.config_tester = ConfigTester(self, config_class=ElectraConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=ElectraConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/emu3/test_modeling_emu3.py b/tests/models/emu3/test_modeling_emu3.py index 39902bb7b350..6b356bb659c9 100644 --- a/tests/models/emu3/test_modeling_emu3.py +++ b/tests/models/emu3/test_modeling_emu3.py @@ -133,7 +133,7 @@ class Emu3Text2TextModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTe def setUp(self): self.model_tester = Emu3Text2TextModelTester(self) - self.config_tester = ConfigTester(self, config_class=Emu3TextConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=Emu3TextConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() @@ -297,7 +297,7 @@ class Emu3Vision2TextModelTest(ModelTesterMixin, GenerationTesterMixin, Pipeline def setUp(self): self.model_tester = Emu3Vision2TextModelTester(self) - self.config_tester = ConfigTester(self, config_class=Emu3Config, has_text_modality=False, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=Emu3Config, has_text_modality=False, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/encodec/test_modeling_encodec.py b/tests/models/encodec/test_modeling_encodec.py index 95030cbcb373..644920744314 100644 --- a/tests/models/encodec/test_modeling_encodec.py +++ b/tests/models/encodec/test_modeling_encodec.py @@ -161,7 +161,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): def setUp(self): self.model_tester = EncodecModelTester(self) self.config_tester = ConfigTester( - self, config_class=EncodecConfig, hidden_size=37, common_properties=[], has_text_modality=False + self, config_class=EncodecConfig, hidden_size=32, common_properties=[], has_text_modality=False ) def test_config(self): @@ -223,7 +223,7 @@ def test_feed_forward_chunking(self): torch.manual_seed(0) config.chunk_length_s = 2 
- config.overlap = 0 + config.overlap = 0.0 config.sampling_rate = 20 model = model_class(config) diff --git a/tests/models/ernie/test_modeling_ernie.py b/tests/models/ernie/test_modeling_ernie.py index 7e846019bb1c..f034d3c6546b 100644 --- a/tests/models/ernie/test_modeling_ernie.py +++ b/tests/models/ernie/test_modeling_ernie.py @@ -474,7 +474,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): def setUp(self): self.model_tester = ErnieModelTester(self) - self.config_tester = ConfigTester(self, config_class=ErnieConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=ErnieConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/esm/test_modeling_esm.py b/tests/models/esm/test_modeling_esm.py index be0eef266de4..5654659e1921 100644 --- a/tests/models/esm/test_modeling_esm.py +++ b/tests/models/esm/test_modeling_esm.py @@ -225,7 +225,7 @@ class EsmModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = EsmModelTester(self) - self.config_tester = ConfigTester(self, config_class=EsmConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=EsmConfig, hidden_size=48) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/esm/test_modeling_esmfold.py b/tests/models/esm/test_modeling_esmfold.py index d19f47714ef3..601bd6b581b1 100644 --- a/tests/models/esm/test_modeling_esmfold.py +++ b/tests/models/esm/test_modeling_esmfold.py @@ -174,7 +174,7 @@ class EsmFoldModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase) def setUp(self): self.model_tester = EsmFoldModelTester(self) - self.config_tester = ConfigTester(self, config_class=EsmConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=EsmConfig, hidden_size=48) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/eurobert/test_modeling_eurobert.py b/tests/models/eurobert/test_modeling_eurobert.py index 9b66107ef866..fc74ef4d7e3d 100644 --- a/tests/models/eurobert/test_modeling_eurobert.py +++ b/tests/models/eurobert/test_modeling_eurobert.py @@ -225,7 +225,7 @@ class EuroBertModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase def setUp(self): self.model_tester = EuroBertModelTester(self) - self.config_tester = ConfigTester(self, config_class=EuroBertConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=EuroBertConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/evolla/test_modeling_evolla.py b/tests/models/evolla/test_modeling_evolla.py index d634a01d34f0..8624466ef10d 100644 --- a/tests/models/evolla/test_modeling_evolla.py +++ b/tests/models/evolla/test_modeling_evolla.py @@ -205,7 +205,7 @@ class EvollaModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = EvollaModelTester(self) - self.config_tester = ConfigTester(self, config_class=EvollaConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=EvollaConfig, hidden_size=32) @property def is_encoder_decoder(self): diff --git a/tests/models/flava/test_modeling_flava.py b/tests/models/flava/test_modeling_flava.py index bd8526797a14..245a818300b9 100644 --- a/tests/models/flava/test_modeling_flava.py +++ b/tests/models/flava/test_modeling_flava.py @@ -166,7 +166,7 @@ class FlavaImageModelTest(ModelTesterMixin, unittest.TestCase): def setUp(self): 
self.model_tester = FlavaImageModelTester(self) - self.config_tester = ConfigTester(self, config_class=FlavaImageConfig, has_text_modality=False, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=FlavaImageConfig, has_text_modality=False, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() @@ -428,7 +428,7 @@ class FlavaTextModelTest(ModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = FlavaTextModelTester(self) - self.config_tester = ConfigTester(self, config_class=FlavaTextConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=FlavaTextConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() @@ -562,7 +562,7 @@ class FlavaMultimodalModelTest(ModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = FlavaMultimodalModelTester(self) self.config_tester = ConfigTester( - self, config_class=FlavaMultimodalConfig, has_text_modality=False, hidden_size=37 + self, config_class=FlavaMultimodalConfig, has_text_modality=False, hidden_size=32 ) def test_config(self): @@ -767,7 +767,7 @@ def __init__( self.multimodal_model_tester = FlavaMultimodalModelTester(parent, **multimodal_kwargs) self.image_codebook_tester = FlavaImageCodebookTester(parent, **image_codebook_kwargs) self.is_training = is_training - self.config_tester = ConfigTester(self, config_class=FlavaConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=FlavaConfig, hidden_size=32) self.hidden_size = hidden_size self.projection_dim = projection_dim self.initializer_range = initializer_range diff --git a/tests/models/fnet/test_modeling_fnet.py b/tests/models/fnet/test_modeling_fnet.py index 8d1a04445ae9..4051785e77d6 100644 --- a/tests/models/fnet/test_modeling_fnet.py +++ b/tests/models/fnet/test_modeling_fnet.py @@ -388,7 +388,7 @@ def test_retain_grad_hidden_states_attentions(self): def setUp(self): self.model_tester = FNetModelTester(self) - self.config_tester = FNetConfigTester(self, config_class=FNetConfig, hidden_size=37) + self.config_tester = FNetConfigTester(self, config_class=FNetConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/gemma3n/test_modeling_gemma3n.py b/tests/models/gemma3n/test_modeling_gemma3n.py index 83c974957c48..1cd7fc8d4697 100644 --- a/tests/models/gemma3n/test_modeling_gemma3n.py +++ b/tests/models/gemma3n/test_modeling_gemma3n.py @@ -149,7 +149,7 @@ class Gemma3nAudioModelTest(ModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = Gemma3nAudioModelTester(self) - self.config_tester = ConfigTester(self, config_class=Gemma3nAudioConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=Gemma3nAudioConfig, hidden_size=32) torch.manual_seed(0) # The following values are golden outputs from a deterministic run of the components. 
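The exception-type updates earlier in this patch (the CwmConfig tests, and the Jamba and GraniteMoeHybrid tests further down) replace the expected ValueError with huggingface_hub.errors.StrictDataclassClassValidationError. A minimal sketch of what those tests now exercise, assuming CwmConfig rejects a layer_types list whose length does not match num_hidden_layers while the config is being constructed:

import huggingface_hub

from transformers.models.cwm import CwmConfig

try:
    # Two layer types for four hidden layers: the mismatch is rejected while
    # the config dataclass is being built.
    CwmConfig(
        num_hidden_layers=4,
        layer_types=["full_attention", "sliding_attention"],
    )
except huggingface_hub.errors.StrictDataclassClassValidationError:
    # The same mismatch previously surfaced as a plain ValueError; strict
    # dataclass validation now reports it with this error type instead.
    pass

The widespread 0 -> 0.0 changes for dropout-style fields elsewhere in this patch appear to follow from the same stricter validation, presumably because those fields are now typed as floats.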
@@ -799,7 +799,7 @@ def setUp(self): self.config_tester = ConfigTester( self, config_class=Gemma3nConfig, - hidden_size=37, + hidden_size=32, text_config={"activation_sparsity_pattern": None}, ) diff --git a/tests/models/git/test_modeling_git.py b/tests/models/git/test_modeling_git.py index 030b85e51765..063e423b8cd9 100644 --- a/tests/models/git/test_modeling_git.py +++ b/tests/models/git/test_modeling_git.py @@ -131,7 +131,7 @@ class GitVisionModelTest(ModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = GitVisionModelTester(self) - self.config_tester = ConfigTester(self, config_class=GitVisionConfig, has_text_modality=False, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=GitVisionConfig, has_text_modality=False, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() @@ -393,7 +393,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): def setUp(self): self.model_tester = GitModelTester(self) - self.config_tester = ConfigTester(self, config_class=GitConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=GitConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py b/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py index ea39a07cc135..0c237c6ae308 100644 --- a/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py +++ b/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py @@ -543,8 +543,8 @@ def get_attention(self, multi_query): config = GPTBigCodeConfig.from_pretrained( "bigcode/gpt_bigcode-santacoder", multi_query=multi_query, - attn_pdrop=0, - resid_pdrop=0, + attn_pdrop=0.0, + resid_pdrop=0.0, ) # We need to set it here as it's normally set by the Model's __init__ config._attn_implementation = "sdpa" diff --git a/tests/models/gpt_neox_japanese/test_modeling_gpt_neox_japanese.py b/tests/models/gpt_neox_japanese/test_modeling_gpt_neox_japanese.py index 6de7193ec08a..82091d84b9c7 100644 --- a/tests/models/gpt_neox_japanese/test_modeling_gpt_neox_japanese.py +++ b/tests/models/gpt_neox_japanese/test_modeling_gpt_neox_japanese.py @@ -207,7 +207,7 @@ class GPTNeoXModelJapaneseTest(ModelTesterMixin, GenerationTesterMixin, Pipeline def setUp(self): self.model_tester = GPTNeoXJapaneseModelTester(self) - self.config_tester = ConfigTester(self, config_class=GPTNeoXJapaneseConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=GPTNeoXJapaneseConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/granite/test_modeling_granite.py b/tests/models/granite/test_modeling_granite.py index b1f12981d4db..c3923151e314 100644 --- a/tests/models/granite/test_modeling_granite.py +++ b/tests/models/granite/test_modeling_granite.py @@ -180,7 +180,7 @@ class GraniteModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi def setUp(self): self.model_tester = GraniteModelTester(self) - self.config_tester = ConfigTester(self, config_class=GraniteConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=GraniteConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/granitemoe/test_modeling_granitemoe.py b/tests/models/granitemoe/test_modeling_granitemoe.py index 6e0cd53ed6a5..35cdb6012bdb 100644 --- a/tests/models/granitemoe/test_modeling_granitemoe.py +++ b/tests/models/granitemoe/test_modeling_granitemoe.py @@ -179,7 +179,7 @@ class 
GraniteMoeModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.Test def setUp(self): self.model_tester = GraniteMoeModelTester(self) - self.config_tester = ConfigTester(self, config_class=GraniteMoeConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=GraniteMoeConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/granitemoehybrid/test_modeling_granitemoehybrid.py b/tests/models/granitemoehybrid/test_modeling_granitemoehybrid.py index 32246fe0212d..8cb946d0aa2e 100644 --- a/tests/models/granitemoehybrid/test_modeling_granitemoehybrid.py +++ b/tests/models/granitemoehybrid/test_modeling_granitemoehybrid.py @@ -18,6 +18,7 @@ import unittest import pytest +from huggingface_hub.errors import StrictDataclassClassValidationError from parameterized import parameterized from pytest import mark @@ -350,7 +351,7 @@ def _check_past_key_values_for_generate(self, batch_size, past_key_values, seq_l def test_config_requires_mamba_or_attention_layers(self): """Ensure we can't create a config with disallowed layers.""" - with pytest.raises(ValueError): + with pytest.raises(StrictDataclassClassValidationError): GraniteMoeHybridConfig(layer_types=["not allowed!"]) diff --git a/tests/models/granitemoeshared/test_modeling_granitemoeshared.py b/tests/models/granitemoeshared/test_modeling_granitemoeshared.py index c86100c4c112..8feed0e7db9f 100644 --- a/tests/models/granitemoeshared/test_modeling_granitemoeshared.py +++ b/tests/models/granitemoeshared/test_modeling_granitemoeshared.py @@ -182,7 +182,7 @@ class GraniteMoeSharedModelTest(ModelTesterMixin, GenerationTesterMixin, unittes def setUp(self): self.model_tester = GraniteMoeSharedModelTester(self) - self.config_tester = ConfigTester(self, config_class=GraniteMoeSharedConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=GraniteMoeSharedConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/groupvit/test_modeling_groupvit.py b/tests/models/groupvit/test_modeling_groupvit.py index 791c43a3de72..45f0fb5dcd7c 100644 --- a/tests/models/groupvit/test_modeling_groupvit.py +++ b/tests/models/groupvit/test_modeling_groupvit.py @@ -146,7 +146,7 @@ class GroupViTVisionModelTest(ModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = GroupViTVisionModelTester(self) self.config_tester = ConfigTester( - self, config_class=GroupViTVisionConfig, has_text_modality=False, hidden_size=37 + self, config_class=GroupViTVisionConfig, has_text_modality=False, hidden_size=32 ) def test_config(self): @@ -424,7 +424,7 @@ class GroupViTTextModelTest(ModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = GroupViTTextModelTester(self) - self.config_tester = ConfigTester(self, config_class=GroupViTTextConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=GroupViTTextConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/hubert/test_modeling_hubert.py b/tests/models/hubert/test_modeling_hubert.py index 7eccaea93daa..3948f2dfe508 100644 --- a/tests/models/hubert/test_modeling_hubert.py +++ b/tests/models/hubert/test_modeling_hubert.py @@ -309,7 +309,7 @@ class HubertModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = HubertModelTester(self) - self.config_tester = ConfigTester(self, config_class=HubertConfig, hidden_size=37) + self.config_tester = 
ConfigTester(self, config_class=HubertConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() @@ -430,7 +430,7 @@ def setUp(self): self.model_tester = HubertModelTester( self, conv_stride=(3, 3, 3), feat_extract_norm="layer", do_stable_layer_norm=True ) - self.config_tester = ConfigTester(self, config_class=HubertConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=HubertConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/ibert/test_modeling_ibert.py b/tests/models/ibert/test_modeling_ibert.py index 720b8d07f659..cd3662194e73 100644 --- a/tests/models/ibert/test_modeling_ibert.py +++ b/tests/models/ibert/test_modeling_ibert.py @@ -253,7 +253,7 @@ class IBertModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = IBertModelTester(self) - self.config_tester = ConfigTester(self, config_class=IBertConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=IBertConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/idefics/test_modeling_idefics.py b/tests/models/idefics/test_modeling_idefics.py index 0fdb84df46da..c0f185168e99 100644 --- a/tests/models/idefics/test_modeling_idefics.py +++ b/tests/models/idefics/test_modeling_idefics.py @@ -360,7 +360,7 @@ def test_model_outputs_equivalence(self): def setUp(self): self.model_tester = IdeficsModelTester(self) - self.config_tester = ConfigTester(self, config_class=IdeficsConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=IdeficsConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() @@ -687,7 +687,7 @@ def setUp(self): self, modality_type_vocab_size=3, ) - self.config_tester = ConfigTester(self, config_class=IdeficsConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=IdeficsConfig, hidden_size=32) @parameterized.expand(TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION) @unittest.skip("Idefics requires both text and image inputs which is currently not done in this test.") diff --git a/tests/models/ijepa/test_modeling_ijepa.py b/tests/models/ijepa/test_modeling_ijepa.py index ad690ef9a721..51a8808c7cfa 100644 --- a/tests/models/ijepa/test_modeling_ijepa.py +++ b/tests/models/ijepa/test_modeling_ijepa.py @@ -210,7 +210,7 @@ def setUp(self): self, config_class=IJepaConfig, has_text_modality=False, - hidden_size=37, + hidden_size=32, ) @unittest.skip( diff --git a/tests/models/jamba/test_modeling_jamba.py b/tests/models/jamba/test_modeling_jamba.py index 8626f70a314e..f54f6e571509 100644 --- a/tests/models/jamba/test_modeling_jamba.py +++ b/tests/models/jamba/test_modeling_jamba.py @@ -17,6 +17,7 @@ import tempfile import unittest +import huggingface_hub import pytest from transformers import AutoTokenizer, BitsAndBytesConfig, JambaConfig, is_torch_available @@ -70,9 +71,9 @@ def test_attn_offsets(self): self._create_attn_config(attn_layer_offset=1, attn_layer_period=4) self._create_attn_config(attn_layer_offset=2, attn_layer_period=4) self._create_attn_config(attn_layer_offset=3, attn_layer_period=4) - with self.parent.assertRaises(ValueError): + with self.parent.assertRaises(huggingface_hub.errors.StrictDataclassClassValidationError): self._create_attn_config(attn_layer_offset=4, attn_layer_period=4) - with self.parent.assertRaises(ValueError): + with 
self.parent.assertRaises(huggingface_hub.errors.StrictDataclassClassValidationError): self._create_attn_config(attn_layer_offset=5, attn_layer_period=4) def test_expert_offsets(self): @@ -80,9 +81,9 @@ def test_expert_offsets(self): self._create_expert_config(expert_layer_offset=1, expert_layer_period=4) self._create_expert_config(expert_layer_offset=2, expert_layer_period=4) self._create_expert_config(expert_layer_offset=3, expert_layer_period=4) - with self.parent.assertRaises(ValueError): + with self.parent.assertRaises(huggingface_hub.errors.StrictDataclassClassValidationError): self._create_expert_config(expert_layer_offset=4, expert_layer_period=4) - with self.parent.assertRaises(ValueError): + with self.parent.assertRaises(huggingface_hub.errors.StrictDataclassClassValidationError): self._create_expert_config(expert_layer_offset=5, expert_layer_period=4) def test_jamba_offset_properties(self): @@ -381,7 +382,7 @@ def _check_caches_are_equal( def setUp(self): self.model_tester = JambaModelTester(self) - self.config_tester = JambaConfigTester(self, config_class=JambaConfig, hidden_size=37) + self.config_tester = JambaConfigTester(self, config_class=JambaConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/kosmos2/test_modeling_kosmos2.py b/tests/models/kosmos2/test_modeling_kosmos2.py index 933681afb6d4..339fce4176b5 100644 --- a/tests/models/kosmos2/test_modeling_kosmos2.py +++ b/tests/models/kosmos2/test_modeling_kosmos2.py @@ -226,8 +226,8 @@ def prepare_config_and_inputs(self): def get_config(self): return Kosmos2Config( - self.text_model_tester.get_config().to_dict(), - self.vision_model_tester.get_config().to_dict(), + text_config=self.text_model_tester.get_config().to_dict(), + vision_config=self.vision_model_tester.get_config().to_dict(), latent_query_num=self.latent_query_num, ) diff --git a/tests/models/kosmos2_5/test_modeling_kosmos2_5.py b/tests/models/kosmos2_5/test_modeling_kosmos2_5.py index 1dd0b53f9b99..530c721ea5a5 100644 --- a/tests/models/kosmos2_5/test_modeling_kosmos2_5.py +++ b/tests/models/kosmos2_5/test_modeling_kosmos2_5.py @@ -72,8 +72,8 @@ def __init__( intermediate_size=64, num_hidden_layers=2, num_attention_heads=4, - dropout=0, - attention_dropout=0, + dropout=0.0, + attention_dropout=0.0, scope=None, ): self.parent = parent @@ -136,8 +136,8 @@ def __init__( ffn_dim=64, num_hidden_layers=2, num_attention_heads=4, - dropout=0, - attention_dropout=0, + dropout=0.0, + attention_dropout=0.0, max_position_embeddings=512, scope=None, ): @@ -236,8 +236,8 @@ def prepare_config_and_inputs(self): def get_config(self): return Kosmos2_5Config( - self.text_model_tester.get_config().to_dict(), - self.vision_model_tester.get_config().to_dict(), + text_config=self.text_model_tester.get_config().to_dict(), + vision_config=self.vision_model_tester.get_config().to_dict(), latent_query_num=self.latent_query_num, ) @@ -342,7 +342,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): def setUp(self): self.model_tester = Kosmos2_5ModelTester(self) - self.config_tester = ConfigTester(self, config_class=Kosmos2_5Config, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=Kosmos2_5Config, hidden_size=32) @unittest.skip("KOSMOS-2.5 doesn't support padding") def test_eager_padding_matches_padding_free_with_position_ids(self): diff --git a/tests/models/kyutai_speech_to_text/test_modeling_kyutai_speech_to_text.py 
b/tests/models/kyutai_speech_to_text/test_modeling_kyutai_speech_to_text.py index 48ccc3d01832..e065de073772 100644 --- a/tests/models/kyutai_speech_to_text/test_modeling_kyutai_speech_to_text.py +++ b/tests/models/kyutai_speech_to_text/test_modeling_kyutai_speech_to_text.py @@ -261,7 +261,7 @@ class KyutaiSpeechToTextModelTest(ModelTesterMixin, GenerationTesterMixin, Pipel def setUp(self): self.model_tester = KyutaiSpeechToTextModelTester(self) - self.config_tester = ConfigTester(self, config_class=KyutaiSpeechToTextConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=KyutaiSpeechToTextConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/lasr/test_modeling_lasr.py b/tests/models/lasr/test_modeling_lasr.py index 7aa065380307..36060eecac3b 100644 --- a/tests/models/lasr/test_modeling_lasr.py +++ b/tests/models/lasr/test_modeling_lasr.py @@ -215,7 +215,7 @@ def prepare_config_and_inputs(self): return config, input_features, attention_mask def get_config(self): - return LasrCTCConfig.from_encoder_config( + return LasrCTCConfig( encoder_config=self.encoder_model_tester.get_config(), vocab_size=self.vocab_size, pad_token_id=self.pad_token_id, diff --git a/tests/models/layoutlm/test_modeling_layoutlm.py b/tests/models/layoutlm/test_modeling_layoutlm.py index cea811b7512e..63924d3b72d6 100644 --- a/tests/models/layoutlm/test_modeling_layoutlm.py +++ b/tests/models/layoutlm/test_modeling_layoutlm.py @@ -248,7 +248,7 @@ class LayoutLMModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase def setUp(self): self.model_tester = LayoutLMModelTester(self) - self.config_tester = ConfigTester(self, config_class=LayoutLMConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=LayoutLMConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py index 6711681ac98a..df6fa697947f 100644 --- a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py +++ b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py @@ -285,7 +285,7 @@ class LayoutLMv2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa def setUp(self): self.model_tester = LayoutLMv2ModelTester(self) - self.config_tester = ConfigTester(self, config_class=LayoutLMv2Config, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=LayoutLMv2Config, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/layoutlmv3/test_modeling_layoutlmv3.py b/tests/models/layoutlmv3/test_modeling_layoutlmv3.py index 1c7698342929..be4e87b74388 100644 --- a/tests/models/layoutlmv3/test_modeling_layoutlmv3.py +++ b/tests/models/layoutlmv3/test_modeling_layoutlmv3.py @@ -307,7 +307,7 @@ def is_pipeline_test_to_skip( def setUp(self): self.model_tester = LayoutLMv3ModelTester(self) - self.config_tester = ConfigTester(self, config_class=LayoutLMv3Config, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=LayoutLMv3Config, hidden_size=32) def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): inputs_dict = copy.deepcopy(inputs_dict) diff --git a/tests/models/lightglue/test_modeling_lightglue.py b/tests/models/lightglue/test_modeling_lightglue.py index 96a684a41def..2a8f8f752890 100644 --- a/tests/models/lightglue/test_modeling_lightglue.py +++ b/tests/models/lightglue/test_modeling_lightglue.py @@ -133,7 +133,7 @@ 
class LightGlueModelTest(ModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = LightGlueModelTester(self) - self.config_tester = ConfigTester(self, config_class=LightGlueConfig, has_text_modality=False, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=LightGlueConfig, has_text_modality=False, hidden_size=32) def test_config(self): self.config_tester.create_and_test_config_to_json_string() diff --git a/tests/models/lilt/test_modeling_lilt.py b/tests/models/lilt/test_modeling_lilt.py index 805a973a8d18..360361b5aff5 100644 --- a/tests/models/lilt/test_modeling_lilt.py +++ b/tests/models/lilt/test_modeling_lilt.py @@ -255,7 +255,7 @@ def is_pipeline_test_to_skip( def setUp(self): self.model_tester = LiltModelTester(self) - self.config_tester = ConfigTester(self, config_class=LiltConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=LiltConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/longformer/test_modeling_longformer.py b/tests/models/longformer/test_modeling_longformer.py index 0e257818ddea..5239f96a0fc4 100644 --- a/tests/models/longformer/test_modeling_longformer.py +++ b/tests/models/longformer/test_modeling_longformer.py @@ -357,7 +357,7 @@ def is_pipeline_test_to_skip( def setUp(self): self.model_tester = LongformerModelTester(self) - self.config_tester = ConfigTester(self, config_class=LongformerConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=LongformerConfig, hidden_size=32) # Without this, 0.01% failure rate. @is_flaky( diff --git a/tests/models/luke/test_modeling_luke.py b/tests/models/luke/test_modeling_luke.py index b84915346f25..f8cfec6d589a 100644 --- a/tests/models/luke/test_modeling_luke.py +++ b/tests/models/luke/test_modeling_luke.py @@ -693,7 +693,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): def setUp(self): self.model_tester = LukeModelTester(self) - self.config_tester = ConfigTester(self, config_class=LukeConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=LukeConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/lxmert/test_modeling_lxmert.py b/tests/models/lxmert/test_modeling_lxmert.py index 7ed5188c03cc..4a9113852626 100644 --- a/tests/models/lxmert/test_modeling_lxmert.py +++ b/tests/models/lxmert/test_modeling_lxmert.py @@ -543,7 +543,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): def setUp(self): self.model_tester = LxmertModelTester(self) - self.config_tester = ConfigTester(self, config_class=LxmertConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=LxmertConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/markuplm/test_modeling_markuplm.py b/tests/models/markuplm/test_modeling_markuplm.py index f76c497475e1..4ef1a898c92e 100644 --- a/tests/models/markuplm/test_modeling_markuplm.py +++ b/tests/models/markuplm/test_modeling_markuplm.py @@ -314,7 +314,7 @@ def is_pipeline_test_to_skip( def setUp(self): self.model_tester = MarkupLMModelTester(self) - self.config_tester = ConfigTester(self, config_class=MarkupLMConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=MarkupLMConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/megatron_bert/test_modeling_megatron_bert.py 
b/tests/models/megatron_bert/test_modeling_megatron_bert.py index 97651d8dae32..2222d22c527f 100644 --- a/tests/models/megatron_bert/test_modeling_megatron_bert.py +++ b/tests/models/megatron_bert/test_modeling_megatron_bert.py @@ -305,7 +305,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): def setUp(self): self.model_tester = MegatronBertModelTester(self) - self.config_tester = ConfigTester(self, config_class=MegatronBertConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=MegatronBertConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/metaclip_2/test_modeling_metaclip_2.py b/tests/models/metaclip_2/test_modeling_metaclip_2.py index fde8d700531f..496c6f181002 100644 --- a/tests/models/metaclip_2/test_modeling_metaclip_2.py +++ b/tests/models/metaclip_2/test_modeling_metaclip_2.py @@ -214,7 +214,7 @@ class MetaClip2VisionModelTest(MetaClip2ModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = MetaClip2VisionModelTester(self) self.config_tester = ConfigTester( - self, config_class=MetaClip2VisionConfig, has_text_modality=False, hidden_size=37 + self, config_class=MetaClip2VisionConfig, has_text_modality=False, hidden_size=36 ) def test_config(self): @@ -403,7 +403,7 @@ class MetaClip2TextModelTest(MetaClip2ModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = MetaClip2TextModelTester(self) - self.config_tester = ConfigTester(self, config_class=MetaClip2TextConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=MetaClip2TextConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/mimi/test_modeling_mimi.py b/tests/models/mimi/test_modeling_mimi.py index 46939bc038e6..5e48c891345d 100644 --- a/tests/models/mimi/test_modeling_mimi.py +++ b/tests/models/mimi/test_modeling_mimi.py @@ -179,7 +179,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): def setUp(self): self.model_tester = MimiModelTester(self) self.config_tester = ConfigTester( - self, config_class=MimiConfig, hidden_size=37, common_properties=[], has_text_modality=False + self, config_class=MimiConfig, hidden_size=32, common_properties=[], has_text_modality=False ) def test_config(self): diff --git a/tests/models/mobilebert/test_modeling_mobilebert.py b/tests/models/mobilebert/test_modeling_mobilebert.py index 53bff948156b..5acdfc74aa5c 100644 --- a/tests/models/mobilebert/test_modeling_mobilebert.py +++ b/tests/models/mobilebert/test_modeling_mobilebert.py @@ -301,7 +301,7 @@ def test_resize_tokens_embeddings(self): def setUp(self): self.model_tester = MobileBertModelTester(self) - self.config_tester = ConfigTester(self, config_class=MobileBertConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=MobileBertConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/modernbert/test_modeling_modernbert.py b/tests/models/modernbert/test_modeling_modernbert.py index 2901b11b40ec..7f7b5fa87f15 100644 --- a/tests/models/modernbert/test_modeling_modernbert.py +++ b/tests/models/modernbert/test_modeling_modernbert.py @@ -267,7 +267,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): def setUp(self): self.model_tester = ModernBertModelTester(self) - self.config_tester = ConfigTester(self, config_class=ModernBertConfig, hidden_size=37) + self.config_tester = 
ConfigTester(self, config_class=ModernBertConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/mpnet/test_modeling_mpnet.py b/tests/models/mpnet/test_modeling_mpnet.py index 2c33183aa94b..0db157eee9a3 100644 --- a/tests/models/mpnet/test_modeling_mpnet.py +++ b/tests/models/mpnet/test_modeling_mpnet.py @@ -216,7 +216,7 @@ class MPNetModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = MPNetModelTester(self) - self.config_tester = ConfigTester(self, config_class=MPNetConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=MPNetConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/mra/test_modeling_mra.py b/tests/models/mra/test_modeling_mra.py index 286b8a6005a5..d006947d64d3 100644 --- a/tests/models/mra/test_modeling_mra.py +++ b/tests/models/mra/test_modeling_mra.py @@ -280,7 +280,7 @@ class MraModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = MraModelTester(self) - self.config_tester = ConfigTester(self, config_class=MraConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=MraConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/musicgen/test_modeling_musicgen.py b/tests/models/musicgen/test_modeling_musicgen.py index 48f309437a7c..5760db8b3552 100644 --- a/tests/models/musicgen/test_modeling_musicgen.py +++ b/tests/models/musicgen/test_modeling_musicgen.py @@ -550,7 +550,9 @@ def get_config(self): tie_word_embeddings=False, audio_channels=self.audio_channels, ) - config = MusicgenConfig(text_encoder_config, audio_encoder_config, decoder_config) + config = MusicgenConfig( + text_encoder=text_encoder_config, audio_encoder=audio_encoder_config, decoder=decoder_config + ) return config def prepare_config_and_inputs_for_common(self): diff --git a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py index 2af4810f686f..04874c600d42 100644 --- a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py +++ b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py @@ -569,7 +569,10 @@ def get_config(self): audio_channels=self.audio_channels, ) config = MusicgenMelodyConfig( - text_encoder_config, audio_encoder_config, decoder_config, chroma_length=self.chroma_length + text_encoder=text_encoder_config, + audio_encoder=audio_encoder_config, + decoder=decoder_config, + chroma_length=self.chroma_length, ) return config diff --git a/tests/models/nystromformer/test_modeling_nystromformer.py b/tests/models/nystromformer/test_modeling_nystromformer.py index d9a1afb7c392..167ae7f78daf 100644 --- a/tests/models/nystromformer/test_modeling_nystromformer.py +++ b/tests/models/nystromformer/test_modeling_nystromformer.py @@ -241,7 +241,7 @@ class NystromformerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.Tes def setUp(self): self.model_tester = NystromformerModelTester(self) - self.config_tester = ConfigTester(self, config_class=NystromformerConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=NystromformerConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/olmo/test_modeling_olmo.py b/tests/models/olmo/test_modeling_olmo.py index c070ab62bed1..1bc4a7246698 100644 --- 
a/tests/models/olmo/test_modeling_olmo.py +++ b/tests/models/olmo/test_modeling_olmo.py @@ -177,7 +177,7 @@ class OlmoModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin def setUp(self): self.model_tester = OlmoModelTester(self) - self.config_tester = ConfigTester(self, config_class=OlmoConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=OlmoConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/olmo2/test_modeling_olmo2.py b/tests/models/olmo2/test_modeling_olmo2.py index 56f48c06d970..559b17d059d1 100644 --- a/tests/models/olmo2/test_modeling_olmo2.py +++ b/tests/models/olmo2/test_modeling_olmo2.py @@ -178,7 +178,7 @@ class Olmo2ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi def setUp(self): self.model_tester = Olmo2ModelTester(self) - self.config_tester = ConfigTester(self, config_class=Olmo2Config, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=Olmo2Config, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/olmoe/test_modeling_olmoe.py b/tests/models/olmoe/test_modeling_olmoe.py index 8da7e9b6d23e..78f2019e88b3 100644 --- a/tests/models/olmoe/test_modeling_olmoe.py +++ b/tests/models/olmoe/test_modeling_olmoe.py @@ -189,7 +189,7 @@ class OlmoeModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi def setUp(self): self.model_tester = OlmoeModelTester(self) - self.config_tester = ConfigTester(self, config_class=OlmoeConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=OlmoeConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/owlv2/test_modeling_owlv2.py b/tests/models/owlv2/test_modeling_owlv2.py index 5d95d30c7e17..729d898ef236 100644 --- a/tests/models/owlv2/test_modeling_owlv2.py +++ b/tests/models/owlv2/test_modeling_owlv2.py @@ -147,7 +147,7 @@ class Owlv2VisionModelTest(ModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = Owlv2VisionModelTester(self) self.config_tester = ConfigTester( - self, config_class=Owlv2VisionConfig, has_text_modality=False, hidden_size=37 + self, config_class=Owlv2VisionConfig, has_text_modality=False, hidden_size=32 ) def test_config(self): @@ -302,7 +302,7 @@ class Owlv2TextModelTest(ModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = Owlv2TextModelTester(self) - self.config_tester = ConfigTester(self, config_class=Owlv2TextConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=Owlv2TextConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/owlvit/test_modeling_owlvit.py b/tests/models/owlvit/test_modeling_owlvit.py index cd0ae82a94ef..0d614eeacfb2 100644 --- a/tests/models/owlvit/test_modeling_owlvit.py +++ b/tests/models/owlvit/test_modeling_owlvit.py @@ -145,7 +145,7 @@ class OwlViTVisionModelTest(ModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = OwlViTVisionModelTester(self) self.config_tester = ConfigTester( - self, config_class=OwlViTVisionConfig, has_text_modality=False, hidden_size=37 + self, config_class=OwlViTVisionConfig, has_text_modality=False, hidden_size=32 ) def test_config(self): @@ -298,7 +298,7 @@ class OwlViTTextModelTest(ModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = OwlViTTextModelTester(self) - self.config_tester = ConfigTester(self, config_class=OwlViTTextConfig, 
hidden_size=37) + self.config_tester = ConfigTester(self, config_class=OwlViTTextConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/parakeet/test_modeling_parakeet.py b/tests/models/parakeet/test_modeling_parakeet.py index 0d23383a130a..b1de3904bba0 100644 --- a/tests/models/parakeet/test_modeling_parakeet.py +++ b/tests/models/parakeet/test_modeling_parakeet.py @@ -52,7 +52,7 @@ def __init__( num_attention_heads=4, intermediate_size=256, hidden_act="silu", - dropout=0, # so gradient checkpointing doesn't fail + dropout=0.0, # so gradient checkpointing doesn't fail conv_kernel_size=9, subsampling_factor=8, subsampling_conv_channels=32, @@ -207,7 +207,7 @@ def prepare_config_and_inputs(self): return config, input_features, attention_mask def get_config(self): - return ParakeetCTCConfig.from_encoder_config( + return ParakeetCTCConfig( encoder_config=self.encoder_model_tester.get_config(), vocab_size=self.vocab_size, pad_token_id=self.pad_token_id, diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index 3aed6a70eaca..9d067a494e19 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -71,7 +71,7 @@ def __init__( seed=42, num_targets=2, mask_type="random", - random_mask_ratio=0, + random_mask_ratio=0.0, ): self.parent = parent self.batch_size = batch_size diff --git a/tests/models/perceiver/test_modeling_perceiver.py b/tests/models/perceiver/test_modeling_perceiver.py index 41dd1fa516cc..1ab3227c0660 100644 --- a/tests/models/perceiver/test_modeling_perceiver.py +++ b/tests/models/perceiver/test_modeling_perceiver.py @@ -313,7 +313,7 @@ def setUp(self): self.config_tester = ConfigTester( self, config_class=PerceiverConfig, - hidden_size=37, + hidden_size=32, common_properties=["d_model", "num_self_attention_heads", "num_cross_attention_heads"], ) diff --git a/tests/models/pix2struct/test_modeling_pix2struct.py b/tests/models/pix2struct/test_modeling_pix2struct.py index 30a0acedc28b..da1edcf51b20 100644 --- a/tests/models/pix2struct/test_modeling_pix2struct.py +++ b/tests/models/pix2struct/test_modeling_pix2struct.py @@ -149,7 +149,7 @@ class Pix2StructVisionModelTest(ModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = Pix2StructVisionModelTester(self) self.config_tester = ConfigTester( - self, config_class=Pix2StructVisionConfig, has_text_modality=False, hidden_size=37 + self, config_class=Pix2StructVisionConfig, has_text_modality=False, hidden_size=32 ) def test_config(self): @@ -309,7 +309,7 @@ class Pix2StructTextModelTest(ModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = Pix2StructTextModelTester(self) - self.config_tester = ConfigTester(self, config_class=Pix2StructTextConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=Pix2StructTextConfig, hidden_size=32) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/prompt_depth_anything/test_modeling_prompt_depth_anything.py b/tests/models/prompt_depth_anything/test_modeling_prompt_depth_anything.py index 463e591dbeac..8f03e30cce33 100644 --- a/tests/models/prompt_depth_anything/test_modeling_prompt_depth_anything.py +++ b/tests/models/prompt_depth_anything/test_modeling_prompt_depth_anything.py @@ -149,7 +149,7 @@ def setUp(self): self, config_class=PromptDepthAnythingConfig, has_text_modality=False, - hidden_size=37, + hidden_size=32, 
             common_properties=["patch_size"],
         )

diff --git a/tests/models/reformer/test_modeling_reformer.py b/tests/models/reformer/test_modeling_reformer.py
index 01044f3f70b9..20ea821126fb 100644
--- a/tests/models/reformer/test_modeling_reformer.py
+++ b/tests/models/reformer/test_modeling_reformer.py
@@ -327,9 +327,9 @@ def create_and_check_reformer_layer_dropout_seed(

     def create_and_check_reformer_feed_backward_chunking(self, config, input_ids, input_mask, choice_labels):
         # disable dropout
-        config.hidden_dropout_prob = 0
-        config.local_attention_probs_dropout_prob = 0
-        config.lsh_attention_probs_dropout_prob = 0
+        config.hidden_dropout_prob = 0.0
+        config.local_attention_probs_dropout_prob = 0.0
+        config.lsh_attention_probs_dropout_prob = 0.0
         config.lsh_num_chunks_after = 1
         config.is_decoder = False

@@ -608,7 +608,7 @@ class ReformerLocalAttnModelTest(ReformerTesterMixin, GenerationTesterMixin, Mod

     def setUp(self):
         self.model_tester = ReformerModelTester(self, text_seq_length=16)
-        self.config_tester = ConfigTester(self, config_class=ReformerConfig, hidden_size=37)
+        self.config_tester = ConfigTester(self, config_class=ReformerConfig, hidden_size=32)

     @slow
     def test_model_from_pretrained(self):
@@ -815,7 +815,7 @@ def setUp(self):
             hash_seed=0,
             num_labels=2,
         )
-        self.config_tester = ConfigTester(self, config_class=ReformerConfig, hidden_size=37)
+        self.config_tester = ConfigTester(self, config_class=ReformerConfig, hidden_size=32)

     def _check_attentions_for_generate(
         self, batch_size, attentions, prompt_length, output_length, config, decoder_past_key_values
diff --git a/tests/models/rembert/test_modeling_rembert.py b/tests/models/rembert/test_modeling_rembert.py
index 952d9cf7ab7d..a2141915e6f6 100644
--- a/tests/models/rembert/test_modeling_rembert.py
+++ b/tests/models/rembert/test_modeling_rembert.py
@@ -371,7 +371,7 @@ class RemBertModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)

     def setUp(self):
         self.model_tester = RemBertModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=RemBertConfig, hidden_size=37)
+        self.config_tester = ConfigTester(self, config_class=RemBertConfig, hidden_size=32)

     def test_config(self):
         self.config_tester.run_common_tests()
diff --git a/tests/models/roberta/test_modeling_roberta.py b/tests/models/roberta/test_modeling_roberta.py
index 086e31c2b04e..ea4ea2a62845 100644
--- a/tests/models/roberta/test_modeling_roberta.py
+++ b/tests/models/roberta/test_modeling_roberta.py
@@ -396,7 +396,7 @@ def prepare_config_and_inputs_for_generate(self, batch_size=2):

     def setUp(self):
         self.model_tester = RobertaModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=RobertaConfig, hidden_size=37)
+        self.config_tester = ConfigTester(self, config_class=RobertaConfig, hidden_size=32)

     def test_config(self):
         self.config_tester.run_common_tests()
diff --git a/tests/models/roberta_prelayernorm/test_modeling_roberta_prelayernorm.py b/tests/models/roberta_prelayernorm/test_modeling_roberta_prelayernorm.py
index 97ec0568ad3c..6945b26d65f2 100644
--- a/tests/models/roberta_prelayernorm/test_modeling_roberta_prelayernorm.py
+++ b/tests/models/roberta_prelayernorm/test_modeling_roberta_prelayernorm.py
@@ -397,7 +397,7 @@ def prepare_config_and_inputs_for_generate(self, batch_size=2):
     # Copied from tests.models.roberta.test_modeling_roberta.RobertaModelTest.setUp with Roberta->RobertaPreLayerNorm
     def setUp(self):
         self.model_tester = RobertaPreLayerNormModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=RobertaPreLayerNormConfig, hidden_size=37)
+        self.config_tester = ConfigTester(self, config_class=RobertaPreLayerNormConfig, hidden_size=32)

     # Copied from tests.models.roberta.test_modeling_roberta.RobertaModelTest.test_config
     def test_config(self):
diff --git a/tests/models/roc_bert/test_modeling_roc_bert.py b/tests/models/roc_bert/test_modeling_roc_bert.py
index 326e027ac381..2afa66651eef 100644
--- a/tests/models/roc_bert/test_modeling_roc_bert.py
+++ b/tests/models/roc_bert/test_modeling_roc_bert.py
@@ -616,7 +616,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):

     def setUp(self):
         self.model_tester = RoCBertModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=RoCBertConfig, hidden_size=37)
+        self.config_tester = ConfigTester(self, config_class=RoCBertConfig, hidden_size=32)

     def test_config(self):
         self.config_tester.run_common_tests()
diff --git a/tests/models/roformer/test_modeling_roformer.py b/tests/models/roformer/test_modeling_roformer.py
index a8b60e14617a..3750691ccd19 100644
--- a/tests/models/roformer/test_modeling_roformer.py
+++ b/tests/models/roformer/test_modeling_roformer.py
@@ -395,7 +395,7 @@ class RoFormerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase

     def setUp(self):
         self.model_tester = RoFormerModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=RoFormerConfig, hidden_size=37)
+        self.config_tester = ConfigTester(self, config_class=RoFormerConfig, hidden_size=32)

     def test_config(self):
         self.config_tester.run_common_tests()
diff --git a/tests/models/sew/test_modeling_sew.py b/tests/models/sew/test_modeling_sew.py
index 75998c11f168..ee7287d0715c 100644
--- a/tests/models/sew/test_modeling_sew.py
+++ b/tests/models/sew/test_modeling_sew.py
@@ -285,7 +285,7 @@ class SEWModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):

     def setUp(self):
         self.model_tester = SEWModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=SEWConfig, hidden_size=37)
+        self.config_tester = ConfigTester(self, config_class=SEWConfig, hidden_size=32)

     def test_config(self):
         self.config_tester.run_common_tests()
diff --git a/tests/models/sew_d/test_modeling_sew_d.py b/tests/models/sew_d/test_modeling_sew_d.py
index b0c0853a7d0a..15edfe849fec 100644
--- a/tests/models/sew_d/test_modeling_sew_d.py
+++ b/tests/models/sew_d/test_modeling_sew_d.py
@@ -306,7 +306,7 @@ class SEWDModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):

     def setUp(self):
         self.model_tester = SEWDModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=SEWDConfig, hidden_size=37)
+        self.config_tester = ConfigTester(self, config_class=SEWDConfig, hidden_size=32)

     def test_config(self):
         self.config_tester.run_common_tests()
diff --git a/tests/models/siglip/test_modeling_siglip.py b/tests/models/siglip/test_modeling_siglip.py
index ef5d69b95dd5..bb21dc021c52 100644
--- a/tests/models/siglip/test_modeling_siglip.py
+++ b/tests/models/siglip/test_modeling_siglip.py
@@ -187,7 +187,7 @@ class SiglipVisionModelTest(SiglipModelTesterMixin, unittest.TestCase):
     def setUp(self):
         self.model_tester = SiglipVisionModelTester(self)
         self.config_tester = ConfigTester(
-            self, config_class=SiglipVisionConfig, has_text_modality=False, hidden_size=37
+            self, config_class=SiglipVisionConfig, has_text_modality=False, hidden_size=32
         )

     def test_config(self):
@@ -350,7 +350,7 @@ class SiglipTextModelTest(SiglipModelTesterMixin, unittest.TestCase):
     # Copied from tests.models.clip.test_modeling_clip.CLIPTextModelTest.setUp with CLIP->Siglip
     def setUp(self):
         self.model_tester = SiglipTextModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=SiglipTextConfig, hidden_size=37)
+        self.config_tester = ConfigTester(self, config_class=SiglipTextConfig, hidden_size=32)

     # Copied from tests.models.clip.test_modeling_clip.CLIPTextModelTest.test_config
     def test_config(self):
diff --git a/tests/models/siglip2/test_modeling_siglip2.py b/tests/models/siglip2/test_modeling_siglip2.py
index 469b66194839..f7e482e40d35 100644
--- a/tests/models/siglip2/test_modeling_siglip2.py
+++ b/tests/models/siglip2/test_modeling_siglip2.py
@@ -280,7 +280,7 @@ class Siglip2VisionModelTest(Siglip2ModelTesterMixin, unittest.TestCase):
     def setUp(self):
         self.model_tester = Siglip2VisionModelTester(self)
         self.config_tester = ConfigTester(
-            self, config_class=Siglip2VisionConfig, has_text_modality=False, hidden_size=37
+            self, config_class=Siglip2VisionConfig, has_text_modality=False, hidden_size=32
         )

     def test_config(self):
@@ -422,7 +422,7 @@ class Siglip2TextModelTest(Siglip2ModelTesterMixin, unittest.TestCase):

     def setUp(self):
         self.model_tester = Siglip2TextModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=Siglip2TextConfig, hidden_size=37)
+        self.config_tester = ConfigTester(self, config_class=Siglip2TextConfig, hidden_size=32)

     def test_config(self):
         self.config_tester.run_common_tests()
diff --git a/tests/models/speecht5/test_modeling_speecht5.py b/tests/models/speecht5/test_modeling_speecht5.py
index 2715f7118bd8..60ea49def7d8 100644
--- a/tests/models/speecht5/test_modeling_speecht5.py
+++ b/tests/models/speecht5/test_modeling_speecht5.py
@@ -164,7 +164,7 @@ class SpeechT5ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase

     def setUp(self):
         self.model_tester = SpeechT5ModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=SpeechT5Config, hidden_size=37)
+        self.config_tester = ConfigTester(self, config_class=SpeechT5Config, hidden_size=32)

     def test_config(self):
         self.config_tester.run_common_tests()
@@ -341,7 +341,7 @@ class SpeechT5ForSpeechToTextTest(ModelTesterMixin, unittest.TestCase, Generatio

     def setUp(self):
         self.model_tester = SpeechT5ForSpeechToTextTester(self)
-        self.config_tester = ConfigTester(self, config_class=SpeechT5Config, hidden_size=37)
+        self.config_tester = ConfigTester(self, config_class=SpeechT5Config, hidden_size=32)

     def test_config(self):
         self.config_tester.run_common_tests()
@@ -824,7 +824,7 @@ class SpeechT5ForTextToSpeechTest(ModelTesterMixin, unittest.TestCase):

     def setUp(self):
         self.model_tester = SpeechT5ForTextToSpeechTester(self)
-        self.config_tester = ConfigTester(self, config_class=SpeechT5Config, hidden_size=37)
+        self.config_tester = ConfigTester(self, config_class=SpeechT5Config, hidden_size=32)

     def test_config(self):
         self.config_tester.run_common_tests()
@@ -1317,7 +1317,7 @@ class SpeechT5ForSpeechToSpeechTest(ModelTesterMixin, unittest.TestCase):

     def setUp(self):
         self.model_tester = SpeechT5ForSpeechToSpeechTester(self)
-        self.config_tester = ConfigTester(self, config_class=SpeechT5Config, hidden_size=37)
+        self.config_tester = ConfigTester(self, config_class=SpeechT5Config, hidden_size=32)

     def test_config(self):
         self.config_tester.run_common_tests()
diff --git a/tests/models/splinter/test_modeling_splinter.py b/tests/models/splinter/test_modeling_splinter.py
index 215ca030b23a..eb63fd9fde4c 100644
--- a/tests/models/splinter/test_modeling_splinter.py
+++ b/tests/models/splinter/test_modeling_splinter.py
@@ -270,7 +270,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):

     def setUp(self):
         self.model_tester = SplinterModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=SplinterConfig, hidden_size=37)
+        self.config_tester = ConfigTester(self, config_class=SplinterConfig, hidden_size=32)

     def test_config(self):
         self.config_tester.run_common_tests()
diff --git a/tests/models/superpoint/test_modeling_superpoint.py b/tests/models/superpoint/test_modeling_superpoint.py
index 8bbdcad22659..d6bac174e360 100644
--- a/tests/models/superpoint/test_modeling_superpoint.py
+++ b/tests/models/superpoint/test_modeling_superpoint.py
@@ -125,7 +125,7 @@ def setUp(self):
             self,
             config_class=SuperPointConfig,
             has_text_modality=False,
-            hidden_size=37,
+            hidden_size=32,
             common_properties=["encoder_hidden_sizes", "decoder_hidden_size"],
         )

diff --git a/tests/models/swiftformer/test_modeling_swiftformer.py b/tests/models/swiftformer/test_modeling_swiftformer.py
index 39d74aa68238..c1ac34da4cf4 100644
--- a/tests/models/swiftformer/test_modeling_swiftformer.py
+++ b/tests/models/swiftformer/test_modeling_swiftformer.py
@@ -150,7 +150,7 @@ def setUp(self):
             self,
             config_class=SwiftFormerConfig,
             has_text_modality=False,
-            hidden_size=37,
+            hidden_size=32,
             num_attention_heads=12,
             num_hidden_layers=12,
         )
diff --git a/tests/models/switch_transformers/test_modeling_switch_transformers.py b/tests/models/switch_transformers/test_modeling_switch_transformers.py
index 4aa37a9b9082..65162e94b6fd 100644
--- a/tests/models/switch_transformers/test_modeling_switch_transformers.py
+++ b/tests/models/switch_transformers/test_modeling_switch_transformers.py
@@ -815,8 +815,9 @@ class SwitchTransformerRouterTest(unittest.TestCase):
             num_experts=2,
             hidden_size=8,
             d_ff=16,
-            router_jitter_noise=0,
+            router_jitter_noise=0.0,
             expert_capacity=4,
+            num_heads=4,
         )

     def test_equivalency_balancy_loss(self):
diff --git a/tests/models/t5gemma/test_modeling_t5gemma.py b/tests/models/t5gemma/test_modeling_t5gemma.py
index 5e4adaeca435..70380f288a35 100644
--- a/tests/models/t5gemma/test_modeling_t5gemma.py
+++ b/tests/models/t5gemma/test_modeling_t5gemma.py
@@ -607,7 +607,7 @@ def setUp(self):
             self,
             config_class=T5GemmaConfig,
             # For faking the testing.
-            hidden_size=37,
+            hidden_size=32,
             vocab_size=self.model_tester.vocab_size,
             num_attention_heads=self.model_tester.num_attention_heads,
             num_hidden_layers=self.model_tester.num_hidden_layers,
@@ -1476,7 +1476,7 @@ def setUp(self):
             self,
             config_class=T5GemmaConfig,
             # For faking the testing.
-            hidden_size=37,
+            hidden_size=32,
             vocab_size=self.model_tester.vocab_size,
             num_attention_heads=self.model_tester.num_attention_heads,
             num_hidden_layers=self.model_tester.num_hidden_layers,
diff --git a/tests/models/timesformer/test_modeling_timesformer.py b/tests/models/timesformer/test_modeling_timesformer.py
index a0d5a8a0720c..2402cbe4e16b 100644
--- a/tests/models/timesformer/test_modeling_timesformer.py
+++ b/tests/models/timesformer/test_modeling_timesformer.py
@@ -168,7 +168,7 @@ class TimesformerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestC
     def setUp(self):
         self.model_tester = TimesformerModelTester(self)
         self.config_tester = ConfigTester(
-            self, config_class=TimesformerConfig, has_text_modality=False, hidden_size=37
+            self, config_class=TimesformerConfig, has_text_modality=False, hidden_size=32
         )

     def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
diff --git a/tests/models/tvp/test_modeling_tvp.py b/tests/models/tvp/test_modeling_tvp.py
index bc8f417f259a..9d804db3319d 100644
--- a/tests/models/tvp/test_modeling_tvp.py
+++ b/tests/models/tvp/test_modeling_tvp.py
@@ -218,7 +218,7 @@ def _validate_backbone_init():

         # Load a timm backbone
         # We hack adding hidden_sizes to the config to test the backbone loading
-        backbone_config = TimmBackboneConfig("resnet18", out_indices=[-2, -1], hidden_sizes=[64, 128])
+        backbone_config = TimmBackboneConfig(backbone="resnet18", out_indices=[-2, -1], hidden_sizes=[64, 128])
         config_dict["backbone_config"] = backbone_config
         config = config.__class__(**config_dict)
         _validate_backbone_init()
diff --git a/tests/models/udop/test_modeling_udop.py b/tests/models/udop/test_modeling_udop.py
index 9a086557340f..4d098822402a 100644
--- a/tests/models/udop/test_modeling_udop.py
+++ b/tests/models/udop/test_modeling_udop.py
@@ -283,7 +283,7 @@ class UdopModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin

     def setUp(self):
         self.model_tester = UdopModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=UdopConfig, d_model=37)
+        self.config_tester = ConfigTester(self, config_class=UdopConfig, d_model=32)

     def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
         inputs_dict = copy.deepcopy(inputs_dict)
@@ -549,7 +549,7 @@ class UdopEncoderOnlyModelTest(ModelTesterMixin, unittest.TestCase):

     def setUp(self):
         self.model_tester = UdopEncoderOnlyModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=UdopConfig, d_model=37)
+        self.config_tester = ConfigTester(self, config_class=UdopConfig, d_model=32)

     def test_config(self):
         self.config_tester.run_common_tests()
diff --git a/tests/models/unispeech/test_modeling_unispeech.py b/tests/models/unispeech/test_modeling_unispeech.py
index d0490cd4900b..2a2678ebc56b 100644
--- a/tests/models/unispeech/test_modeling_unispeech.py
+++ b/tests/models/unispeech/test_modeling_unispeech.py
@@ -316,7 +316,7 @@ def setUp(self):
         self.model_tester = UniSpeechModelTester(
             self, conv_stride=(3, 3, 3), feat_extract_norm="layer", do_stable_layer_norm=True
         )
-        self.config_tester = ConfigTester(self, config_class=UniSpeechConfig, hidden_size=37)
+        self.config_tester = ConfigTester(self, config_class=UniSpeechConfig, hidden_size=32)

     def test_config(self):
         self.config_tester.run_common_tests()
diff --git a/tests/models/unispeech_sat/test_modeling_unispeech_sat.py b/tests/models/unispeech_sat/test_modeling_unispeech_sat.py
index 8d4bd2b3aa41..7ea3bf51cebf 100644
--- a/tests/models/unispeech_sat/test_modeling_unispeech_sat.py
+++ b/tests/models/unispeech_sat/test_modeling_unispeech_sat.py
@@ -365,7 +365,7 @@ class UniSpeechSatModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.Test

     def setUp(self):
         self.model_tester = UniSpeechSatModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=UniSpeechSatConfig, hidden_size=37)
+        self.config_tester = ConfigTester(self, config_class=UniSpeechSatConfig, hidden_size=32)

     def test_config(self):
         self.config_tester.run_common_tests()
@@ -539,7 +539,7 @@ def setUp(self):
         self.model_tester = UniSpeechSatModelTester(
             self, conv_stride=(3, 3, 3), feat_extract_norm="layer", do_stable_layer_norm=True
         )
-        self.config_tester = ConfigTester(self, config_class=UniSpeechSatConfig, hidden_size=37)
+        self.config_tester = ConfigTester(self, config_class=UniSpeechSatConfig, hidden_size=32)

     def test_config(self):
         self.config_tester.run_common_tests()
diff --git a/tests/models/upernet/test_modeling_upernet.py b/tests/models/upernet/test_modeling_upernet.py
index c1c90c938758..ce0f168d3f5f 100644
--- a/tests/models/upernet/test_modeling_upernet.py
+++ b/tests/models/upernet/test_modeling_upernet.py
@@ -158,7 +158,7 @@ def setUp(self):
             self,
             config_class=UperNetConfig,
             has_text_modality=False,
-            hidden_size=37,
+            hidden_size=32,
             common_properties=["hidden_size"],
         )

diff --git a/tests/models/videomae/test_modeling_videomae.py b/tests/models/videomae/test_modeling_videomae.py
index cb71bf0060c4..10619a01ac4b 100644
--- a/tests/models/videomae/test_modeling_videomae.py
+++ b/tests/models/videomae/test_modeling_videomae.py
@@ -199,7 +199,7 @@ class VideoMAEModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase

     def setUp(self):
         self.model_tester = VideoMAEModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=VideoMAEConfig, has_text_modality=False, hidden_size=37)
+        self.config_tester = ConfigTester(self, config_class=VideoMAEConfig, has_text_modality=False, hidden_size=32)

     def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
         inputs_dict = copy.deepcopy(inputs_dict)
diff --git a/tests/models/vilt/test_modeling_vilt.py b/tests/models/vilt/test_modeling_vilt.py
index 707ebea84a1e..516f3644e34b 100644
--- a/tests/models/vilt/test_modeling_vilt.py
+++ b/tests/models/vilt/test_modeling_vilt.py
@@ -251,7 +251,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):

     def setUp(self):
         self.model_tester = ViltModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=ViltConfig, hidden_size=37)
+        self.config_tester = ConfigTester(self, config_class=ViltConfig, hidden_size=32)

     def test_config(self):
         self.config_tester.run_common_tests()
@@ -530,7 +530,7 @@ class ViltForImagesAndTextClassificationModelTest(ViltModelTest, unittest.TestCa

     def setUp(self):
         self.model_tester = ViltModelTester(self, modality_type_vocab_size=3, add_multiple_images=True, num_images=2)
-        self.config_tester = ConfigTester(self, config_class=ViltConfig, hidden_size=37)
+        self.config_tester = ConfigTester(self, config_class=ViltConfig, hidden_size=32)

     @unittest.skip(reason="We only test the model that takes in multiple images")
     def test_model(self):
diff --git a/tests/models/visual_bert/test_modeling_visual_bert.py b/tests/models/visual_bert/test_modeling_visual_bert.py
index 0fa3ad4b461b..b945c301f870 100644
--- a/tests/models/visual_bert/test_modeling_visual_bert.py
+++ b/tests/models/visual_bert/test_modeling_visual_bert.py
@@ -391,7 +391,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):

     def setUp(self):
         self.model_tester = VisualBertModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=VisualBertConfig, hidden_size=37)
+        self.config_tester = ConfigTester(self, config_class=VisualBertConfig, hidden_size=32)

     def test_attention_outputs(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
diff --git a/tests/models/vit/test_modeling_vit.py b/tests/models/vit/test_modeling_vit.py
index 55340f956c5d..b9c63332d3e0 100644
--- a/tests/models/vit/test_modeling_vit.py
+++ b/tests/models/vit/test_modeling_vit.py
@@ -207,7 +207,7 @@ class ViTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):

     def setUp(self):
         self.model_tester = ViTModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=ViTConfig, has_text_modality=False, hidden_size=37)
+        self.config_tester = ConfigTester(self, config_class=ViTConfig, has_text_modality=False, hidden_size=32)

     @unittest.skip(
         "Since `torch==2.3+cu121`, although this test passes, many subsequent tests have `CUDA error: misaligned address`."
diff --git a/tests/models/vit_mae/test_modeling_vit_mae.py b/tests/models/vit_mae/test_modeling_vit_mae.py
index 76b3c564c545..282929d54204 100644
--- a/tests/models/vit_mae/test_modeling_vit_mae.py
+++ b/tests/models/vit_mae/test_modeling_vit_mae.py
@@ -183,7 +183,7 @@ class ViTMAEModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):

     def setUp(self):
         self.model_tester = ViTMAEModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=ViTMAEConfig, has_text_modality=False, hidden_size=37)
+        self.config_tester = ConfigTester(self, config_class=ViTMAEConfig, has_text_modality=False, hidden_size=32)

     def test_config(self):
         self.config_tester.run_common_tests()
diff --git a/tests/models/vit_msn/test_modeling_vit_msn.py b/tests/models/vit_msn/test_modeling_vit_msn.py
index 4a42efb3783c..17e3c677b538 100644
--- a/tests/models/vit_msn/test_modeling_vit_msn.py
+++ b/tests/models/vit_msn/test_modeling_vit_msn.py
@@ -165,7 +165,7 @@ class ViTMSNModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):

     def setUp(self):
         self.model_tester = ViTMSNModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=ViTMSNConfig, has_text_modality=False, hidden_size=37)
+        self.config_tester = ConfigTester(self, config_class=ViTMSNConfig, has_text_modality=False, hidden_size=32)

     def test_config(self):
         self.config_tester.run_common_tests()
diff --git a/tests/models/vitdet/test_modeling_vitdet.py b/tests/models/vitdet/test_modeling_vitdet.py
index fe7261cbb6ac..73f6cf11be85 100644
--- a/tests/models/vitdet/test_modeling_vitdet.py
+++ b/tests/models/vitdet/test_modeling_vitdet.py
@@ -168,7 +168,7 @@ class VitDetModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):

     def setUp(self):
         self.model_tester = VitDetModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=VitDetConfig, has_text_modality=False, hidden_size=37)
+        self.config_tester = ConfigTester(self, config_class=VitDetConfig, has_text_modality=False, hidden_size=32)

     # TODO: Fix me (once this model gets more usage)
     @unittest.skip(reason="Does not work on the tiny model as we keep hitting edge cases.")
diff --git a/tests/models/vitmatte/test_modeling_vitmatte.py b/tests/models/vitmatte/test_modeling_vitmatte.py
index b7d843cfca1a..f9fc05db94fe 100644
--- a/tests/models/vitmatte/test_modeling_vitmatte.py
+++ b/tests/models/vitmatte/test_modeling_vitmatte.py
@@ -146,7 +146,7 @@ def setUp(self):
             self,
             config_class=VitMatteConfig,
             has_text_modality=False,
-            hidden_size=37,
+            hidden_size=32,
             common_properties=["hidden_size"],
         )

diff --git a/tests/models/vitpose/test_modeling_vitpose.py b/tests/models/vitpose/test_modeling_vitpose.py
index 0bd01698f9aa..1c1b7a5dda7b 100644
--- a/tests/models/vitpose/test_modeling_vitpose.py
+++ b/tests/models/vitpose/test_modeling_vitpose.py
@@ -156,7 +156,7 @@ class VitPoseModelTest(ModelTesterMixin, unittest.TestCase):

     def setUp(self):
         self.model_tester = VitPoseModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=VitPoseConfig, has_text_modality=False, hidden_size=37)
+        self.config_tester = ConfigTester(self, config_class=VitPoseConfig, has_text_modality=False, hidden_size=32)

     def test_config(self):
         self.config_tester.create_and_test_config_to_json_string()
diff --git a/tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py b/tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py
index a02cdaf06ab7..187d6737aaf0 100644
--- a/tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py
+++ b/tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py
@@ -126,7 +126,7 @@ class VitPoseBackboneModelTest(ModelTesterMixin, unittest.TestCase):
     def setUp(self):
         self.model_tester = VitPoseBackboneModelTester(self)
         self.config_tester = ConfigTester(
-            self, config_class=VitPoseBackboneConfig, has_text_modality=False, hidden_size=37
+            self, config_class=VitPoseBackboneConfig, has_text_modality=False, hidden_size=32
         )

     def test_config(self):
diff --git a/tests/models/vits/test_modeling_vits.py b/tests/models/vits/test_modeling_vits.py
index e799a7ca20af..bfa7cef969f5 100644
--- a/tests/models/vits/test_modeling_vits.py
+++ b/tests/models/vits/test_modeling_vits.py
@@ -167,7 +167,7 @@ class VitsModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):

     def setUp(self):
         self.model_tester = VitsModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=VitsConfig, hidden_size=37)
+        self.config_tester = ConfigTester(self, config_class=VitsConfig, hidden_size=32)

     def test_config(self):
         self.config_tester.run_common_tests()
diff --git a/tests/models/vivit/test_modeling_vivit.py b/tests/models/vivit/test_modeling_vivit.py
index 69d89c571d00..cb9d1d3d043d 100644
--- a/tests/models/vivit/test_modeling_vivit.py
+++ b/tests/models/vivit/test_modeling_vivit.py
@@ -175,7 +175,7 @@ class VivitModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):

     def setUp(self):
         self.model_tester = VivitModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=VivitConfig, has_text_modality=False, hidden_size=37)
+        self.config_tester = ConfigTester(self, config_class=VivitConfig, has_text_modality=False, hidden_size=32)

     def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
         inputs_dict = copy.deepcopy(inputs_dict)
diff --git a/tests/models/vjepa2/test_modeling_vjepa2.py b/tests/models/vjepa2/test_modeling_vjepa2.py
index 054115fa5bf8..9cb0280dec51 100644
--- a/tests/models/vjepa2/test_modeling_vjepa2.py
+++ b/tests/models/vjepa2/test_modeling_vjepa2.py
@@ -62,7 +62,7 @@ def __init__(
         num_hidden_layers=2,
         num_attention_heads=2,
         num_frames=2,
-        mlp_ratio=1,
+        mlp_ratio=1.0,
         pred_hidden_size=32,
         pred_num_attention_heads=2,
         pred_num_hidden_layers=2,
@@ -158,7 +158,7 @@ class VJEPA2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):

     def setUp(self):
         self.model_tester = VJEPA2ModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=VJEPA2Config, has_text_modality=False, hidden_size=37)
+        self.config_tester = ConfigTester(self, config_class=VJEPA2Config, has_text_modality=False, hidden_size=32)

     def test_config(self):
         self.config_tester.run_common_tests()
diff --git a/tests/models/wav2vec2/test_modeling_wav2vec2.py b/tests/models/wav2vec2/test_modeling_wav2vec2.py
index 85ec0cc6d956..1d1fddad6051 100644
--- a/tests/models/wav2vec2/test_modeling_wav2vec2.py
+++ b/tests/models/wav2vec2/test_modeling_wav2vec2.py
@@ -493,7 +493,7 @@ class Wav2Vec2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase

     def setUp(self):
         self.model_tester = Wav2Vec2ModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=Wav2Vec2Config, hidden_size=37)
+        self.config_tester = ConfigTester(self, config_class=Wav2Vec2Config, hidden_size=32)

     def test_config(self):
         self.config_tester.run_common_tests()
@@ -687,7 +687,7 @@ def setUp(self):
         self.model_tester = Wav2Vec2ModelTester(
             self, conv_stride=(3, 3, 3), feat_extract_norm="layer", do_stable_layer_norm=True
         )
-        self.config_tester = ConfigTester(self, config_class=Wav2Vec2Config, hidden_size=37)
+        self.config_tester = ConfigTester(self, config_class=Wav2Vec2Config, hidden_size=32)

     def test_config(self):
         self.config_tester.run_common_tests()
diff --git a/tests/models/wav2vec2_bert/test_modeling_wav2vec2_bert.py b/tests/models/wav2vec2_bert/test_modeling_wav2vec2_bert.py
index 71b24e406524..75c8e1bd9795 100644
--- a/tests/models/wav2vec2_bert/test_modeling_wav2vec2_bert.py
+++ b/tests/models/wav2vec2_bert/test_modeling_wav2vec2_bert.py
@@ -421,7 +421,7 @@ class Wav2Vec2BertModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.Test

     def setUp(self):
         self.model_tester = Wav2Vec2BertModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=Wav2Vec2BertConfig, hidden_size=37)
+        self.config_tester = ConfigTester(self, config_class=Wav2Vec2BertConfig, hidden_size=32)

     def test_config(self):
         self.config_tester.run_common_tests()
diff --git a/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py b/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py
index 416a6d3cb537..c2c3bddb54eb 100644
--- a/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py
+++ b/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py
@@ -412,7 +412,7 @@ class Wav2Vec2ConformerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest

     def setUp(self):
         self.model_tester = Wav2Vec2ConformerModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=Wav2Vec2ConformerConfig, hidden_size=37)
+        self.config_tester = ConfigTester(self, config_class=Wav2Vec2ConformerConfig, hidden_size=32)

     def test_config(self):
         self.config_tester.run_common_tests()
diff --git a/tests/models/wavlm/test_modeling_wavlm.py b/tests/models/wavlm/test_modeling_wavlm.py
index 247c2b3fe5d2..5d3ac85a8737 100644
--- a/tests/models/wavlm/test_modeling_wavlm.py
+++ b/tests/models/wavlm/test_modeling_wavlm.py
@@ -308,7 +308,7 @@ class WavLMModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):

     def setUp(self):
         self.model_tester = WavLMModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=WavLMConfig, hidden_size=37)
+        self.config_tester = ConfigTester(self, config_class=WavLMConfig, hidden_size=32)

     def test_config(self):
         self.config_tester.run_common_tests()
diff --git a/tests/models/x_clip/test_modeling_x_clip.py b/tests/models/x_clip/test_modeling_x_clip.py
index 5b4aac347e79..2811c80a127b 100644
--- a/tests/models/x_clip/test_modeling_x_clip.py
+++ b/tests/models/x_clip/test_modeling_x_clip.py
@@ -153,7 +153,7 @@ class XCLIPVisionModelTest(ModelTesterMixin, unittest.TestCase):
     def setUp(self):
         self.model_tester = XCLIPVisionModelTester(self)
         self.config_tester = ConfigTester(
-            self, config_class=XCLIPVisionConfig, has_text_modality=False, hidden_size=37
+            self, config_class=XCLIPVisionConfig, has_text_modality=False, hidden_size=32
         )

     def test_config(self):
@@ -395,7 +395,7 @@ class XCLIPTextModelTest(ModelTesterMixin, unittest.TestCase):

     def setUp(self):
         self.model_tester = XCLIPTextModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=XCLIPTextConfig, hidden_size=37)
+        self.config_tester = ConfigTester(self, config_class=XCLIPTextConfig, hidden_size=32)

     def test_config(self):
         self.config_tester.run_common_tests()
diff --git a/tests/models/xlm_roberta_xl/test_modeling_xlm_roberta_xl.py b/tests/models/xlm_roberta_xl/test_modeling_xlm_roberta_xl.py
index bb4ba64decc7..0e854de03ff6 100644
--- a/tests/models/xlm_roberta_xl/test_modeling_xlm_roberta_xl.py
+++ b/tests/models/xlm_roberta_xl/test_modeling_xlm_roberta_xl.py
@@ -408,7 +408,7 @@ def prepare_config_and_inputs_for_generate(self, batch_size=2):

     def setUp(self):
         self.model_tester = XLMRobertaXLModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=XLMRobertaXLConfig, hidden_size=37)
+        self.config_tester = ConfigTester(self, config_class=XLMRobertaXLConfig, hidden_size=32)

     def test_config(self):
         self.config_tester.run_common_tests()
diff --git a/tests/models/xmod/test_modeling_xmod.py b/tests/models/xmod/test_modeling_xmod.py
index d5c5ac4bc0ef..c74dbbcef12c 100644
--- a/tests/models/xmod/test_modeling_xmod.py
+++ b/tests/models/xmod/test_modeling_xmod.py
@@ -405,7 +405,7 @@ def prepare_config_and_inputs_for_generate(self, batch_size=2):

     def setUp(self):
         self.model_tester = XmodModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=XmodConfig, hidden_size=37)
+        self.config_tester = ConfigTester(self, config_class=XmodConfig, hidden_size=32)

     def test_config(self):
         self.config_tester.run_common_tests()
diff --git a/tests/models/yolos/test_modeling_yolos.py b/tests/models/yolos/test_modeling_yolos.py
index 2fc452c8bb27..bb6e3a8264e3 100644
--- a/tests/models/yolos/test_modeling_yolos.py
+++ b/tests/models/yolos/test_modeling_yolos.py
@@ -198,7 +198,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):

     def setUp(self):
         self.model_tester = YolosModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=YolosConfig, has_text_modality=False, hidden_size=37)
+        self.config_tester = ConfigTester(self, config_class=YolosConfig, has_text_modality=False, hidden_size=32)

     def test_config(self):
         self.config_tester.run_common_tests()
diff --git a/tests/models/yoso/test_modeling_yoso.py b/tests/models/yoso/test_modeling_yoso.py
index 8f32cc97adab..9e341ac49d34 100644
--- a/tests/models/yoso/test_modeling_yoso.py
+++ b/tests/models/yoso/test_modeling_yoso.py
@@ -274,7 +274,7 @@ class YosoModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):

     def setUp(self):
         self.model_tester = YosoModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=YosoConfig, hidden_size=37)
+        self.config_tester = ConfigTester(self, config_class=YosoConfig, hidden_size=32)

     def test_config(self):
         self.config_tester.run_common_tests()
diff --git a/tests/models/zamba/test_modeling_zamba.py b/tests/models/zamba/test_modeling_zamba.py
index 4575894dcb3b..d0be768d2cde 100644
--- a/tests/models/zamba/test_modeling_zamba.py
+++ b/tests/models/zamba/test_modeling_zamba.py
@@ -339,7 +339,7 @@ def _check_caches_are_equal(self, cache1: ZambaHybridDynamicCache, cache2: Zamba

     def setUp(self):
         self.model_tester = ZambaModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=ZambaConfig, hidden_size=37)
+        self.config_tester = ConfigTester(self, config_class=ZambaConfig, hidden_size=32)

     def test_config(self):
         self.config_tester.run_common_tests()
diff --git a/tests/models/zamba2/test_modeling_zamba2.py b/tests/models/zamba2/test_modeling_zamba2.py
index f80cefdba3bf..5ec05bbf368d 100644
--- a/tests/models/zamba2/test_modeling_zamba2.py
+++ b/tests/models/zamba2/test_modeling_zamba2.py
@@ -353,7 +353,7 @@ def _check_caches_are_equal(self, cache1: Zamba2HybridDynamicCache, cache2: Zamb

     def setUp(self):
         self.model_tester = Zamba2ModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=Zamba2Config, hidden_size=37)
+        self.config_tester = ConfigTester(self, config_class=Zamba2Config, hidden_size=32)

     @unittest.skip("We need at leat 3 layers to test weight tying!")
     def test_num_layers_is_small(self):
diff --git a/tests/models/zoedepth/test_modeling_zoedepth.py b/tests/models/zoedepth/test_modeling_zoedepth.py
index 3165b3b0b191..d670a4751d07 100644
--- a/tests/models/zoedepth/test_modeling_zoedepth.py
+++ b/tests/models/zoedepth/test_modeling_zoedepth.py
@@ -148,7 +148,7 @@ class ZoeDepthModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase
     def setUp(self):
         self.model_tester = ZoeDepthModelTester(self)
         self.config_tester = ConfigTester(
-            self, config_class=ZoeDepthConfig, has_text_modality=False, hidden_size=37, common_properties=[]
+            self, config_class=ZoeDepthConfig, has_text_modality=False, hidden_size=32, common_properties=[]
         )

     def test_config(self):
diff --git a/tests/test_configuration_common.py b/tests/test_configuration_common.py
index 69c42340a5eb..81663d59e3eb 100644
--- a/tests/test_configuration_common.py
+++ b/tests/test_configuration_common.py
@@ -52,30 +52,6 @@ def create_and_test_config_common_properties(self):
         for prop in common_properties:
             self.parent.assertTrue(hasattr(config, prop), msg=f"`{prop}` does not exist")

-        # Test that config has the common properties as setter
-        for idx, name in enumerate(common_properties):
-            try:
-                setattr(config, name, idx)
-                self.parent.assertEqual(
-                    getattr(config, name), idx, msg=f"`{name} value {idx} expected, but was {getattr(config, name)}"
-                )
-            except NotImplementedError:
-                # Some models might not be able to implement setters for common_properties
-                # In that case, a NotImplementedError is raised
-                pass
-
-        # Test if config class can be called with Config(prop_name=..)
-        for idx, name in enumerate(common_properties):
-            try:
-                config = self.config_class(**{name: idx})
-                self.parent.assertEqual(
-                    getattr(config, name), idx, msg=f"`{name} value {idx} expected, but was {getattr(config, name)}"
-                )
-            except NotImplementedError:
-                # Some models might not be able to implement setters for common_properties
-                # In that case, a NotImplementedError is raised
-                pass
-
     def create_and_test_config_to_json_string(self):
         config = self.config_class(**self.inputs_dict)
         obj = json.loads(config.to_json_string())
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index 5503978fcacf..adf62c59dc2e 100755
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -630,7 +630,13 @@ def _test_eager_matches_batched_and_grouped_inference(self, name, dtype):
 def _config_zero_init(config):
     configs_no_init = copy.deepcopy(config)
     for key in configs_no_init.__dict__:
-        if "_range" in key or "_std" in key or "initializer_factor" in key or "layer_scale" in key:
+        if (
+            "init_range" in key
+            or "initializer_range" in key
+            or "_std" in key
+            or "initializer_factor" in key
+            or ("layer_scale" in key and key != "use_layer_scale")
+        ):
             setattr(configs_no_init, key, 1e-10)
         if isinstance(getattr(configs_no_init, key, None), PreTrainedConfig):
             no_init_subconfig = _config_zero_init(getattr(configs_no_init, key))
@@ -4248,9 +4254,9 @@ def _prepare_config_headdim(config, requested_dim):
 def update_config_headdim(config, requested_dim):
     # Flex Attention cannot use dropout
     if hasattr(config, "attention_dropout"):
-        config.attention_dropout = 0
+        config.attention_dropout = 0.0
     if hasattr(config, "attention_probs_dropout_prob"):
-        config.attention_probs_dropout_prob = 0
+        config.attention_probs_dropout_prob = 0.0

     # Update the head dim and try to update hidden size as well if present in config
     # NOTE: some models may have none if the values in sub-config, thus we check for `Noneness`
diff --git a/tests/utils/test_configuration_utils.py b/tests/utils/test_configuration_utils.py
index b27a8e4a04d9..a6d91e9ac93c 100644
--- a/tests/utils/test_configuration_utils.py
+++ b/tests/utils/test_configuration_utils.py
@@ -144,13 +144,13 @@ def test_config_common_kwargs_is_complete(self):
         self.assertListEqual(
             missing_keys,
             [
-                "_output_attentions",
                 "is_encoder_decoder",
+                "tokenizer_class",
                 "_name_or_path",
                 "_commit_hash",
+                "_output_attentions",
                 "_attn_implementation_internal",
                 "_experts_implementation_internal",
-                "transformers_version",
             ],
         )
         keys_with_defaults = [key for key, value in config_common_kwargs.items() if value == getattr(base_config, key)]
@@ -282,16 +282,16 @@ def test_get_text_config(self):
         config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-bart")
         self.assertEqual(config.get_text_config(), config)
         # both encoder_layers and decoder_layers exist
-        self.assertTrue(getattr(config, "encoder_layers", None) is not None)
-        self.assertTrue(getattr(config, "decoder_layers", None) is not None)
+        self.assertTrue(getattr(config, "encoder_ffn_dim", None) is not None)
+        self.assertTrue(getattr(config, "decoder_ffn_dim", None) is not None)
+
         decoder_config = config.get_text_config(decoder=True)
         self.assertNotEqual(decoder_config, config)
         self.assertEqual(decoder_config.num_hidden_layers, config.decoder_layers)
-        self.assertTrue(getattr(decoder_config, "encoder_layers", None) is None)  # encoder_layers is removed
+
         encoder_config = config.get_text_config(encoder=True)
         self.assertNotEqual(encoder_config, config)
         self.assertEqual(encoder_config.num_hidden_layers, config.encoder_layers)
-        self.assertTrue(getattr(encoder_config, "decoder_layers", None) is None)  # decoder_layers is removed

     @require_torch
     def test_bc_torch_dtype(self):
diff --git a/tests/utils/test_generic.py b/tests/utils/test_generic.py
index 73d8fec05f23..50acbb200ff2 100644
--- a/tests/utils/test_generic.py
+++ b/tests/utils/test_generic.py
@@ -284,7 +284,7 @@ def test_decorator_eager(self):
             )
             if config_return_dict is None and return_dict is None:
                 expected_type = tuple
-            message = f"output should be a {expected_type.__name__} when config.use_return_dict={config_return_dict} and return_dict={return_dict}"
+            message = f"output should be a {expected_type.__name__} when config.return_dict={config_return_dict} and return_dict={return_dict}"
             self.assertIsInstance(output, expected_type, message)

     @pytest.mark.torch_compile_test
diff --git a/tests/utils/test_modeling_rope_utils.py b/tests/utils/test_modeling_rope_utils.py
index 04de261cd6c5..1699a1ad74ba 100644
--- a/tests/utils/test_modeling_rope_utils.py
+++ b/tests/utils/test_modeling_rope_utils.py
@@ -63,14 +63,15 @@ def test_rope_validation(self):

         # Any other parameters passed to RoPE will raise a warning that a particular key is not used
         # But sometimes we can have model-specific RoPE kwargs and bypass warning with `ignore_keys`
-        model_specific_kwarg = "mrope_sections"  # e,g in Qwen2-VL
+        config.ignore_keys_at_rope_validation = {"mrope_sections"}  # e,g in Qwen2-VL
+        config.rope_parameters = {"rope_type": "default", "rope_theta": 10000.0, "mrope_sections": True}
+        config.validate_rope()
-        config.rope_parameters = {"rope_type": "default", "rope_theta": 10000.0, model_specific_kwarg: True}
-        config.validate_rope(ignore_keys={model_specific_kwarg})

         with self.assertLogs("transformers.modeling_rope_utils", level="WARNING") as logs:
+            config.ignore_keys_at_rope_validation = set()
             config.validate_rope()
         self.assertEqual(len(logs.output), 1)
-        self.assertIn(model_specific_kwarg, logs.output[0])
+        self.assertIn("mrope_sections", logs.output[0])

         # We can indicate Different RoPE params for each attention type
         # We can also have only one RoPE params defined for all layer, we don't raise an error
diff --git a/utils/check_config_attributes.py b/utils/check_config_attributes.py
index 3b3be837e471..64ef814f0b48 100644
--- a/utils/check_config_attributes.py
+++ b/utils/check_config_attributes.py
@@ -33,6 +33,7 @@
 # Usually of small list of allowed attrs, but can be True to allow all
 SPECIAL_CASES_TO_ALLOW = {
     "PI0Config": ["vlm_projection_dim"],
+    "EuroBertConfig": ["is_causal"],  # not used directly, allows causal-bidirectional switch
     "Ernie4_5_VL_MoeConfig": ["args"],  # BC Alias
     "Ernie4_5_VL_MoeTextConfig": ["args"],  # BC Alias
     "Ernie4_5_VL_MoeVisionConfig": ["args"],  # BC Alias
@@ -91,6 +92,8 @@
     "HiggsAudioV2TokenizerConfig": ["downsample_factor"],
     "CsmConfig": ["tie_codebooks_embeddings"],
     "DeepseekV2Config": ["norm_topk_prob"],
+    "EsmFoldConfig": ["esm_ablate_pairwise", "esm_ablate_sequence", "esm_input_dropout", "esm_type"],
+    "TrunkConfig": ["cpu_grad_checkpoint", "layer_drop"],
     "SeamlessM4TConfig": True,
     "SeamlessM4Tv2Config": True,
     "ConditionalDetrConfig": True,
@@ -139,6 +142,16 @@

 # Common and important attributes, even if they do not always appear in the modeling files (can be a regex pattern)
 ATTRIBUTES_TO_ALLOW = (
+    # Attr in base `PreTrainedConfig`
+    "chunk_size_feed_forward",
+    "dtype",
+    "id2label",
+    "label2id",
+    "problem_type",
+    "tokenizer_class",
+    "is_encoder_decoder",
+    "output_hidden_states",
+    "return_dict",
     # Inits related
     "initializer_range",
     "init_std",
diff --git a/utils/modular_model_converter.py b/utils/modular_model_converter.py
index f39f6c046d15..f855d4c1392a 100644
--- a/utils/modular_model_converter.py
+++ b/utils/modular_model_converter.py
@@ -975,6 +975,9 @@ def replace_class_node(
     modular_class_attributes = {}
     for node in modular_class_node.body.body:
         if m.matches(node, m.SimpleStatementLine(body=[m.Assign()])):
+            if hasattr(node.body[0].value, "func") and node.body[0].value.func.value == "AttributeError":
+                original_modeling_class_attributes.pop(node.body[0].targets[0].target.value)
+                continue  # delete unnecessary cls attribute, especially in configs
             modular_class_attributes[node.body[0].targets[0].target.value] = node
         elif m.matches(node, m.SimpleStatementLine(body=[m.AnnAssign()])):
             modular_class_attributes[node.body[0].target.value] = node