-
Notifications
You must be signed in to change notification settings - Fork 32.9k
Fix vllm cis #45139
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix vllm cis #45139
Changes from all commits
f36ea56
013f76a
7a6f698
04fb689
1b53a3f
4895e1f
b3febee
411eb60
55809c8
01cdc60
0d8628b
d3484b7
60fd4a1
98ae737
814c3a8
0889375
8ea59cb
1852250
9a77627
db0529e
1d81931
a104d58
30f6d73
12bf667
907402a
f924677
4cd3b43
ae4860e
c8985c5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -143,7 +143,7 @@ def _compute_linear_scaling_rope_parameters( | |
| The model configuration. This function assumes that the config will provide at least the following | ||
| properties: | ||
|
|
||
| * rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived. | ||
| * rope_theta (`float`, *optional*): The base wavelength from which the inverse frequencies will be derived. Defaults to `config.default_theta` if omitted. | ||
| * hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly. | ||
| * num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly. | ||
|
|
||
|
|
@@ -199,7 +199,7 @@ def _compute_proportional_rope_parameters( | |
| The model configuration. This function assumes that the config will provide at least the following | ||
| properties: | ||
|
|
||
| * rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived. | ||
| * rope_theta (`float`, *optional*): The base wavelength from which the inverse frequencies will be derived. Defaults to `config.default_theta` if omitted. | ||
| * hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly. | ||
| * num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly. | ||
|
|
||
|
|
@@ -268,7 +268,7 @@ def _compute_dynamic_ntk_parameters( | |
| The model configuration. This function assumes that the config will provide at least the following | ||
| properties: | ||
|
|
||
| * rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived. | ||
| * rope_theta (`float`, *optional*): The base wavelength from which the inverse frequencies will be derived. Defaults to `config.default_theta` if omitted. | ||
| * hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly. | ||
| * num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly. | ||
| * max_position_embeddings (`int`): The default sequence length used to update the dynamic RoPE at | ||
|
|
@@ -339,7 +339,7 @@ def _compute_yarn_parameters( | |
| The model configuration. This function assumes that the config will provide at least the following | ||
| properties: | ||
|
|
||
| * rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived. | ||
| * rope_theta (`float`, *optional*): The base wavelength from which the inverse frequencies will be derived. Defaults to `config.default_theta` if omitted. | ||
| * hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly. | ||
| * num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly. | ||
| * max_position_embeddings (`int`): The maximum length of the positional embeddings. | ||
|
|
@@ -474,7 +474,7 @@ def _compute_longrope_parameters( | |
| The model configuration. This function assumes that the config will provide at least the following | ||
| properties: | ||
|
|
||
| * rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived. | ||
| * rope_theta (`float`, *optional*): The base wavelength from which the inverse frequencies will be derived. Defaults to `config.default_theta` if omitted. | ||
| * hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly. | ||
| * num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly. | ||
| * max_position_embeddings (`int`): The maximum length of the positional embeddings. | ||
|
|
@@ -561,7 +561,7 @@ def _compute_llama3_parameters( | |
| The model configuration. This function assumes that the config will provide at least the following | ||
| properties: | ||
|
|
||
| * rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived. | ||
| * rope_theta (`float`, *optional*): The base wavelength from which the inverse frequencies will be derived. Defaults to `config.default_theta` if omitted. | ||
| * hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly. | ||
| * num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly. | ||
| * rope_parameters (`dict[str, float | int]`): The standard RoPE scaling parameters, from which the following | ||
|
|
@@ -642,8 +642,9 @@ def _compute_llama3_parameters( | |
| class RopeParameters(TypedDict): | ||
| """ | ||
| Args: | ||
| rope_theta (`float`): | ||
| The base period of the RoPE embeddings. | ||
| rope_theta (`float`, *optional*, defaults to `RotaryEmbeddingConfigMixin.default_theta`): | ||
| The base period of the RoPE embeddings. Optional in serialized configs — if omitted, | ||
| the model's `default_theta` (typically 10000.0) is used. | ||
| rope_type (`str`, *optional*, defaults to "default"): | ||
| The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', | ||
| 'llama3'], with 'default' being the original RoPE implementation. | ||
|
|
@@ -680,7 +681,7 @@ class RopeParameters(TypedDict): | |
| Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE | ||
| """ | ||
|
|
||
| rope_theta: float | ||
| rope_theta: float | None | ||
| rope_type: str | None | ||
| partial_rotary_factor: float | None | ||
| factor: float | None | ||
|
|
@@ -801,34 +802,44 @@ def validate_rope(self: "PreTrainedConfig"): | |
| ) | ||
|
|
||
| def _validate_default_rope_parameters(self, rope_parameters: dict, ignore_keys: set | None = None): | ||
| required_keys = {"rope_type", "rope_theta"} | ||
| required_keys = {"rope_type"} | ||
| optional_keys = {"rope_theta"} | ||
|
Comment on lines
+805
to
+806
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. yep, this kind of defeats the point of validation, because a RoPE dict with no theta isn't valid for our modules
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. yeah, but we always default to `default_theta` if it's not there, no?
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. validation always happens after the defaults are set, so ideally it shouldn't raise an error. Do we know why the theta was missing? |
||
| received_keys = set(rope_parameters.keys()) | ||
| rope_type = rope_parameters["rope_type"] | ||
| self._check_received_keys(rope_type, received_keys, required_keys, ignore_keys=ignore_keys) | ||
| self._check_received_keys( | ||
| rope_type, received_keys, required_keys, optional_keys=optional_keys, ignore_keys=ignore_keys | ||
| ) | ||
|
|
||
| def _validate_linear_rope_parameters(self, rope_parameters: dict, ignore_keys: set | None = None): | ||
| required_keys = {"rope_type", "factor", "rope_theta"} | ||
| required_keys = {"rope_type", "factor"} | ||
| optional_keys = {"rope_theta"} | ||
| received_keys = set(rope_parameters.keys()) | ||
| rope_type = rope_parameters["rope_type"] | ||
| self._check_received_keys(rope_type, received_keys, required_keys, ignore_keys=ignore_keys) | ||
| self._check_received_keys( | ||
| rope_type, received_keys, required_keys, optional_keys=optional_keys, ignore_keys=ignore_keys | ||
| ) | ||
|
|
||
| factor = rope_parameters["factor"] | ||
| if factor is None or not isinstance(factor, float) or factor < 1.0: | ||
| logger.warning(f"`rope_parameters`'s factor field must be a float >= 1, got {factor}") | ||
|
|
||
| def _validate_dynamic_rope_parameters(self, rope_parameters: dict, ignore_keys: set | None = None): | ||
| required_keys = {"rope_type", "factor", "rope_theta"} | ||
| required_keys = {"rope_type", "factor"} | ||
| optional_keys = {"rope_theta"} | ||
| received_keys = set(rope_parameters.keys()) | ||
| rope_type = rope_parameters["rope_type"] | ||
| self._check_received_keys(rope_type, received_keys, required_keys, ignore_keys=ignore_keys) | ||
| self._check_received_keys( | ||
| rope_type, received_keys, required_keys, optional_keys=optional_keys, ignore_keys=ignore_keys | ||
| ) | ||
|
|
||
| factor = rope_parameters["factor"] | ||
| if factor is None or not isinstance(factor, float) or factor < 1.0: | ||
| logger.warning(f"`rope_parameters`'s factor field must be a float >= 1, got {factor}") | ||
|
|
||
| def _validate_yarn_rope_parameters(self, rope_parameters: dict, ignore_keys: set | None = None): | ||
| required_keys = {"rope_type", "factor", "rope_theta", "original_max_position_embeddings"} | ||
| required_keys = {"rope_type", "factor", "original_max_position_embeddings"} | ||
| optional_keys = { | ||
| "rope_theta", | ||
| "attention_factor", | ||
| "beta_fast", | ||
| "beta_slow", | ||
|
|
@@ -878,8 +889,8 @@ def _validate_yarn_rope_parameters(self, rope_parameters: dict, ignore_keys: set | |
| ) | ||
|
|
||
| def _validate_longrope_rope_parameters(self, rope_parameters: dict, ignore_keys: set | None = None): | ||
| required_keys = {"rope_type", "short_factor", "long_factor", "rope_theta", "original_max_position_embeddings"} | ||
| optional_keys = {"attention_factor", "factor"} | ||
| required_keys = {"rope_type", "short_factor", "long_factor", "original_max_position_embeddings"} | ||
| optional_keys = {"rope_theta", "attention_factor", "factor"} | ||
| received_keys = set(rope_parameters.keys()) | ||
| rope_type = rope_parameters["rope_type"] | ||
| self._check_received_keys(rope_type, received_keys, required_keys, optional_keys, ignore_keys=ignore_keys) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3618,7 +3618,10 @@ def get_init_context( | |
| elif is_quantized: | ||
| init_contexts.extend([torch.device("meta"), set_quantized_state()]) | ||
| else: | ||
| init_contexts.append(torch.device("meta")) | ||
| # meta_device_safe_creation_ops patches torch.linspace to default to CPU | ||
| # so that custom models calling .item() during __init__ (e.g. drop-path | ||
| # schedules) don't crash on meta tensors. | ||
| init_contexts.extend([torch.device("meta"), init.meta_device_safe_creation_ops()]) | ||
|
|
||
| return init_contexts | ||
|
|
||
|
|
@@ -4612,7 +4615,7 @@ def mark_tied_weights_as_initialized(self, loading_info): | |
| later as they will be tied (overwritten) anyway. | ||
| This is very important as most embeddings are tied, and they are huge params (vocabularies are often 256k), so | ||
| running inits on them is very costly.""" | ||
| for tied_param in self.all_tied_weights_keys.keys(): | ||
| for tied_param in getattr(self, "all_tied_weights_keys", {}).keys(): | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fix remote code |
||
| param = self.get_parameter(tied_param) | ||
| param._is_hf_initialized = True | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment.
The reason will be displayed to describe this comment to others. Learn more.
This likely needs to be done elsewhere too, then? I think all of these need `rope_theta`?