-
Notifications
You must be signed in to change notification settings - Fork 33.7k
Fix vllm cis #45139
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix vllm cis #45139
Changes from 1 commit
f36ea56
013f76a
7a6f698
04fb689
1b53a3f
4895e1f
b3febee
411eb60
55809c8
01cdc60
0d8628b
d3484b7
60fd4a1
98ae737
814c3a8
0889375
8ea59cb
1852250
9a77627
db0529e
1d81931
a104d58
30f6d73
12bf667
907402a
f924677
4cd3b43
ae4860e
c8985c5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -555,10 +555,47 @@ def _compute_llama3_parameters( | |
| return inv_freq_llama, attention_factor | ||
|
|
||
|
|
||
| def _compute_default_rope_parameters( | ||
| config: Optional["PreTrainedConfig"] = None, | ||
| device: Optional["torch.device"] = None, | ||
| seq_len: int | None = None, | ||
| layer_type: str | None = None, | ||
| ) -> tuple["torch.Tensor", float]: | ||
| """ | ||
| Computes the inverse frequencies for the default RoPE implementation (no scaling). | ||
|
|
||
| Args: | ||
| config ([`~transformers.PreTrainedConfig`]): | ||
| The model configuration. | ||
| device (`torch.device`): | ||
| The device to use for initialization of the inverse frequencies. | ||
| seq_len (`int`, *optional*): | ||
| The current sequence length. Unused for this type of RoPE. | ||
| layer_type (`str`, *optional*): | ||
| The layer type for per-layer rope configs. | ||
|
|
||
| Returns: | ||
| Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the | ||
| post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). | ||
| """ | ||
| config.standardize_rope_params() | ||
| rope_parameters_dict = config.rope_parameters[layer_type] if layer_type is not None else config.rope_parameters | ||
|
|
||
| base = rope_parameters_dict.get("rope_theta", getattr(config, "rope_theta", config.default_theta)) | ||
| partial_rotary_factor = rope_parameters_dict.get("partial_rotary_factor", 1.0) | ||
| head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads | ||
| dim = int(head_dim * partial_rotary_factor) | ||
| attention_factor = 1.0 # Unused in this type of RoPE | ||
|
|
||
| inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim)) | ||
| return inv_freq, attention_factor | ||
|
|
||
|
|
||
| # This maps the "rope_type" string field in rope config to the corresponding function to compute the RoPE parameters | ||
| # from the model config. You can append new {'rope_type': callable} pairs to this rope_parameters to enable custom RoPE | ||
| # parameterizations, as long as the callable has the same signature. | ||
| ROPE_INIT_FUNCTIONS = { | ||
| "default": _compute_default_rope_parameters, | ||
| "linear": _compute_linear_scaling_rope_parameters, | ||
| "dynamic": _compute_dynamic_ntk_parameters, | ||
| "yarn": _compute_yarn_parameters, | ||
|
|
@@ -699,10 +736,17 @@ def standardize_rope_params(self): | |
|
|
||
| self.rope_parameters = rope_parameters | ||
|
|
||
| def validate_rope(self: "PreTrainedConfig"): | ||
| def validate_rope(self: "PreTrainedConfig", ignore_keys: set | None = None): | ||
| """ | ||
| Validate the RoPE config arguments, given a `"PreTrainedConfig"` object | ||
|
|
||
| Args: | ||
| ignore_keys (`set`, *optional*): | ||
| Keys to ignore during validation. If provided, sets `ignore_keys_at_rope_validation` on the config. | ||
| Deprecated: set `config.ignore_keys_at_rope_validation` directly instead. | ||
| """ | ||
| if ignore_keys is not None: | ||
| self.ignore_keys_at_rope_validation = self.ignore_keys_at_rope_validation | ignore_keys | ||
| # Don't validate if no rope_parameters found (`None`) or if it's an empty dict | ||
| # Note that validation runs every time a new config is created, even if config is non-RoPE | ||
| rope_parameters_dict = getattr(self, "rope_parameters", None) | ||
|
|
@@ -729,10 +773,13 @@ def validate_rope(self: "PreTrainedConfig"): | |
| ) | ||
|
|
||
| def _validate_default_rope_parameters(self, rope_parameters: dict, ignore_keys: set | None = None): | ||
| required_keys = {"rope_type", "rope_theta"} | ||
| required_keys = {"rope_type"} | ||
| optional_keys = {"rope_theta"} | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This likely needs to be done elsewhere too, then? I think all of them need rope theta?
Comment on lines
+805
to
+806
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yep, this kind of defeats the point of validation, because a RoPE dict with no theta isn't valid for our modules.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yeah, but we always default to `default_theta` if it's not there, no?
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Validation always happens after the defaults are set, so ideally it shouldn't raise an error. Do we know why the theta was missing? |
||
| received_keys = set(rope_parameters.keys()) | ||
| rope_type = rope_parameters["rope_type"] | ||
| self._check_received_keys(rope_type, received_keys, required_keys, ignore_keys=ignore_keys) | ||
| self._check_received_keys( | ||
| rope_type, received_keys, required_keys, optional_keys=optional_keys, ignore_keys=ignore_keys | ||
| ) | ||
|
|
||
| def _validate_linear_rope_parameters(self, rope_parameters: dict, ignore_keys: set | None = None): | ||
| required_keys = {"rope_type", "factor", "rope_theta"} | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -4610,7 +4610,7 @@ def mark_tied_weights_as_initialized(self, loading_info): | |
| later as they will be tied (overwritten) anyway. | ||
| This is very important as most embeddings are tied, and they are huge params (vocabularies are often 256k), so | ||
| running inits on them is very costly.""" | ||
| for tied_param in self.all_tied_weights_keys.keys(): | ||
| for tied_param in getattr(self, "all_tied_weights_keys", {}).keys(): | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fix remote code |
||
| param = self.get_parameter(tied_param) | ||
| param._is_hf_initialized = True | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1277,7 +1277,9 @@ def __getattr__(self, key): | |
|
|
||
| # Named special tokens (bos_token, eos_token, etc.) | ||
| if key_without_id in self.SPECIAL_TOKENS_ATTRIBUTES: | ||
| token_value = self._special_tokens_map.get(key_without_id) | ||
| # Use __dict__.get to avoid recursive __getattr__ when _special_tokens_map | ||
| # is not yet initialized (e.g. during fast tokenizer __init__) | ||
| token_value = self.__dict__.get("_special_tokens_map", {}).get(key_without_id) | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same easy fix for remote code |
||
| if token_value is None: | ||
| if self.verbose: | ||
| logger.error(f"Using {key}, but it is not set yet.") | ||
|
|
@@ -1286,7 +1288,7 @@ def __getattr__(self, key): | |
|
|
||
| # Extra special tokens | ||
| if key_without_id == "extra_special_tokens": | ||
| tokens = [str(tok) for tok in self._extra_special_tokens] | ||
| tokens = [str(tok) for tok in self.__dict__.get("_extra_special_tokens", [])] | ||
| return self.convert_tokens_to_ids(tokens) if key != key_without_id else tokens | ||
|
|
||
| if key not in self.__dict__: | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
remote code BC?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
cc @zucchini-nlp
It is true that remote code won't have it, but we would likely also need to refactor a lot of models, which seems risky, especially for models that have a different default init; we would need to check whether such code exists first and then use it as a fallback.