-
-
Notifications
You must be signed in to change notification settings - Fork 15.8k
Revert "Various Transformers v5 fixes" (#38127) #38229
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -114,9 +114,6 @@ def __init__( | |||||||||||||||||||
| self.projector_config = MlpProjectorConfig(**projector_config) | ||||||||||||||||||||
|
|
||||||||||||||||||||
| language_config = kwargs.get("language_config", {}) | ||||||||||||||||||||
| # remove kv_lora_rank if not specified, passing None is prohibited | ||||||||||||||||||||
| if language_config.get("kv_lora_rank") is None: | ||||||||||||||||||||
| language_config.pop("kv_lora_rank", None) | ||||||||||||||||||||
| self.text_config = DeepseekV2Config(**language_config) | ||||||||||||||||||||
|
Comment on lines
116
to
117
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This revert re-introduces a potential bug. If The original change attempted to fix this by removing the key, but that seems to have caused other issues, likely due to in-place modification of the Consider this alternative which should be safer and prevent both the
Suggested change
|
||||||||||||||||||||
|
|
||||||||||||||||||||
| self.tile_tag = tile_tag | ||||||||||||||||||||
|
|
||||||||||||||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,83 @@ | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project | ||
|
|
||
| from transformers.configuration_utils import PretrainedConfig | ||
|
|
||
|
|
||
| class Olmo3Config(PretrainedConfig): | ||
| model_type = "olmo3" | ||
| keys_to_ignore_at_inference = ["past_key_values"] | ||
|
|
||
| def __init__( | ||
| self, | ||
| vocab_size=50304, | ||
| hidden_size=4096, | ||
| intermediate_size=11008, | ||
| num_hidden_layers=32, | ||
| num_attention_heads=32, | ||
| num_key_value_heads=None, | ||
| hidden_act="silu", | ||
| max_position_embeddings=2048, | ||
| initializer_range=0.02, | ||
| use_cache=True, | ||
| pad_token_id=1, | ||
| bos_token_id=None, | ||
| eos_token_id=50279, | ||
| tie_word_embeddings=False, | ||
| rope_parameters=None, | ||
| attention_bias=False, | ||
| attention_dropout=0.0, | ||
| rms_norm_eps=1e-5, | ||
| sliding_window=4096, | ||
| layer_types=None, | ||
| **kwargs, | ||
| ): | ||
| # This model uses Olmo3ForCausalLM in transformers but Olmo2ForCausalLM | ||
| # in vLLM. | ||
| if "architectures" not in kwargs: | ||
| kwargs["architectures"] = ["Olmo2ForCausalLM"] | ||
| elif "Olmo3ForCausalLM" in kwargs["architectures"]: | ||
| kwargs["architectures"].remove("Olmo3ForCausalLM") | ||
| kwargs["architectures"].append("Olmo2ForCausalLM") | ||
|
|
||
| super().__init__( | ||
| pad_token_id=pad_token_id, | ||
| bos_token_id=bos_token_id, | ||
| eos_token_id=eos_token_id, | ||
| tie_word_embeddings=tie_word_embeddings, | ||
| **kwargs, | ||
| ) | ||
| self.vocab_size = vocab_size | ||
| self.max_position_embeddings = max_position_embeddings | ||
| self.hidden_size = hidden_size | ||
| self.intermediate_size = intermediate_size | ||
| self.num_hidden_layers = num_hidden_layers | ||
| self.num_attention_heads = num_attention_heads | ||
|
|
||
| # for backward compatibility | ||
| if num_key_value_heads is None: | ||
| num_key_value_heads = num_attention_heads | ||
|
|
||
| self.num_key_value_heads = num_key_value_heads | ||
| self.hidden_act = hidden_act | ||
| self.initializer_range = initializer_range | ||
| self.use_cache = use_cache | ||
| # Try to set `rope_scaling` if available, otherwise use `rope_parameters` | ||
| rope_scaling = kwargs.pop("rope_scaling", None) | ||
| rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"} | ||
| rope_theta = kwargs.pop("rope_theta", 10000.0) | ||
| if "rope_theta" not in rope_parameters: | ||
| rope_parameters["rope_theta"] = rope_theta | ||
| self.rope_parameters = rope_parameters | ||
| self.attention_bias = attention_bias | ||
| self.attention_dropout = attention_dropout | ||
|
|
||
| self.rms_norm_eps = rms_norm_eps | ||
|
|
||
| self.sliding_window = sliding_window | ||
| self.layer_types = layer_types | ||
| if self.layer_types is None: | ||
| self.layer_types = [ | ||
| "sliding_attention" if (i + 1) % 4 != 0 else "full_attention" | ||
| for i in range(self.num_hidden_layers) | ||
| ] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This revert re-introduces an issue where
sliding_window=0is not correctly handled as 'disabled'. Some models use0to indicate that sliding window attention is disabled, but vLLM expectsNone. Without this conversion, asliding_windowof0might be passed to attention layers, which could lead to incorrect behavior or errors.The original change seems correct and it's likely this was reverted as part of a larger revert. It should be re-introduced to prevent issues with models that use
sliding_window: 0.