diff --git a/tests/models/language/pooling/test_nomic_max_model_len.py b/tests/models/language/pooling/test_nomic_max_model_len.py index b5630ca968ce..6ea29f6d0677 100644 --- a/tests/models/language/pooling/test_nomic_max_model_len.py +++ b/tests/models/language/pooling/test_nomic_max_model_len.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project # ruff: noqa: SIM117 -from typing import Any import pytest @@ -40,8 +39,8 @@ def test_default(model_info, vllm_runner): # For nomic-embed-text-v2-moe the length is set to 512 # by sentence_bert_config.json. assert model_config.max_model_len == 512 - else: - assert model_config.max_model_len == original_max_position_embeddings + if model_info.name == "nomic-ai/nomic-embed-text-v1": + assert model_config.max_model_len == 8192 @pytest.mark.parametrize("model_info", MODELS) @@ -56,10 +55,9 @@ def test_set_max_model_len_legal(model_info, vllm_runner): model_config = vllm_model.llm.llm_engine.model_config assert model_config.max_model_len == 256 - # set 512 < max_model_len <= 2048 + # For nomic-embed-text-v2-moe the length is set to 512 + # by sentence_bert_config.json. if model_info.name == "nomic-ai/nomic-embed-text-v2-moe": - # For nomic-embed-text-v2-moe the length is set to 512 - # by sentence_bert_config.json. with pytest.raises(ValueError): with vllm_runner( model_info.name, @@ -68,40 +66,27 @@ def test_set_max_model_len_legal(model_info, vllm_runner): max_model_len=1024, ): pass - else: - with vllm_runner( - model_info.name, - revision=model_info.revision, - runner="pooling", - max_model_len=1024, - ) as vllm_model: - model_config = vllm_model.llm.llm_engine.model_config - assert model_config.max_model_len == 1024 + return + # set 512 < max_model_len <= 2048 + with vllm_runner( + model_info.name, + revision=model_info.revision, + runner="pooling", + max_model_len=1024, + ) as vllm_model: + model_config = vllm_model.llm.llm_engine.model_config + assert model_config.max_model_len == 1024 -@pytest.mark.parametrize("model_info", MODELS) -def test_set_max_model_len_illegal(model_info, vllm_runner): # set max_model_len > 2048 - with pytest.raises(ValueError): - with vllm_runner( - model_info.name, - revision=model_info.revision, - runner="pooling", - max_model_len=4096, - ): - pass - - # set max_model_len > 2048 by hf_overrides - hf_overrides = {"max_model_len": 4096} - with pytest.raises(ValueError): - with vllm_runner( - model_info.name, - revision=model_info.revision, - runner="pooling", - max_model_len=None, - hf_overrides=hf_overrides, - ): - pass + with vllm_runner( + model_info.name, + revision=model_info.revision, + runner="pooling", + max_model_len=4096, + ) as vllm_model: + model_config = vllm_model.llm.llm_engine.model_config + assert model_config.max_model_len == 4096 @pytest.mark.parametrize("model_info", MODELS) @@ -124,45 +109,3 @@ def test_use_rope_scaling_legal(model_info, vllm_runner): hf_overrides=hf_overrides, ): pass - - -@pytest.mark.parametrize("model_info", MODELS) -def test_use_rope_scaling_illegal(model_info, vllm_runner): - hf_overrides: dict[str, Any] = { - "rope_parameters": { - "rope_theta": rope_theta, - "rope_type": "yarn", - "factor": factor, - "original_max_position_embeddings": original_max_position_embeddings, - }, - } - # illegal max_model_len - with pytest.raises(ValueError): - with vllm_runner( - model_info.name, - revision=model_info.revision, - runner="pooling", - max_model_len=max_model_len + 1, - hf_overrides=hf_overrides, - ): - pass - - hf_overrides = { - "rope_parameters": { - "rope_theta": rope_theta, - "rope_type": "yarn", - "factor": factor, - "original_max_position_embeddings": original_max_position_embeddings, - }, - "max_model_len": max_model_len + 1, - } - # illegal max_model_len by hf_overrides - with pytest.raises(ValueError): - with vllm_runner( - model_info.name, - revision=model_info.revision, - runner="pooling", - max_model_len=None, - hf_overrides=hf_overrides, - ): - pass diff --git a/vllm/model_executor/layers/rotary_embedding/__init__.py b/vllm/model_executor/layers/rotary_embedding/__init__.py index b955b37be603..0a69b3b3f1d3 100644 --- a/vllm/model_executor/layers/rotary_embedding/__init__.py +++ b/vllm/model_executor/layers/rotary_embedding/__init__.py @@ -211,10 +211,14 @@ def get_rope( ) elif "factor" in rope_parameters: scaling_factor = rope_parameters["factor"] + max_trained_positions = rope_parameters.get( + "max_trained_positions", max_position + ) rotary_emb = DynamicNTKScalingRotaryEmbedding( head_size, rotary_dim, max_position, + max_trained_positions, base, is_neox_style, scaling_factor, diff --git a/vllm/model_executor/layers/rotary_embedding/dynamic_ntk_scaling_rope.py b/vllm/model_executor/layers/rotary_embedding/dynamic_ntk_scaling_rope.py index 28fd87ecc21f..8a48be490b6f 100644 --- a/vllm/model_executor/layers/rotary_embedding/dynamic_ntk_scaling_rope.py +++ b/vllm/model_executor/layers/rotary_embedding/dynamic_ntk_scaling_rope.py @@ -38,12 +38,14 @@ def __init__( head_size: int, rotary_dim: int, max_position_embeddings: int, + max_trained_positions: int, base: float, is_neox_style: bool, scaling_factor: float, dtype: torch.dtype, ) -> None: self.scaling_factor = scaling_factor + self.max_trained_positions = max_trained_positions super().__init__( head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype ) @@ -53,13 +55,16 @@ def _compute_cos_sin_cache(self) -> torch.Tensor: # maximum length before applying the rope scaling. # Thus, the maximum length after applying the rope scaling is # self.max_position_embeddings * self.scaling_factor. - max_len = self.max_position_embeddings * self.scaling_factor base = self.base * ( - (self.scaling_factor * max_len / self.max_position_embeddings) + ( + self.scaling_factor + * self.max_position_embeddings + / self.max_trained_positions + ) - (self.scaling_factor - 1) ) ** (self.rotary_dim / (self.rotary_dim - 2)) inv_freq = self._compute_inv_freq(base) - t = torch.arange(max_len, dtype=torch.float) + t = torch.arange(self.max_position_embeddings, dtype=torch.float) freqs = torch.einsum("i,j -> ij", t, inv_freq) cos = freqs.cos() diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index 422acca642a9..133e1c19209b 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from copy import deepcopy from typing import TYPE_CHECKING from vllm.logger import init_logger @@ -473,80 +472,22 @@ def verify_and_update_model_config(model_config: "ModelConfig") -> None: ) head_dim = config.hidden_size // config.num_attention_heads - max_trained_positions = getattr(config, "max_trained_positions", 2048) + max_position_embeddings = getattr(config, "max_position_embeddings", 2048) + max_trained_positions = getattr( + config, "max_trained_positions", max_position_embeddings + ) + + rope_parameters = { + "max_trained_positions": max_trained_positions, + **(config.rope_parameters or {}), + } config.rotary_kwargs = { "head_size": head_dim, - "max_position": max_trained_positions, - "rope_parameters": config.rope_parameters, + "max_position": model_config.max_model_len, + "rope_parameters": rope_parameters, } - # we ignore config.rotary_scaling_factor so that for datasets shorter - # than max_trained_positions 2048, the results are consistent - # with SentenceTransformer. - # The context extension uses vllm style rope_theta and rope_parameters. - # See #17785 #18755 - if ( - not model_config.hf_overrides - and model_config.original_max_model_len is None - ): - # Default - # Reset max_model_len to max_trained_positions. - # nomic-embed-text-v2-moe the length is set to 512 - # by sentence_bert_config.json. - max_model_len_before = model_config.max_model_len - max_model_len = min(model_config.max_model_len, max_trained_positions) - - model_config.max_model_len = model_config.get_and_verify_max_len( - max_model_len - ) - - if model_config.max_model_len != max_model_len_before: - logger.warning( - "Nomic context extension is disabled. " - "Changing max_model_len from %s to %s. " - "To enable context extension, see: " - "https://github.com/vllm-project/vllm/tree/main/examples/features/context_extension/context_extension_offline.py", - max_model_len_before, - model_config.max_model_len, - ) - else: - # We need to re-verify max_model_len to avoid lengths - # greater than position_embedding. - hf_text_config = model_config.hf_text_config - - if isinstance(model_config.hf_overrides, dict): - # hf_overrides_kw - max_model_len = model_config.hf_overrides.get( - "max_model_len", model_config.max_model_len - ) - else: - # hf_overrides_fn - # This might be overridden by sentence_bert_config.json. - max_model_len = model_config.max_model_len - - # reset hf_text_config for recalculate_max_model_len. - if hasattr(hf_text_config, "max_model_len"): - delattr(hf_text_config, "max_model_len") - hf_text_config.max_position_embeddings = max_trained_positions - hf_text_config.rope_parameters = config.rotary_kwargs["rope_parameters"] - - # Update the cached derived_max_model_len to enforce the limit - model_config.model_arch_config.derived_max_model_len_and_key = ( - float(max_trained_positions), - "max_position_embeddings", - ) - - # The priority of sentence_bert_config.json is higher - # than max_position_embeddings - encoder_config = deepcopy(model_config.encoder_config) - encoder_config.pop("max_seq_length", None) - model_config.encoder_config = encoder_config - - model_config.max_model_len = model_config.get_and_verify_max_len( - max_model_len - ) - class Qwen2ForProcessRewardModelConfig(VerifyAndUpdateConfig): @staticmethod