Skip to content
Merged
101 changes: 22 additions & 79 deletions tests/models/language/pooling/test_nomic_max_model_len.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: SIM117
from typing import Any

import pytest

Expand Down Expand Up @@ -40,8 +39,8 @@ def test_default(model_info, vllm_runner):
# For nomic-embed-text-v2-moe the length is set to 512
# by sentence_bert_config.json.
assert model_config.max_model_len == 512
else:
assert model_config.max_model_len == original_max_position_embeddings
if model_info.name == "nomic-ai/nomic-embed-text-v1":
assert model_config.max_model_len == 8192


@pytest.mark.parametrize("model_info", MODELS)
Expand All @@ -56,10 +55,9 @@ def test_set_max_model_len_legal(model_info, vllm_runner):
model_config = vllm_model.llm.llm_engine.model_config
assert model_config.max_model_len == 256

# set 512 < max_model_len <= 2048
# For nomic-embed-text-v2-moe the length is set to 512
# by sentence_bert_config.json.
if model_info.name == "nomic-ai/nomic-embed-text-v2-moe":
# For nomic-embed-text-v2-moe the length is set to 512
# by sentence_bert_config.json.
with pytest.raises(ValueError):
with vllm_runner(
model_info.name,
Expand All @@ -68,40 +66,27 @@ def test_set_max_model_len_legal(model_info, vllm_runner):
max_model_len=1024,
):
pass
else:
with vllm_runner(
model_info.name,
revision=model_info.revision,
runner="pooling",
max_model_len=1024,
) as vllm_model:
model_config = vllm_model.llm.llm_engine.model_config
assert model_config.max_model_len == 1024
return

# set 512 < max_model_len <= 2048
with vllm_runner(
model_info.name,
revision=model_info.revision,
runner="pooling",
max_model_len=1024,
) as vllm_model:
model_config = vllm_model.llm.llm_engine.model_config
assert model_config.max_model_len == 1024

@pytest.mark.parametrize("model_info", MODELS)
def test_set_max_model_len_illegal(model_info, vllm_runner):
# set max_model_len > 2048
with pytest.raises(ValueError):
with vllm_runner(
model_info.name,
revision=model_info.revision,
runner="pooling",
max_model_len=4096,
):
pass

# set max_model_len > 2048 by hf_overrides
hf_overrides = {"max_model_len": 4096}
with pytest.raises(ValueError):
with vllm_runner(
model_info.name,
revision=model_info.revision,
runner="pooling",
max_model_len=None,
hf_overrides=hf_overrides,
):
pass
with vllm_runner(
model_info.name,
revision=model_info.revision,
runner="pooling",
max_model_len=4096,
) as vllm_model:
model_config = vllm_model.llm.llm_engine.model_config
assert model_config.max_model_len == 4096


@pytest.mark.parametrize("model_info", MODELS)
Expand All @@ -124,45 +109,3 @@ def test_use_rope_scaling_legal(model_info, vllm_runner):
hf_overrides=hf_overrides,
):
pass


@pytest.mark.parametrize("model_info", MODELS)
def test_use_rope_scaling_illegal(model_info, vllm_runner):
hf_overrides: dict[str, Any] = {
"rope_parameters": {
"rope_theta": rope_theta,
"rope_type": "yarn",
"factor": factor,
"original_max_position_embeddings": original_max_position_embeddings,
},
}
# illegal max_model_len
with pytest.raises(ValueError):
with vllm_runner(
model_info.name,
revision=model_info.revision,
runner="pooling",
max_model_len=max_model_len + 1,
hf_overrides=hf_overrides,
):
pass

hf_overrides = {
"rope_parameters": {
"rope_theta": rope_theta,
"rope_type": "yarn",
"factor": factor,
"original_max_position_embeddings": original_max_position_embeddings,
},
"max_model_len": max_model_len + 1,
}
# illegal max_model_len by hf_overrides
with pytest.raises(ValueError):
with vllm_runner(
model_info.name,
revision=model_info.revision,
runner="pooling",
max_model_len=None,
hf_overrides=hf_overrides,
):
pass
4 changes: 4 additions & 0 deletions vllm/model_executor/layers/rotary_embedding/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,10 +211,14 @@ def get_rope(
)
elif "factor" in rope_parameters:
scaling_factor = rope_parameters["factor"]
max_trained_positions = rope_parameters.get(
"max_trained_positions", max_position
)
Comment thread
maxdebayser marked this conversation as resolved.
rotary_emb = DynamicNTKScalingRotaryEmbedding(
head_size,
rotary_dim,
max_position,
max_trained_positions,
base,
is_neox_style,
scaling_factor,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,14 @@ def __init__(
head_size: int,
rotary_dim: int,
max_position_embeddings: int,
max_trained_positions: int,
base: float,
is_neox_style: bool,
scaling_factor: float,
dtype: torch.dtype,
) -> None:
self.scaling_factor = scaling_factor
self.max_trained_positions = max_trained_positions
super().__init__(
head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
)
Expand All @@ -53,13 +55,16 @@ def _compute_cos_sin_cache(self) -> torch.Tensor:
# maximum length before applying the rope scaling.
# Thus, the maximum length after applying the rope scaling is
# self.max_position_embeddings * self.scaling_factor.
max_len = self.max_position_embeddings * self.scaling_factor
base = self.base * (
(self.scaling_factor * max_len / self.max_position_embeddings)
(
self.scaling_factor
* self.max_position_embeddings
/ self.max_trained_positions
)
- (self.scaling_factor - 1)
) ** (self.rotary_dim / (self.rotary_dim - 2))
inv_freq = self._compute_inv_freq(base)
t = torch.arange(max_len, dtype=torch.float)
t = torch.arange(self.max_position_embeddings, dtype=torch.float)

Comment on lines 57 to 68
Copy link
Copy Markdown
Collaborator

@noooop noooop Apr 30, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

cc @tjtanaa @WoosukKwon @mgoin @Isotr0py

Could you please help double-check this?

Copy link
Copy Markdown
Collaborator

@noooop noooop May 12, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

cc @vadiklyutiy

Could you please help double-check this?

The CODEOWNERS entry for rotary_embedding has not been updated:

/vllm/model_executor/layers/rotary_embedding.py @vadiklyutiy

Copy link
Copy Markdown
Contributor

@ieBoytsov ieBoytsov May 17, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@noooop the fix looks good to me. Nice to see the example above comparing the embeddings from sentence-transformers and vLLM. @maxdebayser @noooop maybe it makes sense to add it as a test? As far as I can see, the current max-len test only checks for the correct config. For example, for ColBERT models we added tests that check that the HF embeddings are equal to the vLLM embeddings.

UPD: I see that a test for embeddings comparison does exist. I haven't yet checked whether it covers the extended context.

Copy link
Copy Markdown
Collaborator

@noooop noooop May 18, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hello @mgoin

image

These code changes date back to September 2023.

Can we safely land this PR, or should we have more people double-check it?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this change makes a lot of sense, and I appreciate Max's justification. I'm good with landing it.

freqs = torch.einsum("i,j -> ij", t, inv_freq)
cos = freqs.cos()
Expand Down
81 changes: 11 additions & 70 deletions vllm/model_executor/models/config.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from copy import deepcopy
from typing import TYPE_CHECKING

from vllm.logger import init_logger
Expand Down Expand Up @@ -473,80 +472,22 @@ def verify_and_update_model_config(model_config: "ModelConfig") -> None:
)

head_dim = config.hidden_size // config.num_attention_heads
max_trained_positions = getattr(config, "max_trained_positions", 2048)
max_position_embeddings = getattr(config, "max_position_embeddings", 2048)
max_trained_positions = getattr(
config, "max_trained_positions", max_position_embeddings
)

rope_parameters = {
"max_trained_positions": max_trained_positions,
**(config.rope_parameters or {}),
}
Comment thread
maxdebayser marked this conversation as resolved.

config.rotary_kwargs = {
"head_size": head_dim,
"max_position": max_trained_positions,
"rope_parameters": config.rope_parameters,
"max_position": model_config.max_model_len,
"rope_parameters": rope_parameters,
}

# we ignore config.rotary_scaling_factor so that for datasets shorter
# than max_trained_positions 2048, the results are consistent
# with SentenceTransformer.
# The context extension uses vllm style rope_theta and rope_parameters.
Comment thread
noooop marked this conversation as resolved.
# See #17785 #18755
if (
not model_config.hf_overrides
and model_config.original_max_model_len is None
):
# Default
# Reset max_model_len to max_trained_positions.
# nomic-embed-text-v2-moe the length is set to 512
# by sentence_bert_config.json.
max_model_len_before = model_config.max_model_len
max_model_len = min(model_config.max_model_len, max_trained_positions)

model_config.max_model_len = model_config.get_and_verify_max_len(
max_model_len
)

if model_config.max_model_len != max_model_len_before:
logger.warning(
"Nomic context extension is disabled. "
"Changing max_model_len from %s to %s. "
"To enable context extension, see: "
"https://github.com/vllm-project/vllm/tree/main/examples/features/context_extension/context_extension_offline.py",
max_model_len_before,
model_config.max_model_len,
)
else:
# We need to re-verify max_model_len to avoid lengths
# greater than position_embedding.
hf_text_config = model_config.hf_text_config

if isinstance(model_config.hf_overrides, dict):
# hf_overrides_kw
max_model_len = model_config.hf_overrides.get(
"max_model_len", model_config.max_model_len
)
else:
# hf_overrides_fn
# This might be overridden by sentence_bert_config.json.
max_model_len = model_config.max_model_len

# reset hf_text_config for recalculate_max_model_len.
if hasattr(hf_text_config, "max_model_len"):
delattr(hf_text_config, "max_model_len")
hf_text_config.max_position_embeddings = max_trained_positions
hf_text_config.rope_parameters = config.rotary_kwargs["rope_parameters"]

# Update the cached derived_max_model_len to enforce the limit
model_config.model_arch_config.derived_max_model_len_and_key = (
float(max_trained_positions),
"max_position_embeddings",
)

# The priority of sentence_bert_config.json is higher
# than max_position_embeddings
encoder_config = deepcopy(model_config.encoder_config)
encoder_config.pop("max_seq_length", None)
model_config.encoder_config = encoder_config

model_config.max_model_len = model_config.get_and_verify_max_len(
max_model_len
)


class Qwen2ForProcessRewardModelConfig(VerifyAndUpdateConfig):
@staticmethod
Expand Down
Loading