Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion tests/entrypoints/offline_mode/test_offline_mode.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,6 @@ def _re_import_modules():
aliased_module_patterns = [
r".+\.tokenization_utils$",
r".+\.tokenization_utils_fast$",
r".+\.image_processing_utils_fast$",
r".+\.models\..+\.image_processing_.+_fast$",
]

Expand Down
8 changes: 0 additions & 8 deletions vllm/config/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -586,14 +586,6 @@ def __post_init__(
config_format=self.config_format,
)

# Some checkpoints set sliding_window to 0 to indicate that sliding window is
# disabled, but vLLM uses None for that. Convert 0 to None to avoid errors.
# Set before get_and_verify_max_len to ensure that max_model_len does not get
# capped to 0.
if self.get_sliding_window() == 0:
self.disable_sliding_window = True
self.hf_text_config.sliding_window = None

self.original_max_model_len = self.max_model_len
self.max_model_len = self.get_and_verify_max_len(self.max_model_len)
Comment on lines 589 to 590
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

This revert re-introduces an issue where sliding_window=0 is not correctly handled as 'disabled'. Some models use 0 to indicate that sliding window attention is disabled, but vLLM expects None. Without this conversion, a sliding_window of 0 might be passed to attention layers, which could lead to incorrect behavior or errors.

The original change seems correct and it's likely this was reverted as part of a larger revert. It should be re-introduced to prevent issues with models that use sliding_window: 0.

        # Some checkpoints set sliding_window to 0 to indicate that sliding window is
        # disabled, but vLLM uses None for that. Convert 0 to None to avoid errors.
        # Set before get_and_verify_max_len to ensure that max_model_len does not get
        # capped to 0.
        if self.get_sliding_window() == 0:
            self.disable_sliding_window = True
            self.hf_text_config.sliding_window = None

        self.original_max_model_len = self.max_model_len
        self.max_model_len = self.get_and_verify_max_len(self.max_model_len)


Expand Down
3 changes: 2 additions & 1 deletion vllm/model_executor/models/olmo2.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@

import torch
from torch import nn
from transformers import Olmo2Config, Olmo3Config
from transformers import Olmo2Config

from vllm.compilation.decorators import support_torch_compile
from vllm.config import VllmConfig
Expand Down Expand Up @@ -63,6 +63,7 @@
maybe_prefix,
)
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.configs.olmo3 import Olmo3Config


class Olmo2Attention(nn.Module):
Expand Down
1 change: 1 addition & 0 deletions vllm/transformers_utils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ def __getitem__(self, key):
eagle="EAGLEConfig",
speculators="SpeculatorsConfig",
nemotron="NemotronConfig",
olmo3="Olmo3Config",
olmo_hybrid="OlmoHybridConfig",
ovis="OvisConfig",
ultravox="UltravoxConfig",
Expand Down
2 changes: 2 additions & 0 deletions vllm/transformers_utils/configs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
"KimiK25Config": "vllm.transformers_utils.configs.kimi_k25",
"NemotronConfig": "vllm.transformers_utils.configs.nemotron",
"NemotronHConfig": "vllm.transformers_utils.configs.nemotron_h",
"Olmo3Config": "vllm.transformers_utils.configs.olmo3",
"OlmoHybridConfig": "vllm.transformers_utils.configs.olmo_hybrid",
"OvisConfig": "vllm.transformers_utils.configs.ovis",
"PixelShuffleSiglip2VisionConfig": "vllm.transformers_utils.configs.isaac",
Expand Down Expand Up @@ -105,6 +106,7 @@
"KimiK25Config",
"NemotronConfig",
"NemotronHConfig",
"Olmo3Config",
"OlmoHybridConfig",
"OvisConfig",
"PixelShuffleSiglip2VisionConfig",
Expand Down
3 changes: 0 additions & 3 deletions vllm/transformers_utils/configs/deepseek_vl2.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,9 +114,6 @@ def __init__(
self.projector_config = MlpProjectorConfig(**projector_config)

language_config = kwargs.get("language_config", {})
# remove kv_lora_rank if not specified, passing None is prohibited
if language_config.get("kv_lora_rank") is None:
language_config.pop("kv_lora_rank", None)
self.text_config = DeepseekV2Config(**language_config)
Comment on lines 116 to 117
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

This revert re-introduces a potential bug. If language_config contains kv_lora_rank: None, it will be passed to DeepseekV2Config, which can cause a TypeError later on when kv_lora_rank is used in arithmetic operations (e.g., in DeepseekV2Attention).

The original change attempted to fix this by removing the key, but that seems to have caused other issues, likely due to in-place modification of the language_config dictionary. A safer fix that avoids this side effect is to create a copy before modification.

Consider this alternative which should be safer and prevent both the TypeError and the CI failures seen in the original PR:

Suggested change
language_config = kwargs.get("language_config", {})
# remove kv_lora_rank if not specified, passing None is prohibited
if language_config.get("kv_lora_rank") is None:
language_config.pop("kv_lora_rank", None)
self.text_config = DeepseekV2Config(**language_config)
language_config = kwargs.get("language_config", {}).copy()
if language_config.get("kv_lora_rank") is None:
language_config.pop("kv_lora_rank", None)
self.text_config = DeepseekV2Config(**language_config)


self.tile_tag = tile_tag
Expand Down
83 changes: 83 additions & 0 deletions vllm/transformers_utils/configs/olmo3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from transformers.configuration_utils import PretrainedConfig


class Olmo3Config(PretrainedConfig):
    """Configuration class for OLMo 3 models.

    Backport-style config so vLLM can load OLMo 3 checkpoints: it mirrors the
    upstream transformers Olmo3 configuration but remaps the architecture to
    ``Olmo2ForCausalLM``, which is the implementation vLLM uses for this model
    family. Defaults below match the values hard-coded in this class; they are
    presumably the OLMo family conventions — confirm against upstream.
    """

    # Registered model type key used by the config auto-mapping.
    model_type = "olmo3"
    # Output keys dropped when returning inference results.
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=50304,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        use_cache=True,
        pad_token_id=1,
        bos_token_id=None,
        eos_token_id=50279,
        tie_word_embeddings=False,
        rope_parameters=None,
        attention_bias=False,
        attention_dropout=0.0,
        rms_norm_eps=1e-5,
        sliding_window=4096,
        layer_types=None,
        **kwargs,
    ):
        # This model uses Olmo3ForCausalLM in transformers but Olmo2ForCausalLM
        # in vLLM. Rewrite the architectures list *before* the super().__init__
        # call below consumes kwargs, so the remapped value is what gets
        # recorded on the config. Note the list is mutated in place.
        if "architectures" not in kwargs:
            kwargs["architectures"] = ["Olmo2ForCausalLM"]
        elif "Olmo3ForCausalLM" in kwargs["architectures"]:
            kwargs["architectures"].remove("Olmo3ForCausalLM")
            kwargs["architectures"].append("Olmo2ForCausalLM")

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # for backward compatibility: older checkpoints omit
        # num_key_value_heads, which means MHA (one KV head per query head).
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.use_cache = use_cache
        # Try to set `rope_scaling` if available, otherwise use `rope_parameters`.
        # Legacy checkpoints pass `rope_scaling`/`rope_theta` as extra kwargs;
        # fold them into the unified `rope_parameters` dict. The pops happen
        # after super().__init__ — NOTE(review): the base class may already
        # have stored these leftover kwargs as attributes; confirm upstream.
        rope_scaling = kwargs.pop("rope_scaling", None)
        rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"}
        rope_theta = kwargs.pop("rope_theta", 10000.0)
        if "rope_theta" not in rope_parameters:
            rope_parameters["rope_theta"] = rope_theta
        self.rope_parameters = rope_parameters
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout

        self.rms_norm_eps = rms_norm_eps

        self.sliding_window = sliding_window
        self.layer_types = layer_types
        # Default attention pattern: every 4th layer (1-indexed) uses full
        # attention, the rest use sliding-window attention.
        if self.layer_types is None:
            self.layer_types = [
                "sliding_attention" if (i + 1) % 4 != 0 else "full_attention"
                for i in range(self.num_hidden_layers)
            ]
Loading