diff --git a/python/sglang/multimodal_gen/runtime/loader/weight_utils.py b/python/sglang/multimodal_gen/runtime/loader/weight_utils.py index 89d22b31e84a..d2fa290210b7 100644 --- a/python/sglang/multimodal_gen/runtime/loader/weight_utils.py +++ b/python/sglang/multimodal_gen/runtime/loader/weight_utils.py @@ -11,7 +11,6 @@ from pathlib import Path import filelock -import huggingface_hub.constants import torch from safetensors.torch import safe_open from tqdm.auto import tqdm @@ -35,21 +34,6 @@ temp_dir = tempfile.gettempdir() -def enable_hf_transfer() -> None: - """automatically activates hf_transfer""" - if "HF_HUB_ENABLE_HF_TRANSFER" not in os.environ: - try: - # enable hf hub transfer if available - import hf_transfer # type: ignore # noqa - - huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER = True - except ImportError: - pass - - -enable_hf_transfer() - - class DisabledTqdm(tqdm): def __init__(self, *args, **kwargs): diff --git a/python/sglang/multimodal_gen/runtime/models/encoders/llama.py b/python/sglang/multimodal_gen/runtime/models/encoders/llama.py index ea208f1242f4..b7e0d513211d 100644 --- a/python/sglang/multimodal_gen/runtime/models/encoders/llama.py +++ b/python/sglang/multimodal_gen/runtime/models/encoders/llama.py @@ -226,8 +226,8 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) + rope_theta = config.rope_parameters.get("rope_theta", 10000) + rope_scaling = config.rope_parameters.get("rope_scaling") if rope_scaling is not None and getattr( config, "original_max_position_embeddings", None ): diff --git a/python/sglang/multimodal_gen/runtime/models/encoders/qwen3.py b/python/sglang/multimodal_gen/runtime/models/encoders/qwen3.py index b8132e4041c1..4d7bedf7a02e 100644 --- a/python/sglang/multimodal_gen/runtime/models/encoders/qwen3.py +++ b/python/sglang/multimodal_gen/runtime/models/encoders/qwen3.py @@ -204,8 +204,8 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 1000000.0) - rope_scaling = getattr(config, "rope_scaling", None) + rope_theta = config.rope_parameters.get("rope_theta", 1000000.0) + rope_scaling = config.rope_parameters.get("rope_scaling") max_position_embeddings = getattr(config, "max_position_embeddings", 40960) attention_bias = getattr(config, "attention_bias", False) diff --git a/python/sglang/srt/configs/__init__.py b/python/sglang/srt/configs/__init__.py index 965b4e3052d0..28e14048be85 100644 --- a/python/sglang/srt/configs/__init__.py +++ b/python/sglang/srt/configs/__init__.py @@ -16,6 +16,7 @@ from sglang.srt.configs.kimi_vl_moonvit import MoonViTConfig from sglang.srt.configs.lfm2 import Lfm2Config from sglang.srt.configs.lfm2_moe import Lfm2MoeConfig +from sglang.srt.configs.lfm2_vl import Lfm2VlConfig from sglang.srt.configs.longcat_flash import LongcatFlashConfig from sglang.srt.configs.nano_nemotron_vl import NemotronH_Nano_VL_V2_Config from sglang.srt.configs.nemotron_h import NemotronHConfig @@ -54,6 +55,7 @@ "FalconH1Config", "Lfm2Config", "Lfm2MoeConfig", + "Lfm2VlConfig", "NemotronHConfig", "NemotronH_Nano_VL_V2_Config", "JetNemotronConfig", diff --git a/python/sglang/srt/configs/lfm2_vl.py b/python/sglang/srt/configs/lfm2_vl.py new file mode 100644 index 000000000000..13ea2e9fc617 --- /dev/null +++ b/python/sglang/srt/configs/lfm2_vl.py @@ -0,0 +1,103 @@ +# Copyright 2025 the HuggingFace Inc. team. 
All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""LFM2-VL model configuration."""
+
+from transformers.configuration_utils import PreTrainedConfig
+# TODO: replace this with the sglang logger?
+import logging
+from transformers import CONFIG_MAPPING, AutoConfig
+
+
+logger = logging.getLogger(__name__)
+
+
+class Lfm2VlConfig(PreTrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Lfm2VlForConditionalGeneration`]. It is used to instantiate an
+    Lfm2Vl model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the Lfm2-VL-1.6B.
+
+    e.g. [LiquidAI/LFM2-VL-1.6B](https://huggingface.co/LiquidAI/LFM2-VL-1.6B)
+
+    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PreTrainedConfig`] for more information.
+
+    Args:
+        vision_config (`AutoConfig | dict`, *optional*, defaults to `Siglip2ImageConfig`):
+            The config object or dictionary of the vision backbone.
+        text_config (`AutoConfig | dict`, *optional*, defaults to `Lfm2Config`):
+            The config object or dictionary of the text backbone.
+        image_token_id (`int`, *optional*, defaults to 396):
+            The image token index to encode the image prompt.
+        projector_hidden_act (`str`, *optional*, defaults to `"gelu"`):
+            The activation function used by the multimodal projector.
+        projector_hidden_size (`int`, *optional*, defaults to 2560):
+            The hidden size of the multimodal projector.
+        projector_bias (`bool`, *optional*, defaults to `True`):
+            Whether to use bias in the multimodal projector.
+        projector_use_layernorm (`bool`, *optional*, defaults to `True`):
+            Whether to use layernorm in the multimodal projector.
+        downsample_factor (`int`, *optional*, defaults to 2):
+            The downsample factor of the vision backbone.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie the word embeddings of the text backbone.
+ """ + + model_type = "lfm2_vl" + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} + + def __init__( + self, + vision_config=None, + text_config=None, + image_token_id=396, + projector_hidden_act="gelu", + projector_hidden_size=2560, + projector_bias=True, + projector_use_layernorm=True, + downsample_factor=2, + tie_word_embeddings=True, + **kwargs, + ): + self.image_token_id = image_token_id + self.projector_hidden_act = projector_hidden_act + self.projector_hidden_size = projector_hidden_size + self.projector_bias = projector_bias + self.projector_use_layernorm = projector_use_layernorm + self.downsample_factor = downsample_factor + + if isinstance(vision_config, dict): + vision_config["model_type"] = vision_config.get("model_type", "siglip2_vision_model") + vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) + elif vision_config is None: + vision_config = CONFIG_MAPPING["siglip2_vision_model"]() + + if isinstance(text_config, dict): + text_config["model_type"] = text_config.get("model_type", "lfm2") + text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) + elif text_config is None: + text_config = CONFIG_MAPPING["lfm2"]() + + self.vision_config = vision_config + self.text_config = text_config + self.tie_word_embeddings = getattr(text_config, "tie_embedding", tie_word_embeddings) + + super().__init__(**kwargs) + +# Override HuggingFace's Lfm2VlConfig with our version +# Cannot use .register() because lfm2_vl may already be registered by transformers +# Directly modify the internal _extra_content dict instead +CONFIG_MAPPING._extra_content["lfm2_vl"] = Lfm2VlConfig + +__all__ = ["Lfm2VlConfig"] diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py index 701efb326d4e..20aac8371c36 100644 --- a/python/sglang/srt/configs/model_config.py +++ b/python/sglang/srt/configs/model_config.py @@ -1009,7 +1009,8 @@ def _verify_transformers_version(self): # The vision config model type for GLM-4.5v is 'glm4v_moe', # while for GLM-4.6v, it is 'glm4v_moe_vision'. 
) - needs_tf_v5 = is_glm_46vmoe + is_lfm2_vl = getattr(self.hf_config, "model_type", None) == "lfm2_vl" + needs_tf_v5 = is_glm_46vmoe or is_lfm2_vl tf_version = version.parse(tf_version_str) required_version = version.parse("5.0.0dev0") @@ -1231,6 +1232,7 @@ def is_generation_model(model_architectures: List[str], is_embedding: bool = Fal "Mistral3ForConditionalGeneration", "MultiModalityCausalLM", "MllamaForConditionalGeneration", + "Lfm2VlForConditionalGeneration", "NemotronH_Nano_VL_V2", "PixtralForConditionalGeneration", "Qwen2AudioForConditionalGeneration", diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index 0d6d64986dba..09efc5c91b19 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -38,6 +38,7 @@ KimiLinearConfig, Lfm2Config, Lfm2MoeConfig, + Lfm2VlConfig, NemotronH_Nano_VL_V2_Config, NemotronHConfig, Qwen3_5Config, @@ -1594,6 +1595,8 @@ def mamba2_config(self): return config if isinstance(config, NemotronH_Nano_VL_V2_Config): return config.llm_config + if isinstance(config, Lfm2VlConfig): + return config.text_config return None @property diff --git a/python/sglang/srt/model_loader/weight_utils.py b/python/sglang/srt/model_loader/weight_utils.py index dac21905886a..de7e320d34c9 100644 --- a/python/sglang/srt/model_loader/weight_utils.py +++ b/python/sglang/srt/model_loader/weight_utils.py @@ -67,21 +67,6 @@ logger = logging.getLogger(__name__) -def enable_hf_transfer(): - """automatically activates hf_transfer""" - if "HF_HUB_ENABLE_HF_TRANSFER" not in os.environ: - try: - # enable hf hub transfer if available - import hf_transfer # type: ignore # noqa - - huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER = True - except ImportError: - pass - - -enable_hf_transfer() - - # use system-level temp directory for file locks, so that multiple users # can share the same lock without error. 
# lock files in the temp directory will be automatically deleted when the diff --git a/python/sglang/srt/models/afmoe.py b/python/sglang/srt/models/afmoe.py index 92a11b09af03..c70289819483 100644 --- a/python/sglang/srt/models/afmoe.py +++ b/python/sglang/srt/models/afmoe.py @@ -29,7 +29,6 @@ import torch import torch.nn.functional as F from torch import nn -from transformers import PretrainedConfig from sglang.srt.distributed import ( get_tensor_model_parallel_rank, @@ -59,6 +58,7 @@ from sglang.srt.model_executor.forward_batch_info import ForwardBatch from sglang.srt.model_loader.weight_utils import default_weight_loader from sglang.srt.utils import add_prefix +from transformers import PretrainedConfig def get_attention_sliding_window_size(config: PretrainedConfig) -> Optional[int]: @@ -314,8 +314,8 @@ def __init__( self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) + rope_theta = config.rope_parameters.get("rope_theta", 10000) + rope_scaling = config.rope_parameters.get("rope_scaling") partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) self.rotary_dim = int(self.head_dim * partial_rotary_factor) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) diff --git a/python/sglang/srt/models/apertus.py b/python/sglang/srt/models/apertus.py index ca84264b9362..b17dee2718e7 100644 --- a/python/sglang/srt/models/apertus.py +++ b/python/sglang/srt/models/apertus.py @@ -22,7 +22,6 @@ import torch from torch import nn -from transformers import ApertusConfig from sglang.srt.distributed import ( get_pp_group, @@ -54,6 +53,7 @@ ) from sglang.srt.server_args import get_global_server_args from sglang.srt.utils import add_prefix, make_layers +from transformers import ApertusConfig logger = logging.getLogger(__name__) @@ -217,8 +217,8 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) + rope_theta = config.rope_parameters.get("rope_theta", 10000) + rope_scaling = config.rope_parameters.get("rope_scaling") if rope_scaling is not None and getattr( config, "original_max_position_embeddings", None ): diff --git a/python/sglang/srt/models/arcee.py b/python/sglang/srt/models/arcee.py index 5afd5f34f5dd..8d07db7b8e1f 100644 --- a/python/sglang/srt/models/arcee.py +++ b/python/sglang/srt/models/arcee.py @@ -18,7 +18,6 @@ import torch from torch import nn -from transformers import LlamaConfig from sglang.srt.distributed import ( get_pp_group, @@ -50,6 +49,7 @@ ) from sglang.srt.server_args import get_global_server_args from sglang.srt.utils import add_prefix, make_layers +from transformers import LlamaConfig logger = logging.getLogger(__name__) @@ -199,8 +199,8 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) + rope_theta = config.rope_parameters.get("rope_theta", 10000) + rope_scaling = config.rope_parameters.get("rope_scaling") if rope_scaling is not None and getattr( config, "original_max_position_embeddings", None ): diff --git a/python/sglang/srt/models/baichuan.py b/python/sglang/srt/models/baichuan.py index 84596ba1f207..398676d1e557 100644 --- a/python/sglang/srt/models/baichuan.py +++ b/python/sglang/srt/models/baichuan.py @@ -23,7 +23,6 @@ import 
torch from torch import nn -from transformers import PretrainedConfig from sglang.srt.distributed import ( get_tensor_model_parallel_rank, @@ -47,6 +46,7 @@ from sglang.srt.model_executor.forward_batch_info import ForwardBatch from sglang.srt.model_loader.weight_utils import default_weight_loader from sglang.srt.utils import add_prefix, is_npu +from transformers import PretrainedConfig _is_npu = is_npu() @@ -228,7 +228,7 @@ def __init__( ): super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) + rope_theta = config.rope_parameters.get("rope_theta", 10000) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.self_attn = BaiChuanAttention( hidden_size=self.hidden_size, diff --git a/python/sglang/srt/models/bailing_moe.py b/python/sglang/srt/models/bailing_moe.py index a04e3d64792c..7f366ee19369 100644 --- a/python/sglang/srt/models/bailing_moe.py +++ b/python/sglang/srt/models/bailing_moe.py @@ -24,7 +24,6 @@ import torch import torch.nn.functional as F from torch import nn -from transformers import PretrainedConfig from sglang.srt.distributed import ( get_pp_group, @@ -82,6 +81,7 @@ ) from sglang.srt.server_args import get_global_server_args from sglang.srt.utils import add_prefix, is_cuda, is_non_idle_and_non_empty, make_layers +from transformers import PretrainedConfig LoraConfig = None logger = logging.getLogger(__name__) @@ -497,8 +497,8 @@ def __init__( self.head_dim, rotary_dim=self.rotary_dim, max_position=config.max_position_embeddings, - base=config.rope_theta, - rope_scaling=config.rope_scaling, + base=config.rope_parameters.get("rope_theta", 10000), + rope_scaling=config.rope_parameters.get("rope_scaling"), ) self.attn = RadixAttention( diff --git a/python/sglang/srt/models/commandr.py b/python/sglang/srt/models/commandr.py index 7c799f5f8400..cd81f955a780 100644 --- a/python/sglang/srt/models/commandr.py +++ b/python/sglang/srt/models/commandr.py @@ -43,7 +43,6 @@ import torch.utils.checkpoint from torch import nn from torch.nn.parameter import Parameter -from transformers import Cohere2Config, CohereConfig, PretrainedConfig from sglang.srt.distributed import ( get_tensor_model_parallel_rank, @@ -66,6 +65,7 @@ maybe_remap_kv_scale_name, ) from sglang.srt.utils import add_prefix, get_compiler_backend, set_weight_attrs +from transformers import Cohere2Config, CohereConfig, PretrainedConfig @torch.compile(backend=get_compiler_backend()) @@ -171,8 +171,8 @@ def __init__( self.max_position_embeddings = getattr( config, "model_max_length", None ) or getattr(config, "max_position_embeddings", 8192) - self.rope_theta = config.rope_theta - self.rope_scaling = getattr(config, "rope_scaling", None) + self.rope_theta = config.rope_parameters.get("rope_theta", 10000) + self.rope_scaling = config.rope_parameters.get("rope_scaling") self.use_qk_norm = getattr(config, "use_qk_norm", False) self.qkv_proj = QKVParallelLinear( self.hidden_size, diff --git a/python/sglang/srt/models/dbrx.py b/python/sglang/srt/models/dbrx.py index 74de384b3395..db350ba4c391 100644 --- a/python/sglang/srt/models/dbrx.py +++ b/python/sglang/srt/models/dbrx.py @@ -205,7 +205,7 @@ def __init__( self.head_dim = self.d_model // self.total_num_heads self.total_num_kv_heads = config.attn_config.kv_n_heads self.clip_qkv = config.attn_config.clip_qkv - self.rope_theta = config.attn_config.rope_theta + self.rope_theta = config.attn_config.rope_parameters.get("rope_theta", 10000) self.max_position = config.max_seq_len # pylint: 
disable=invalid-name diff --git a/python/sglang/srt/models/deepseek.py b/python/sglang/srt/models/deepseek.py index ef431e00d460..f97273d0b1a5 100644 --- a/python/sglang/srt/models/deepseek.py +++ b/python/sglang/srt/models/deepseek.py @@ -20,7 +20,6 @@ import torch from torch import nn -from transformers import PretrainedConfig from sglang.srt.distributed import ( get_tensor_model_parallel_rank, @@ -49,6 +48,7 @@ from sglang.srt.model_executor.forward_batch_info import ForwardBatch from sglang.srt.model_loader.weight_utils import default_weight_loader from sglang.srt.utils import add_prefix +from transformers import PretrainedConfig class DeepseekMLP(nn.Module): @@ -288,8 +288,8 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) + rope_theta = config.rope_parameters.get("rope_theta", 10000) + rope_scaling = config.rope_parameters.get("rope_scaling") max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.self_attn = DeepseekAttention( hidden_size=self.hidden_size, diff --git a/python/sglang/srt/models/deepseek_v2.py b/python/sglang/srt/models/deepseek_v2.py index 4b5eeb9517cd..690b748a1772 100644 --- a/python/sglang/srt/models/deepseek_v2.py +++ b/python/sglang/srt/models/deepseek_v2.py @@ -25,7 +25,6 @@ import torch import torch.nn.functional as F from torch import nn -from transformers import PretrainedConfig from sglang.srt.batch_overlap.single_batch_overlap import SboFlags, compute_overlap_args from sglang.srt.batch_overlap.two_batch_overlap import ( @@ -156,6 +155,7 @@ make_layers, use_intel_amx_backend, ) +from transformers import PretrainedConfig if _use_aiter_gfx95: diff --git a/python/sglang/srt/models/ernie4.py b/python/sglang/srt/models/ernie4.py index dffd8f09a8bd..2d0d925076af 100644 --- a/python/sglang/srt/models/ernie4.py +++ b/python/sglang/srt/models/ernie4.py @@ -19,9 +19,6 @@ import torch import torch.nn.functional as F from torch import nn -from transformers.models.ernie4_5_moe.configuration_ernie4_5_moe import ( - Ernie4_5_MoeConfig, -) from sglang.srt.distributed import ( get_tensor_model_parallel_world_size, @@ -43,6 +40,9 @@ from sglang.srt.models.deepseek_v2 import DeepseekV2MLP as Ernie4MLP from sglang.srt.models.llama import LlamaAttention as Ernie4Attention from sglang.srt.utils import add_prefix, make_layers +from transformers.models.ernie4_5_moe.configuration_ernie4_5_moe import ( + Ernie4_5_MoeConfig, +) class MoEGate(nn.Module): @@ -155,8 +155,8 @@ def __init__( is_mtp: bool = False, ): super().__init__() - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) + rope_theta = config.rope_parameters.get("rope_theta", 10000) + rope_scaling = config.rope_parameters.get("rope_scaling") rope_is_neox_style = getattr(config, "rope_is_neox_style", False) # Self attention. 
self.self_attn = Ernie4Attention( diff --git a/python/sglang/srt/models/ernie45_moe_vl.py b/python/sglang/srt/models/ernie45_moe_vl.py index 3fe0fc6a77e5..5791f85b2a01 100644 --- a/python/sglang/srt/models/ernie45_moe_vl.py +++ b/python/sglang/srt/models/ernie45_moe_vl.py @@ -20,7 +20,6 @@ import torch from torch import nn -from transformers import PretrainedConfig from sglang.srt.distributed import ( get_pp_group, @@ -44,6 +43,7 @@ from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors from sglang.srt.models.deepseek_v2 import DeepseekV2MLP as Ernie4_5_VLMoeMLP from sglang.srt.utils import add_prefix, make_layers +from transformers import PretrainedConfig logger = logging.getLogger(__name__) @@ -368,8 +368,8 @@ def __init__( prefix: str = "", ): super().__init__() - rope_theta = getattr(config, "rope_theta", 500000) - rope_scaling = getattr(config, "rope_scaling", None) + rope_theta = config.rope_parameters.get("rope_theta", 500000) + rope_scaling = config.rope_parameters.get("rope_scaling") rope_is_neox_style = getattr(config, "rope_is_neox_style", False) freq_allocation = getattr(config, "freq_allocation", 20) max_position_embeddings = getattr(config, "max_position_embeddings", 131072) diff --git a/python/sglang/srt/models/exaone.py b/python/sglang/srt/models/exaone.py index 1e4dfb3df217..cff647f6a96b 100644 --- a/python/sglang/srt/models/exaone.py +++ b/python/sglang/srt/models/exaone.py @@ -182,8 +182,8 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 500000) - rope_scaling = getattr(config, "rope_scaling", None) + rope_theta = config.rope_parameters.get("rope_theta", 500000) + rope_scaling = config.rope_parameters.get("rope_scaling") if rope_scaling is not None and getattr( config, "original_max_position_embeddings", None ): diff --git a/python/sglang/srt/models/falcon_h1.py b/python/sglang/srt/models/falcon_h1.py index 628f99c6e46e..0643ca822877 100644 --- a/python/sglang/srt/models/falcon_h1.py +++ b/python/sglang/srt/models/falcon_h1.py @@ -133,9 +133,9 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = getattr(config, "rope_theta", 10000) + self.rope_theta = config.rope_parameters.get("rope_theta", 10000) self.max_position_embeddings = getattr(config, "max_position_embeddings", 8192) - self.rope_scaling = getattr(config, "rope_scaling", None) + self.rope_scaling = config.rope_parameters.get("rope_scaling") self.partial_rotary_factor = getattr(config, "partial_rotary_factor", 1) self.layer_id = layer_id diff --git a/python/sglang/srt/models/gemma.py b/python/sglang/srt/models/gemma.py index 1ecb5011f71c..bddbcbef3ba3 100644 --- a/python/sglang/srt/models/gemma.py +++ b/python/sglang/srt/models/gemma.py @@ -20,7 +20,6 @@ import torch from torch import nn -from transformers import PretrainedConfig from sglang.srt.distributed import get_tensor_model_parallel_world_size from sglang.srt.layers.activation import GeluAndMul @@ -38,6 +37,7 @@ from sglang.srt.model_executor.forward_batch_info import ForwardBatch from sglang.srt.model_loader.weight_utils import default_weight_loader from sglang.srt.utils import add_prefix +from transformers import PretrainedConfig class GemmaMLP(nn.Module): @@ -172,7 +172,7 @@ def __init__( head_dim=config.head_dim, layer_id=layer_id, max_position_embeddings=config.max_position_embeddings, - rope_theta=config.rope_theta, + 
rope_theta=config.rope_parameters.get("rope_theta", 10000), quant_config=quant_config, prefix=add_prefix("self_attn", prefix), ) diff --git a/python/sglang/srt/models/gemma2.py b/python/sglang/srt/models/gemma2.py index 883eec81fe68..56d8333b6fad 100644 --- a/python/sglang/srt/models/gemma2.py +++ b/python/sglang/srt/models/gemma2.py @@ -19,7 +19,6 @@ import torch from torch import nn -from transformers import PretrainedConfig from sglang.srt.distributed import get_tensor_model_parallel_world_size from sglang.srt.layers.activation import GeluAndMul @@ -40,6 +39,7 @@ maybe_remap_kv_scale_name, ) from sglang.srt.utils import add_prefix, is_npu, make_layers +from transformers import PretrainedConfig _is_npu = is_npu() @@ -217,7 +217,7 @@ def __init__( num_kv_heads=config.num_key_value_heads, head_dim=config.head_dim, max_position_embeddings=config.max_position_embeddings, - rope_theta=config.rope_theta, + rope_theta=config.rope_parameters.get("rope_theta", 10000), quant_config=quant_config, prefix=add_prefix("self_attn", prefix), ) diff --git a/python/sglang/srt/models/gemma3_causal.py b/python/sglang/srt/models/gemma3_causal.py index 17c535d73d3f..3d17b7bfb39a 100644 --- a/python/sglang/srt/models/gemma3_causal.py +++ b/python/sglang/srt/models/gemma3_causal.py @@ -17,12 +17,6 @@ import einops import torch from torch import nn -from transformers import ( - ROPE_INIT_FUNCTIONS, - Gemma3TextConfig, - PretrainedConfig, - PreTrainedModel, -) from sglang.srt.distributed import get_tensor_model_parallel_world_size from sglang.srt.layers.activation import GeluAndMul @@ -43,6 +37,12 @@ maybe_remap_kv_scale_name, ) from sglang.srt.utils import add_prefix, make_layers +from transformers import ( + ROPE_INIT_FUNCTIONS, + Gemma3TextConfig, + PretrainedConfig, + PreTrainedModel, +) # Aligned with HF's implementation, using sliding window inclusive with the last token @@ -176,8 +176,8 @@ def __init__( self.sliding_window = get_attention_sliding_window_size(config) else: # Global attention. Use the values in config.json. - self.rope_theta = config.rope_theta - self.rope_scaling = config.rope_scaling + self.rope_theta = config.rope_parameters.get("rope_theta", 10000) + self.rope_scaling = config.rope_parameters.get("rope_scaling") self.sliding_window = None self.attn = RadixAttention( @@ -325,9 +325,10 @@ class Gemma3RotaryEmbedding(nn.Module): def __init__(self, config: Gemma3TextConfig, device=None): super().__init__() # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and config.rope_scaling is not None: - self.rope_type = config.rope_scaling.get( - "rope_type", config.rope_scaling.get("type", "default") + rope_scaling = config.rope_parameters.get("rope_scaling") + if rope_scaling is not None: + self.rope_type = rope_scaling.get( + "rope_type", rope_scaling.get("type", "default") ) else: @@ -452,8 +453,8 @@ def __init__( # when we want to create a local RoPE layer. 
Config defaults should hold values for global RoPE config = copy.deepcopy(config) - config.rope_theta = config.rope_local_base_freq - config.rope_scaling = {"rope_type": "default"} + config.rope_parameters["rope_theta"] = config.rope_local_base_freq + config.rope_parameters["rope_scaling"] = {"rope_type": "default"} self.rotary_emb_local = Gemma3RotaryEmbedding(config=config) self.layers = make_layers( diff --git a/python/sglang/srt/models/gemma3n_causal.py b/python/sglang/srt/models/gemma3n_causal.py index 0f710b0f8741..5acd112a90ed 100644 --- a/python/sglang/srt/models/gemma3n_causal.py +++ b/python/sglang/srt/models/gemma3n_causal.py @@ -3,7 +3,6 @@ import torch import torch.nn.functional as F from torch import nn -from transformers import AutoModel, Gemma3nTextConfig, PretrainedConfig, PreTrainedModel from sglang.srt.distributed import get_tensor_model_parallel_world_size from sglang.srt.layers.activation import GeluAndMul @@ -26,6 +25,7 @@ ) from sglang.srt.models.gemma3_causal import Gemma3TextScaledWordEmbedding from sglang.srt.utils import add_prefix, make_layers +from transformers import AutoModel, Gemma3nTextConfig, PretrainedConfig, PreTrainedModel # Aligned with HF's implementation, using sliding window inclusive with the last token @@ -396,8 +396,8 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=config.max_position_embeddings, - base=config.rope_theta, - rope_scaling=config.rope_scaling, + base=config.rope_parameters.get("rope_theta", 10000), + rope_scaling=config.rope_parameters.get("rope_scaling"), ) self.sliding_window = config.sliding_window if self.is_sliding else None diff --git a/python/sglang/srt/models/glm4.py b/python/sglang/srt/models/glm4.py index ba40a1f7446a..1298831f8d5e 100644 --- a/python/sglang/srt/models/glm4.py +++ b/python/sglang/srt/models/glm4.py @@ -217,20 +217,9 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - - rp = getattr(config, "rope_parameters", None) - if isinstance(rp, dict): - rope_theta = rp.get("rope_theta", getattr(config, "rope_theta", 1000000)) - partial_rotary_factor = rp.get( - "partial_rotary_factor", - getattr(config, "partial_rotary_factor", 0.5), - ) - rope_scaling = getattr(config, "rope_scaling", None) - else: - rope_theta = getattr(config, "rope_theta", 1000000) - rope_scaling = getattr(config, "rope_scaling", None) - partial_rotary_factor = getattr(config, "partial_rotary_factor", 0.5) - + rope_theta = config.rope_parameters.get("rope_theta", 1000000) + rope_scaling = config.rope_parameters.get("rope_scaling") + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 0.5) bias = getattr(config, "attention_bias", True) max_position_embeddings = getattr(config, "max_position_embeddings", 32768) head_dim = getattr(config, "head_dim", None) diff --git a/python/sglang/srt/models/glm4_moe.py b/python/sglang/srt/models/glm4_moe.py index e3eeb7d10b64..607a4630b9aa 100644 --- a/python/sglang/srt/models/glm4_moe.py +++ b/python/sglang/srt/models/glm4_moe.py @@ -20,7 +20,6 @@ import torch import torch.nn.functional as F from torch import nn -from transformers import PretrainedConfig from sglang.srt.batch_overlap.two_batch_overlap import model_forward_maybe_tbo from sglang.srt.distributed import ( @@ -94,6 +93,7 @@ log_info_on_rank0, make_layers, ) +from transformers import PretrainedConfig _is_hip = is_hip() _is_cuda = is_cuda() @@ -678,8 +678,8 @@ def __init__( nn.Module.__init__(self) self.hidden_size = config.hidden_size self.config = config - rope_theta = 
getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) + rope_theta = config.rope_parameters.get("rope_theta", 10000) + rope_scaling = config.rope_parameters.get("rope_scaling") partial_rotary_factor = getattr( getattr(config, "rope_parameters", None), "partial_rotary_factor", None ) or getattr(config, "partial_rotary_factor", 0.5) diff --git a/python/sglang/srt/models/gpt_oss.py b/python/sglang/srt/models/gpt_oss.py index 2cf813bced0f..0ec575ac95cf 100644 --- a/python/sglang/srt/models/gpt_oss.py +++ b/python/sglang/srt/models/gpt_oss.py @@ -23,7 +23,6 @@ import torch from torch import nn -from transformers import PretrainedConfig from sglang.srt.compilation.piecewise_context_manager import ( get_forward_context, @@ -77,6 +76,7 @@ from sglang.srt.server_args import get_global_server_args from sglang.srt.utils import LazyValue, add_prefix, is_cuda, is_npu, make_layers from sglang.srt.utils.custom_op import register_custom_op +from transformers import PretrainedConfig _is_cuda = is_cuda() _is_npu = is_npu() @@ -379,8 +379,8 @@ def __init__( super().__init__() self.config = config self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) + rope_theta = config.rope_parameters.get("rope_theta", 10000) + rope_scaling = config.rope_parameters.get("rope_scaling") max_position_embeddings = getattr(config, "max_position_embeddings", 8192) head_dim = getattr( config, "head_dim", config.hidden_size // config.num_attention_heads diff --git a/python/sglang/srt/models/granite.py b/python/sglang/srt/models/granite.py index 19252dc8db62..549a40bd1b2b 100644 --- a/python/sglang/srt/models/granite.py +++ b/python/sglang/srt/models/granite.py @@ -21,7 +21,6 @@ import torch from torch import nn -from transformers import GraniteConfig from sglang.srt.distributed import get_tensor_model_parallel_world_size from sglang.srt.layers.activation import SiluAndMul @@ -44,6 +43,7 @@ from sglang.srt.model_loader.weight_utils import default_weight_loader from sglang.srt.utils import add_prefix from sglang.utils import get_exception_traceback +from transformers import GraniteConfig logger = logging.getLogger(__name__) @@ -187,8 +187,8 @@ def __init__( super().__init__() self.hidden_size = config.hidden_size self.residual_multiplier = config.residual_multiplier - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) + rope_theta = config.rope_parameters.get("rope_theta", 10000) + rope_scaling = config.rope_parameters.get("rope_scaling") if rope_scaling is not None and getattr( config, "original_max_position_embeddings", None ): diff --git a/python/sglang/srt/models/granitemoe.py b/python/sglang/srt/models/granitemoe.py index d65b9ec06d31..5b3c8b6678d4 100644 --- a/python/sglang/srt/models/granitemoe.py +++ b/python/sglang/srt/models/granitemoe.py @@ -4,7 +4,6 @@ import torch from torch import nn -from transformers import GraniteConfig from sglang.srt.distributed import get_tensor_model_parallel_world_size from sglang.srt.layers.layernorm import RMSNorm @@ -27,6 +26,7 @@ from sglang.srt.model_executor.forward_batch_info import ForwardBatch from sglang.srt.models import mixtral from sglang.srt.utils import add_prefix +from transformers import GraniteConfig class GraniteMoeMoE(nn.Module): @@ -187,7 +187,7 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) + rope_theta = 
config.rope_parameters.get("rope_theta", 10000) self.self_attn = GraniteMoeAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, diff --git a/python/sglang/srt/models/grok.py b/python/sglang/srt/models/grok.py index 85155719aeed..a6cdc940246d 100644 --- a/python/sglang/srt/models/grok.py +++ b/python/sglang/srt/models/grok.py @@ -19,7 +19,6 @@ import torch from torch import nn -from transformers import PretrainedConfig from sglang.srt.distributed import ( get_tensor_model_parallel_rank, @@ -60,6 +59,7 @@ from sglang.srt.model_loader.loader import DefaultModelLoader from sglang.srt.model_loader.weight_utils import default_weight_loader from sglang.srt.utils import add_prefix +from transformers import PretrainedConfig logger = logging.getLogger(__name__) @@ -453,7 +453,7 @@ def __init__( self.layer_id = layer_id self.alt_stream = alt_stream or torch.cuda.Stream() - rope_theta = getattr(config, "rope_theta", 10000) + rope_theta = config.rope_parameters.get("rope_theta", 10000) self.self_attn = Grok1Attention( config=config, hidden_size=self.hidden_size, diff --git a/python/sglang/srt/models/hunyuan.py b/python/sglang/srt/models/hunyuan.py index 300493a3f1e8..8fd024a25583 100644 --- a/python/sglang/srt/models/hunyuan.py +++ b/python/sglang/srt/models/hunyuan.py @@ -17,7 +17,6 @@ import torch from torch import nn -from transformers import PretrainedConfig from sglang.srt.distributed import ( get_tensor_model_parallel_rank, @@ -52,6 +51,7 @@ maybe_remap_kv_scale_name, ) from sglang.srt.utils import is_hip +from transformers import PretrainedConfig expert_distribution_recorder = ExpertDistributionRecorder() @@ -401,8 +401,8 @@ def __init__( if isinstance(config.intermediate_size, int) else config.intermediate_size[layer_id] ) - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) + rope_theta = config.rope_parameters.get("rope_theta", 10000) + rope_scaling = config.rope_parameters.get("rope_scaling") if rope_scaling is not None and getattr( config, "original_max_position_embeddings", None ): diff --git a/python/sglang/srt/models/internlm2.py b/python/sglang/srt/models/internlm2.py index 7e0956ff090f..d243799a90c0 100644 --- a/python/sglang/srt/models/internlm2.py +++ b/python/sglang/srt/models/internlm2.py @@ -18,7 +18,6 @@ import torch from torch import nn -from transformers import PretrainedConfig from sglang.srt.distributed import get_tensor_model_parallel_world_size from sglang.srt.layers.activation import SiluAndMul @@ -39,6 +38,7 @@ from sglang.srt.model_executor.forward_batch_info import ForwardBatch from sglang.srt.model_loader.weight_utils import default_weight_loader from sglang.srt.utils import add_prefix +from transformers import PretrainedConfig class InternLM2MLP(nn.Module): @@ -173,8 +173,8 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) + rope_theta = config.rope_parameters.get("rope_theta", 10000) + rope_scaling = config.rope_parameters.get("rope_scaling") max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.attention = InternLM2Attention( hidden_size=self.hidden_size, diff --git a/python/sglang/srt/models/iquest_loopcoder.py b/python/sglang/srt/models/iquest_loopcoder.py index 240aa5306a29..e2181c7d005d 100644 --- a/python/sglang/srt/models/iquest_loopcoder.py +++ b/python/sglang/srt/models/iquest_loopcoder.py @@ -18,7 +18,6 @@ import 
torch from torch import nn -from transformers import PretrainedConfig from sglang.srt.distributed import get_tensor_model_parallel_world_size from sglang.srt.layers.layernorm import RMSNorm @@ -39,6 +38,7 @@ from sglang.srt.model_loader.weight_utils import default_weight_loader from sglang.srt.models.llama import LlamaMLP as LoopCoderMLP from sglang.srt.utils import add_prefix, make_layers +from transformers import PretrainedConfig logger = logging.getLogger(__name__) @@ -166,8 +166,8 @@ def __init__( prefix=add_prefix("o_proj", prefix), ) - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) + rope_theta = config.rope_parameters.get("rope_theta", 10000) + rope_scaling = config.rope_parameters.get("rope_scaling") max_position_embeddings = getattr( config, "max_position_embeddings", max_position ) diff --git a/python/sglang/srt/models/jet_nemotron.py b/python/sglang/srt/models/jet_nemotron.py index 513f2ce3759a..47c82908f22b 100644 --- a/python/sglang/srt/models/jet_nemotron.py +++ b/python/sglang/srt/models/jet_nemotron.py @@ -374,8 +374,8 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=self.config.max_position_embeddings, - base=int(self.config.rope_theta), - rope_scaling=self.config.rope_scaling, + base=int(self.config.rope_parameters.get("rope_theta", 10000)), + rope_scaling=self.config.rope_parameters.get("rope_scaling"), ) match self.config.layer_types[layer_id]: diff --git a/python/sglang/srt/models/lfm2.py b/python/sglang/srt/models/lfm2.py index 8a271c33606e..cfd0ffd873c0 100644 --- a/python/sglang/srt/models/lfm2.py +++ b/python/sglang/srt/models/lfm2.py @@ -124,13 +124,13 @@ def __init__( if rope_parameters is not None and "rope_theta" in rope_parameters: rope_theta = rope_parameters["rope_theta"] else: - rope_theta = getattr(config, "rope_theta", 10000) + rope_theta = config.rope_parameters.get("rope_theta", 10000) self.rotary_emb = get_rope( head_size=self.head_dim, rotary_dim=self.head_dim, max_position=getattr(config, "max_position_embeddings", 8192), - rope_scaling=getattr(config, "rope_scaling", None), + rope_scaling=config.rope_parameters.get("rope_scaling"), base=rope_theta, is_neox_style=True, dtype=torch.get_default_dtype(), @@ -424,10 +424,10 @@ def forward( input_ids: torch.Tensor, positions: torch.Tensor, forward_batch: ForwardBatch, - inputs_embeds: Optional[torch.Tensor] = None, + input_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: hidden_states = ( - inputs_embeds if inputs_embeds is not None else self.embed_tokens(input_ids) + input_embeds if input_embeds is not None else self.embed_tokens(input_ids) ) residual = None @@ -474,16 +474,19 @@ def __init__( def get_num_kv_cache_layers(self) -> int: return self.num_attention_layers + def get_input_embeddings(self): + return self.model.embed_tokens + @torch.no_grad() def forward( self, input_ids: torch.Tensor, positions: torch.Tensor, forward_batch: ForwardBatch, - inputs_embeds: Optional[torch.Tensor] = None, + input_embeds: Optional[torch.Tensor] = None, **kwargs, ): - hidden_states = self.model(input_ids, positions, forward_batch, inputs_embeds) + hidden_states = self.model(input_ids, positions, forward_batch, input_embeds) return self.logits_processor( input_ids, hidden_states, self.lm_head, forward_batch ) diff --git a/python/sglang/srt/models/lfm2_vl.py b/python/sglang/srt/models/lfm2_vl.py new file mode 100644 index 000000000000..99083a56320a --- /dev/null +++ b/python/sglang/srt/models/lfm2_vl.py @@ -0,0 +1,281 @@ +# 
Copyright 2025 SGLang Team +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Inference-only LFM2-VL model compatible with HuggingFace weights.""" + +import logging +from typing import Iterable, List, Optional, Tuple + +import torch +from torch import nn +from transformers import PreTrainedModel +from transformers.activations import ACT2FN +from transformers.models.auto.modeling_auto import AutoModel + +from sglang.srt.configs.lfm2_vl import Lfm2VlConfig +from sglang.srt.layers.logits_processor import LogitsProcessor +from sglang.srt.layers.quantization.base_config import QuantizationConfig +from sglang.srt.managers.mm_utils import ( + MultiModalityDataPaddingPatternMultimodalTokens, + general_mm_embed_routine, +) +from sglang.srt.managers.schedule_batch import ( + MultimodalDataItem, + MultimodalInputs, + flatten_nested_list, +) +from sglang.srt.model_executor.forward_batch_info import ForwardBatch +from sglang.srt.model_loader.weight_utils import default_weight_loader +from sglang.srt.models.lfm2 import Lfm2ForCausalLM +from sglang.srt.utils import add_prefix + +logger = logging.getLogger(__name__) + + +class Lfm2VlMultiModalProjector(nn.Module): + """Multimodal projector with pixel unshuffle downsampling.""" + + def __init__(self, config: Lfm2VlConfig): + super().__init__() + in_channels = config.vision_config.hidden_size * (config.downsample_factor**2) + self.factor = config.downsample_factor + self.use_layer_norm = config.projector_use_layernorm + self.layer_norm = ( + nn.LayerNorm(in_channels) if config.projector_use_layernorm else None + ) + self.linear_1 = nn.Linear( + in_channels, + config.projector_hidden_size, + bias=config.projector_bias, + ) + self.act = ACT2FN[config.projector_hidden_act] + self.linear_2 = nn.Linear( + config.projector_hidden_size, + config.text_config.hidden_size, + bias=config.projector_bias, + ) + + def forward(self, image_features: torch.Tensor): + image_features = self.pixel_unshuffle(image_features) + if self.use_layer_norm: + image_features = self.layer_norm(image_features) + hidden_states = self.linear_1(image_features) + hidden_states = self.act(hidden_states) + hidden_states = self.linear_2(hidden_states) + return hidden_states + + def pixel_unshuffle(self, hidden_states: torch.Tensor): + batch_size, width, height, channels = hidden_states.size() + hidden_states = hidden_states.reshape( + batch_size, width, height // self.factor, channels * self.factor + ) + hidden_states = hidden_states.permute(0, 2, 1, 3) + hidden_states = hidden_states.reshape( + batch_size, + height // self.factor, + width // self.factor, + channels * self.factor**2, + ) + hidden_states = hidden_states.permute(0, 2, 1, 3) + return hidden_states + + +class Lfm2VlForConditionalGeneration(PreTrainedModel): + config_class = Lfm2VlConfig + + def __init__( + self, + config: Lfm2VlConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__(config=config) + 
self.config = config + self.quant_config = quant_config + + # Vision tower: SigLip2 via HF AutoModel + self.vision_tower = AutoModel.from_config(config=config.vision_config) + + # Multimodal projector + self.multi_modal_projector = Lfm2VlMultiModalProjector(config) + + # Language model: reuse sglang's LFM2 implementation + self.language_model = Lfm2ForCausalLM( + config.text_config, + quant_config=quant_config, + prefix=add_prefix("language_model", prefix), + ) + + self.logits_processor = LogitsProcessor(config.text_config) + self.post_init() + + def pad_input_ids( + self, input_ids: List[int], mm_inputs: MultimodalInputs + ) -> List[int]: + pattern = MultiModalityDataPaddingPatternMultimodalTokens() + result = pattern.pad_input_tokens(input_ids, mm_inputs) + return result + + def get_input_embeddings(self) -> nn.Embedding: + return self.language_model.model.embed_tokens + + def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor: + """Process images through vision tower and projector. + + Handles SigLip2's NaFlex variable-resolution output by unpadding + features using the attention mask and reshaping per spatial_shapes. + """ + all_pixel_values = flatten_nested_list([item.feature for item in items]) + all_pixel_attention_masks = flatten_nested_list( + [item.pixel_attention_mask for item in items] + ) + all_spatial_shapes = flatten_nested_list( + [item.spatial_shapes for item in items] + ) + + image_features_list = [] + + for pixel_values_batch, attn_mask_batch, shapes_batch in zip( + all_pixel_values, all_pixel_attention_masks, all_spatial_shapes + ): + # Normalize shapes + if pixel_values_batch.dim() == 2: + pixel_values_batch = pixel_values_batch.unsqueeze(0) + if attn_mask_batch.dim() == 1: + attn_mask_batch = attn_mask_batch.unsqueeze(0) + if shapes_batch.dim() == 1: + shapes_batch = shapes_batch.unsqueeze(0) + + pixel_values_batch = pixel_values_batch.to( + device=self.vision_tower.device, + dtype=self.vision_tower.dtype, + ) + attn_mask_batch = attn_mask_batch.to(device=self.vision_tower.device) + shapes_batch = shapes_batch.to(device=self.vision_tower.device) + + # Forward through SigLip2 vision tower + vision_outputs = self.vision_tower( + pixel_values=pixel_values_batch, + spatial_shapes=shapes_batch, + pixel_attention_mask=attn_mask_batch, + return_dict=True, + ) + last_hidden_state = vision_outputs.last_hidden_state + + # Unpad and project each image + img_feature_lengths = attn_mask_batch.sum(dim=1) + batch_size = last_hidden_state.size(0) + + for img_idx in range(batch_size): + feature = last_hidden_state[img_idx] + # Unpad: keep only non-padded tokens + feat_len = img_feature_lengths[img_idx].item() + feature = feature[:feat_len, :].unsqueeze(0) + + # Reshape to spatial dimensions (1, H, W, C) + h, w = shapes_batch[img_idx].tolist() + feature = feature.reshape(1, int(h), int(w), -1) + + # Project through multimodal projector + img_embedding = self.multi_modal_projector(feature) + + # Flatten to (num_tokens, hidden_size) + img_embedding = img_embedding.reshape(-1, img_embedding.size(-1)) + image_features_list.append(img_embedding) + + if image_features_list: + return torch.cat(image_features_list, dim=0) + + return torch.tensor( + [], device=self.vision_tower.device, dtype=self.vision_tower.dtype + ) + + @torch.no_grad() + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + forward_batch: ForwardBatch, + input_embeds: torch.Tensor = None, + **kwargs, + ): + hidden_states = general_mm_embed_routine( + input_ids=input_ids, + 
forward_batch=forward_batch, + language_model=self.language_model, + multimodal_model=self, + positions=positions, + ) + return hidden_states + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + # Collect weights by destination + vision_weights = [] + projector_weights = [] + lm_weights = [] + + for name, loaded_weight in weights: + if name.startswith("model.vision_tower."): + # model.vision_tower.* → vision_tower.* + new_name = name.replace("model.vision_tower.", "vision_tower.", 1) + vision_weights.append((new_name, loaded_weight)) + elif name.startswith("model.multi_modal_projector."): + # model.multi_modal_projector.* → multi_modal_projector.* + new_name = name.replace( + "model.multi_modal_projector.", "multi_modal_projector.", 1 + ) + projector_weights.append((new_name, loaded_weight)) + elif name.startswith("model.language_model."): + # model.language_model.* → language_model.model.* + new_name = name.replace( + "model.language_model.", "language_model.model.", 1 + ) + lm_weights.append((new_name, loaded_weight)) + elif name.startswith("lm_head."): + # lm_head.* → language_model.lm_head.* + new_name = name.replace("lm_head.", "language_model.lm_head.", 1) + lm_weights.append((new_name, loaded_weight)) + else: + # Try direct mapping + lm_weights.append((name, loaded_weight)) + + params_dict = dict(self.named_parameters()) + + # Load vision tower weights + for name, loaded_weight in vision_weights: + if name not in params_dict: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader(param, loaded_weight) + + # Load projector weights + for name, loaded_weight in projector_weights: + if name not in params_dict: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader(param, loaded_weight) + + # Load language model weights via Lfm2ForCausalLM.load_weights + # Strip the "language_model." prefix since Lfm2ForCausalLM expects + # names like "model.layers.0..." 
and "lm_head.weight" + lm_weights_stripped = [] + for name, loaded_weight in lm_weights: + if name.startswith("language_model."): + name = name[len("language_model.") :] + lm_weights_stripped.append((name, loaded_weight)) + self.language_model.load_weights(lm_weights_stripped) + + +EntryClass = Lfm2VlForConditionalGeneration diff --git a/python/sglang/srt/models/llada2.py b/python/sglang/srt/models/llada2.py index 7094be5c53c9..5bff2247b7cf 100644 --- a/python/sglang/srt/models/llada2.py +++ b/python/sglang/srt/models/llada2.py @@ -24,7 +24,6 @@ import torch import torch.nn.functional as F from torch import nn -from transformers import PretrainedConfig from sglang.srt.distributed import ( get_pp_group, @@ -77,6 +76,7 @@ ) from sglang.srt.server_args import get_global_server_args from sglang.srt.utils import add_prefix, is_cuda, is_non_idle_and_non_empty, make_layers +from transformers import PretrainedConfig LoraConfig = None logger = logging.getLogger(__name__) @@ -477,8 +477,8 @@ def __init__( self.head_dim, rotary_dim=self.rotary_dim, max_position=config.max_position_embeddings, - base=config.rope_theta, - rope_scaling=config.rope_scaling, + base=config.rope_parameters.get("rope_theta", 10000), + rope_scaling=config.rope_parameters.get("rope_scaling"), ) self.attn = RadixAttention( diff --git a/python/sglang/srt/models/llama.py b/python/sglang/srt/models/llama.py index 01e934dcc096..5893fddc4008 100644 --- a/python/sglang/srt/models/llama.py +++ b/python/sglang/srt/models/llama.py @@ -21,7 +21,6 @@ import torch from torch import nn -from transformers import LlamaConfig from sglang.srt.distributed import ( get_pp_group, @@ -54,6 +53,7 @@ from sglang.srt.server_args import get_global_server_args from sglang.srt.utils import add_prefix, is_npu, make_layers from sglang.utils import get_exception_traceback +from transformers import LlamaConfig logger = logging.getLogger(__name__) _is_npu = is_npu() @@ -252,8 +252,8 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) + rope_theta = config.rope_parameters.get("rope_theta", 10000) + rope_scaling = config.rope_parameters.get("rope_scaling") if rope_scaling is not None and getattr( config, "original_max_position_embeddings", None ): diff --git a/python/sglang/srt/models/llama4.py b/python/sglang/srt/models/llama4.py index 2d3a4fc10257..5e6287a448f8 100644 --- a/python/sglang/srt/models/llama4.py +++ b/python/sglang/srt/models/llama4.py @@ -21,7 +21,6 @@ import torch from torch import nn -from transformers import Llama4TextConfig from sglang.srt.distributed import ( get_tensor_model_parallel_world_size, @@ -59,6 +58,7 @@ make_layers, ) from sglang.srt.utils.common import get_current_device_stream_fast +from transformers import Llama4TextConfig _is_cuda = is_cuda() @@ -362,8 +362,8 @@ def __init__( super().__init__() self.layer_id = layer_id self.hidden_size = config.hidden_size - rope_theta = config.rope_theta - rope_scaling = config.rope_scaling + rope_theta = config.rope_parameters.get("rope_theta", 10000) + rope_scaling = config.rope_parameters.get("rope_scaling") max_position_embeddings = config.max_position_embeddings self.attn_tp_size = get_attention_tp_size() self.attn_tp_rank = get_attention_tp_rank() diff --git a/python/sglang/srt/models/llama_eagle3.py b/python/sglang/srt/models/llama_eagle3.py index 49f938a1c5fe..550be8f74a1e 100644 --- a/python/sglang/srt/models/llama_eagle3.py +++ 
b/python/sglang/srt/models/llama_eagle3.py @@ -24,7 +24,6 @@ import torch from torch import nn -from transformers import LlamaConfig from sglang.srt.distributed import get_pp_group from sglang.srt.layers.layernorm import RMSNorm @@ -38,6 +37,7 @@ from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors from sglang.srt.model_loader.weight_utils import default_weight_loader from sglang.srt.models.llama import LlamaDecoderLayer, LlamaForCausalLM, LlamaMLP +from transformers import LlamaConfig class LlamaDecoderLayer(LlamaDecoderLayer): @@ -111,14 +111,13 @@ def __init__( super().__init__() self.config = config + rope_scaling = config.rope_parameters.get("rope_scaling") self.is_mrope_enabled = ( - hasattr(config, "rope_scaling") - and config.rope_scaling is not None - and "mrope_section" in config.rope_scaling + rope_scaling is not None and "mrope_section" in rope_scaling ) # fix rope_scaling for qwen2.5-vl if self.is_mrope_enabled: - config.rope_scaling["rope_type"] = "default" + config.rope_parameters["rope_scaling"]["rope_type"] = "default" self.vocab_size = config.vocab_size self.embed_tokens = VocabParallelEmbedding( diff --git a/python/sglang/srt/models/longcat_flash.py b/python/sglang/srt/models/longcat_flash.py index c16a93797309..8aa4f296817b 100644 --- a/python/sglang/srt/models/longcat_flash.py +++ b/python/sglang/srt/models/longcat_flash.py @@ -328,7 +328,7 @@ def __init__( v_head_dim=config.v_head_dim, q_lora_rank=config.q_lora_rank, kv_lora_rank=config.kv_lora_rank, - rope_theta=config.rope_theta, + rope_theta=config.rope_parameters.get("rope_theta", 10000), rope_scaling=None, max_position_embeddings=config.max_position_embeddings, quant_config=( diff --git a/python/sglang/srt/models/longcat_flash_nextn.py b/python/sglang/srt/models/longcat_flash_nextn.py index 12c9cb13fae9..f9484c7092a9 100644 --- a/python/sglang/srt/models/longcat_flash_nextn.py +++ b/python/sglang/srt/models/longcat_flash_nextn.py @@ -132,7 +132,7 @@ def __init__( v_head_dim=config.v_head_dim, q_lora_rank=config.q_lora_rank, kv_lora_rank=config.kv_lora_rank, - rope_theta=config.rope_theta, + rope_theta=config.rope_parameters.get("rope_theta", 10000), rope_scaling=None, max_position_embeddings=config.max_position_embeddings, quant_config=quant_config, diff --git a/python/sglang/srt/models/midashenglm.py b/python/sglang/srt/models/midashenglm.py index 2698fd724edc..5afa3ba6f559 100644 --- a/python/sglang/srt/models/midashenglm.py +++ b/python/sglang/srt/models/midashenglm.py @@ -7,7 +7,6 @@ import torch import torch.nn as nn import torchaudio.functional as F -from transformers import PretrainedConfig from sglang.srt.layers.attention.vision import VisionAttention from sglang.srt.layers.linear import ColumnParallelLinear, RowParallelLinear @@ -25,6 +24,7 @@ from sglang.srt.model_loader.weight_utils import default_weight_loader from sglang.srt.models.qwen2 import Qwen2ForCausalLM from sglang.srt.utils import add_prefix +from transformers import PretrainedConfig logger = logging.getLogger(__name__) _Tuple2: TypeAlias = int | tuple[int, int] | Sequence[int] @@ -475,18 +475,14 @@ def __init__( ) -> None: super().__init__() self.config = config - if ( - hasattr(config.text_config, "rope_scaling") - and config.text_config.rope_scaling - ): - if "mrope_section" in config.text_config.rope_scaling: + rope_scaling = config.text_config.rope_parameters.get("rope_scaling") + if rope_scaling: + if "mrope_section" in rope_scaling: new_rope_scaling = { - k: v - for k, v in 
config.text_config.rope_scaling.items() - if k != "mrope_section" + k: v for k, v in rope_scaling.items() if k != "mrope_section" } - config.text_config.rope_scaling = ( + config.text_config.rope_parameters["rope_scaling"] = ( new_rope_scaling if new_rope_scaling else None ) self.audio_encoder = DashengAudioTransformer( diff --git a/python/sglang/srt/models/mimo_v2_flash.py b/python/sglang/srt/models/mimo_v2_flash.py index d6f5eb07f4de..7469f7ade040 100644 --- a/python/sglang/srt/models/mimo_v2_flash.py +++ b/python/sglang/srt/models/mimo_v2_flash.py @@ -573,8 +573,8 @@ def __init__( self.hidden_size = config.hidden_size self.layer_id = layer_id - rope_theta = getattr(config, "rope_theta", 1000000) - rope_scaling = getattr(config, "rope_scaling", None) + rope_theta = config.rope_parameters.get("rope_theta", 1000000) + rope_scaling = config.rope_parameters.get("rope_scaling") max_position_embeddings = getattr(config, "max_position_embeddings", 32768) if self.is_swa_layer(): @@ -591,7 +591,7 @@ def __init__( config, "add_swa_attention_sink_bias", False ), layer_id=layer_id, - rope_theta=getattr(config, "swa_rope_theta", rope_theta), + rope_theta=config.rope_parameters.get("swa_rope_theta", rope_theta), rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, diff --git a/python/sglang/srt/models/mimo_v2_flash_nextn.py b/python/sglang/srt/models/mimo_v2_flash_nextn.py index 18b5453953c0..b43b68c6713a 100644 --- a/python/sglang/srt/models/mimo_v2_flash_nextn.py +++ b/python/sglang/srt/models/mimo_v2_flash_nextn.py @@ -17,7 +17,6 @@ import torch from torch import nn -from transformers import PretrainedConfig from sglang.srt.distributed import get_tensor_model_parallel_world_size from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder @@ -46,6 +45,7 @@ ) from sglang.srt.server_args import get_global_server_args from sglang.srt.utils import add_prefix +from transformers import PretrainedConfig MiMoV2FlashConfig = None @@ -64,8 +64,8 @@ def __init__( self.config = config self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 1000000) - rope_scaling = getattr(config, "rope_scaling", None) + rope_theta = config.rope_parameters.get("rope_theta", 1000000) + rope_scaling = config.rope_parameters.get("rope_scaling") max_position_embeddings = getattr(config, "max_position_embeddings", 32768) self.self_attn = MiMoV2Attention( @@ -79,7 +79,7 @@ def __init__( attention_bias=config.attention_bias, attention_sink_bias=getattr(config, "add_swa_attention_sink_bias", False), layer_id=layer_id, - rope_theta=getattr(config, "swa_rope_theta", rope_theta), + rope_theta=config.rope_parameters.get("swa_rope_theta", rope_theta), rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, diff --git a/python/sglang/srt/models/minicpm.py b/python/sglang/srt/models/minicpm.py index e7c94c85d0b2..68bbe651e69d 100644 --- a/python/sglang/srt/models/minicpm.py +++ b/python/sglang/srt/models/minicpm.py @@ -176,8 +176,8 @@ def __init__( super().__init__() self.config = config self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) + rope_theta = config.rope_parameters.get("rope_theta", 10000) + rope_scaling = config.rope_parameters.get("rope_scaling") max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.self_attn = MiniCPMAttention( hidden_size=self.hidden_size, diff 
--git a/python/sglang/srt/models/minicpm3.py b/python/sglang/srt/models/minicpm3.py index 9755a6f6b218..e975596b8d97 100644 --- a/python/sglang/srt/models/minicpm3.py +++ b/python/sglang/srt/models/minicpm3.py @@ -18,7 +18,6 @@ import torch from torch import nn -from transformers import PretrainedConfig from sglang.srt.distributed import get_tensor_model_parallel_world_size from sglang.srt.layers.activation import SiluAndMul @@ -40,6 +39,7 @@ from sglang.srt.model_executor.forward_batch_info import ForwardBatch from sglang.srt.model_loader.weight_utils import default_weight_loader from sglang.srt.utils import add_prefix, is_cuda +from transformers import PretrainedConfig if is_cuda(): from sgl_kernel import bmm_fp8 @@ -280,8 +280,8 @@ def __init__( super().__init__() self.config = config self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) + rope_theta = config.rope_parameters.get("rope_theta", 10000) + rope_scaling = config.rope_parameters.get("rope_scaling") max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.self_attn = MiniCPM3AttentionMLA( config=config, diff --git a/python/sglang/srt/models/minimax_m2.py b/python/sglang/srt/models/minimax_m2.py index 827f60e0f469..e32d979617ce 100644 --- a/python/sglang/srt/models/minimax_m2.py +++ b/python/sglang/srt/models/minimax_m2.py @@ -23,7 +23,6 @@ import triton import triton.language as tl from torch import nn -from transformers import PretrainedConfig from sglang.srt.batch_overlap.two_batch_overlap import model_forward_maybe_tbo from sglang.srt.distributed import ( @@ -72,6 +71,7 @@ is_non_idle_and_non_empty, make_layers, ) +from transformers import PretrainedConfig logger = logging.getLogger(__name__) @@ -566,7 +566,7 @@ def __init__( self.scaling = self.head_dim**-0.5 # RoPE settings - support partial RoPE - self.rope_theta = getattr(config, "rope_theta", 10000) + self.rope_theta = config.rope_parameters.get("rope_theta", 10000) self.max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.rotary_dim = getattr( config, "rotary_dim", self.head_dim @@ -596,7 +596,7 @@ def __init__( ) # Setup RoPE with partial rotary dimension - rope_scaling = getattr(config, "rope_scaling", None) + rope_scaling = config.rope_parameters.get("rope_scaling") self.rotary_emb = get_rope( self.head_dim, rotary_dim=self.rotary_dim, # Use partial rotary dimension diff --git a/python/sglang/srt/models/ministral3.py b/python/sglang/srt/models/ministral3.py index 460c7b30fb5e..3b01f011aba5 100644 --- a/python/sglang/srt/models/ministral3.py +++ b/python/sglang/srt/models/ministral3.py @@ -1,7 +1,6 @@ from typing import Any, Dict, Optional import torch -from transformers import PretrainedConfig from sglang.srt.layers.quantization.base_config import QuantizationConfig from sglang.srt.model_executor.forward_batch_info import ForwardBatch @@ -12,6 +11,7 @@ LlamaModel, ) from sglang.srt.utils import add_prefix, make_layers +from transformers import PretrainedConfig def _get_llama_4_attn_scale( @@ -54,11 +54,7 @@ def __init__( bias, ) # Ministral3 specific: llama 4 style scaling beta - self.llama_4_scaling_beta = None - if hasattr(config, "rope_parameters") and config.rope_parameters: - self.llama_4_scaling_beta = config.rope_parameters.get( - "llama_4_scaling_beta" - ) + self.llama_4_scaling_beta = config.rope_parameters.get("llama_4_scaling_beta") # sliding window self.sliding_window = getattr(config, "sliding_window", None) @@ 
-107,12 +103,8 @@ def __init__(self, config, layer_id=0, quant_config=None, prefix=""): num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, layer_id=layer_id, - rope_theta=getattr(config, "rope_parameters", {}).get( - "rope_theta", 1000000.0 - ), - rope_scaling=getattr( - config, "rope_parameters", {} - ), # rope_scaling is rope_parameters in Ministral3Config + rope_theta=config.rope_parameters.get("rope_theta", 1000000.0), + rope_scaling=config.rope_parameters, # rope_scaling is rope_parameters in Ministral3Config max_position_embeddings=getattr( config, "original_max_position_embeddings", 16384 ), diff --git a/python/sglang/srt/models/mixtral.py b/python/sglang/srt/models/mixtral.py index c4f3e4c446f7..f658637635db 100644 --- a/python/sglang/srt/models/mixtral.py +++ b/python/sglang/srt/models/mixtral.py @@ -21,7 +21,6 @@ import torch from torch import nn -from transformers import MixtralConfig from sglang.srt.distributed import ( get_pp_group, @@ -48,6 +47,7 @@ from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors from sglang.srt.model_loader.weight_utils import default_weight_loader from sglang.srt.utils import add_prefix, make_layers +from transformers import MixtralConfig logger = logging.getLogger(__name__) @@ -208,7 +208,7 @@ def __init__( super().__init__() self.hidden_size = config.hidden_size # Requires transformers > 4.32.0 - rope_theta = getattr(config, "rope_theta", 10000) + rope_theta = config.rope_parameters.get("rope_theta", 10000) self.self_attn = MixtralAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, diff --git a/python/sglang/srt/models/mixtral_quant.py b/python/sglang/srt/models/mixtral_quant.py index 5b84c90ddf78..d22dbab9d4e5 100644 --- a/python/sglang/srt/models/mixtral_quant.py +++ b/python/sglang/srt/models/mixtral_quant.py @@ -22,7 +22,6 @@ import torch import torch.nn.functional as F from torch import nn -from transformers import MixtralConfig from sglang.srt.distributed import ( get_tensor_model_parallel_rank, @@ -46,6 +45,7 @@ from sglang.srt.model_executor.forward_batch_info import ForwardBatch from sglang.srt.model_loader.weight_utils import default_weight_loader from sglang.srt.utils import add_prefix +from transformers import MixtralConfig class MixtralMLP(nn.Module): @@ -261,7 +261,7 @@ def __init__( super().__init__() self.hidden_size = config.hidden_size # Requires transformers > 4.32.0 - rope_theta = getattr(config, "rope_theta", 10000) + rope_theta = config.rope_parameters.get("rope_theta", 10000) self.self_attn = MixtralAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, diff --git a/python/sglang/srt/models/mllama4.py b/python/sglang/srt/models/mllama4.py index 0913f9adfd2d..1bc23f128878 100644 --- a/python/sglang/srt/models/mllama4.py +++ b/python/sglang/srt/models/mllama4.py @@ -8,11 +8,6 @@ import torch from torch import nn -from transformers import Llama4Config, Llama4VisionConfig -from transformers.models.llama4.modeling_llama4 import ( - Llama4MultiModalProjector, - vision_apply_rotary_emb, -) from sglang.srt.layers.attention.vision import VisionAttention from sglang.srt.layers.linear import ( @@ -35,6 +30,11 @@ from sglang.srt.model_executor.forward_batch_info import ForwardBatch from sglang.srt.server_args import get_global_server_args from sglang.srt.utils import is_cpu +from transformers import Llama4Config, Llama4VisionConfig +from transformers.models.llama4.modeling_llama4 import ( + Llama4MultiModalProjector, + 
vision_apply_rotary_emb, +) _is_cpu = is_cpu() @@ -305,7 +305,7 @@ def __init__(self, config): frequencies_y = img_idx // idx # get the coordinates of the 2d matrix along y freq_dim = config.hidden_size // config.num_attention_heads // 2 rope_freq = 1.0 / ( - config.rope_theta + config.rope_parameters.get("rope_theta", 10000) ** (torch.arange(0, freq_dim, 2)[: (freq_dim // 2)].float() / freq_dim) ) freqs_x = ( diff --git a/python/sglang/srt/models/nemotron_nas.py b/python/sglang/srt/models/nemotron_nas.py index ebf49f95a4aa..b1402ecc5d6d 100644 --- a/python/sglang/srt/models/nemotron_nas.py +++ b/python/sglang/srt/models/nemotron_nas.py @@ -18,7 +18,6 @@ import torch from torch import nn -from transformers import LlamaConfig from sglang.srt.distributed import get_pp_group from sglang.srt.layers.layernorm import RMSNorm @@ -39,6 +38,7 @@ from sglang.srt.models.llama import LlamaAttention, LlamaMLP from sglang.srt.utils import add_prefix, make_layers from sglang.utils import logger +from transformers import LlamaConfig def _ffn_mult_to_intermediate_size(ffn_mult: float, n_embd: int) -> int: @@ -69,8 +69,8 @@ def __init__( self._is_no_op_ffn = block_config.ffn.no_op self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) + rope_theta = config.rope_parameters.get("rope_theta", 10000) + rope_scaling = config.rope_parameters.get("rope_scaling") if rope_scaling is not None and getattr( config, "original_max_position_embeddings", None ): diff --git a/python/sglang/srt/models/olmo.py b/python/sglang/srt/models/olmo.py index 0c1ecf85700a..7214327a37fc 100644 --- a/python/sglang/srt/models/olmo.py +++ b/python/sglang/srt/models/olmo.py @@ -19,7 +19,6 @@ import torch from torch import nn -from transformers import OlmoConfig from sglang.srt.distributed import get_tensor_model_parallel_world_size from sglang.srt.layers.activation import SiluAndMul @@ -39,6 +38,7 @@ from sglang.srt.model_executor.forward_batch_info import ForwardBatch from sglang.srt.model_loader.weight_utils import default_weight_loader from sglang.srt.utils import add_prefix, make_layers +from transformers import OlmoConfig class OlmoAttention(nn.Module): @@ -67,7 +67,7 @@ def __init__( self.num_heads = self.total_num_heads // tensor_model_parallel_world_size self.head_dim = self.hidden_size // self.total_num_heads self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta + self.rope_theta = config.rope_parameters.get("rope_theta", 10000) self.clip_qkv = config.clip_qkv # Attention input projection. 
Projects x -> (q, k, v) diff --git a/python/sglang/srt/models/olmo2.py b/python/sglang/srt/models/olmo2.py index 8789a3477f40..e43cc56b6fea 100644 --- a/python/sglang/srt/models/olmo2.py +++ b/python/sglang/srt/models/olmo2.py @@ -20,7 +20,6 @@ import torch from torch import nn -from transformers import PretrainedConfig from sglang.srt.distributed import ( get_tensor_model_parallel_rank, @@ -47,6 +46,7 @@ from sglang.srt.model_executor.forward_batch_info import ForwardBatch from sglang.srt.model_loader.weight_utils import default_weight_loader from sglang.srt.utils import add_prefix, is_cuda, make_layers +from transformers import PretrainedConfig _is_cuda = is_cuda() @@ -98,7 +98,7 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta + self.rope_theta = config.rope_parameters.get("rope_theta", 10000) # Attention input projection. Projects x -> (q, k, v) self.qkv_proj = QKVParallelLinear( diff --git a/python/sglang/srt/models/olmoe.py b/python/sglang/srt/models/olmoe.py index a74a2968daef..216fd456a7b1 100644 --- a/python/sglang/srt/models/olmoe.py +++ b/python/sglang/srt/models/olmoe.py @@ -21,7 +21,6 @@ import torch from torch import nn -from transformers import PretrainedConfig from sglang.srt.distributed import get_tensor_model_parallel_world_size from sglang.srt.layers.layernorm import RMSNorm @@ -43,6 +42,7 @@ from sglang.srt.model_executor.forward_batch_info import ForwardBatch from sglang.srt.model_loader.weight_utils import default_weight_loader from sglang.srt.utils import add_prefix, make_layers, print_warning_once +from transformers import PretrainedConfig class OlmoeMoE(nn.Module): @@ -204,8 +204,8 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) + rope_theta = config.rope_parameters.get("rope_theta", 10000) + rope_scaling = config.rope_parameters.get("rope_scaling") max_position_embeddings = getattr(config, "max_position_embeddings", 4096) self.self_attn = OlmoeAttention( diff --git a/python/sglang/srt/models/orion.py b/python/sglang/srt/models/orion.py index cc444d39461c..262e01f0fa12 100644 --- a/python/sglang/srt/models/orion.py +++ b/python/sglang/srt/models/orion.py @@ -12,7 +12,6 @@ import torch from torch import nn -from transformers import PretrainedConfig from sglang.srt.distributed import get_tensor_model_parallel_world_size from sglang.srt.distributed.parallel_state import get_pp_group @@ -34,6 +33,7 @@ from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors from sglang.srt.model_loader.weight_utils import default_weight_loader from sglang.srt.utils import add_prefix, make_layers +from transformers import PretrainedConfig class OrionMLP(nn.Module): @@ -164,8 +164,8 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) + rope_theta = config.rope_parameters.get("rope_theta", 10000) + rope_scaling = config.rope_parameters.get("rope_scaling") max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.self_attn = OrionAttention( hidden_size=self.hidden_size, diff --git a/python/sglang/srt/models/persimmon.py b/python/sglang/srt/models/persimmon.py index 5f8885e716e5..faa62b69a9b6 100644 --- 
a/python/sglang/srt/models/persimmon.py +++ b/python/sglang/srt/models/persimmon.py @@ -3,7 +3,6 @@ import torch from torch import nn -from transformers import PersimmonConfig from sglang.srt.distributed import get_pp_group, get_tensor_model_parallel_world_size from sglang.srt.layers.activation import get_act_fn @@ -24,6 +23,7 @@ from sglang.srt.model_executor.forward_batch_info import ForwardBatch from sglang.srt.model_loader.weight_utils import default_weight_loader from sglang.srt.utils import add_prefix, make_layers +from transformers import PersimmonConfig class PersimmonMLP(nn.Module): @@ -65,7 +65,7 @@ def __init__( self.num_heads = self.total_num_heads // tensor_parallel_world_size self.head_dim = self.hidden_size // self.total_num_heads self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta + self.rope_theta = config.rope_parameters.get("rope_theta", 10000) self.partial_rotary_factor = config.partial_rotary_factor self.is_causal = True diff --git a/python/sglang/srt/models/phi.py b/python/sglang/srt/models/phi.py index 5679bc987812..21cd4d9d69d5 100644 --- a/python/sglang/srt/models/phi.py +++ b/python/sglang/srt/models/phi.py @@ -3,7 +3,6 @@ import torch from torch import nn -from transformers import PhiConfig from sglang.srt.distributed import get_pp_group, get_tensor_model_parallel_world_size from sglang.srt.layers.activation import get_act_fn @@ -23,6 +22,7 @@ from sglang.srt.model_executor.forward_batch_info import ForwardBatch from sglang.srt.model_loader.weight_utils import default_weight_loader from sglang.srt.utils import add_prefix, make_layers +from transformers import PhiConfig class PhiAttention(nn.Module): @@ -63,7 +63,7 @@ def __init__( ) assert rotary_dim % 2 == 0 - rope_theta = getattr(config, "rope_theta", 10000.0) + rope_theta = config.rope_parameters.get("rope_theta", 10000.0) max_position_embeddings = getattr(config, "max_position_embeddings", 2048) self.rotary_emb = get_rope( self.head_size, diff --git a/python/sglang/srt/models/phi3_small.py b/python/sglang/srt/models/phi3_small.py index 9ac855c492f6..8ab60f7c2ceb 100644 --- a/python/sglang/srt/models/phi3_small.py +++ b/python/sglang/srt/models/phi3_small.py @@ -3,8 +3,6 @@ import torch from torch import nn -from transformers import Phi3Config -from transformers.configuration_utils import PretrainedConfig from sglang.srt.distributed import get_pp_group, get_tensor_model_parallel_world_size from sglang.srt.layers.linear import ( @@ -26,6 +24,8 @@ from sglang.srt.model_executor.forward_batch_info import ForwardBatch from sglang.srt.model_loader.weight_utils import default_weight_loader from sglang.srt.utils import add_prefix, make_layers +from transformers import Phi3Config +from transformers.configuration_utils import PretrainedConfig @torch.jit.script @@ -153,8 +153,8 @@ def __init__( prefix=add_prefix("o_proj", prefix), ) - if getattr(self.config, "rope_scaling", None) is not None: - rope_scaling = self.config.rope_scaling + rope_scaling = self.config.rope_parameters.get("rope_scaling") + if rope_scaling is not None: for key in rope_scaling: if isinstance(rope_scaling[key], list): rope_scaling[key] = tuple(rope_scaling[key]) diff --git a/python/sglang/srt/models/phimoe.py b/python/sglang/srt/models/phimoe.py index 0d147c2b1783..d1b95e617bb3 100644 --- a/python/sglang/srt/models/phimoe.py +++ b/python/sglang/srt/models/phimoe.py @@ -2,7 +2,6 @@ import torch from torch import nn -from transformers.configuration_utils import PretrainedConfig from 
sglang.srt.distributed import get_tensor_model_parallel_world_size from sglang.srt.layers.dp_attention import get_attention_tp_rank, get_attention_tp_size @@ -29,6 +28,7 @@ maybe_remap_kv_scale_name, ) from sglang.srt.utils import add_prefix, make_layers +from transformers.configuration_utils import PretrainedConfig class PhiMoEConfig(PretrainedConfig): @@ -336,7 +336,7 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) + rope_theta = config.rope_parameters.get("rope_theta", 10000) self.self_attn = PhiMoEAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, @@ -349,7 +349,7 @@ def __init__( layer_id=layer_id, attention_bias=config.attention_bias, quant_config=quant_config, - rope_scaling=config.rope_scaling, + rope_scaling=config.rope_parameters.get("rope_scaling"), prefix=add_prefix("self_attn", prefix), ) self.block_sparse_moe = PhiMoE( diff --git a/python/sglang/srt/models/qwen.py b/python/sglang/srt/models/qwen.py index 206908b49001..90368eb15763 100644 --- a/python/sglang/srt/models/qwen.py +++ b/python/sglang/srt/models/qwen.py @@ -19,7 +19,6 @@ import torch from torch import nn -from transformers import PretrainedConfig from sglang.srt.distributed import get_tensor_model_parallel_world_size from sglang.srt.layers.activation import SiluAndMul @@ -40,6 +39,7 @@ from sglang.srt.model_executor.forward_batch_info import ForwardBatch from sglang.srt.model_loader.weight_utils import default_weight_loader from sglang.srt.utils import add_prefix +from transformers import PretrainedConfig class QWenMLP(nn.Module): @@ -162,8 +162,8 @@ def __init__( super().__init__() self.ln_1 = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) + rope_theta = config.rope_parameters.get("rope_theta", 10000) + rope_scaling = config.rope_parameters.get("rope_scaling") self.attn = QWenAttention( config.hidden_size, config.num_attention_heads, diff --git a/python/sglang/srt/models/qwen2.py b/python/sglang/srt/models/qwen2.py index 0760c4645479..6b82aba4ce13 100644 --- a/python/sglang/srt/models/qwen2.py +++ b/python/sglang/srt/models/qwen2.py @@ -200,8 +200,8 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 1000000) - rope_scaling = getattr(config, "rope_scaling", None) + rope_theta = config.rope_parameters.get("rope_theta", 1000000) + rope_scaling = config.rope_parameters.get("rope_scaling") max_position_embeddings = getattr(config, "max_position_embeddings", 32768) head_dim = getattr(config, "head_dim", None) dual_chunk_attention_config = getattr( diff --git a/python/sglang/srt/models/qwen2_moe.py b/python/sglang/srt/models/qwen2_moe.py index 5e235cfa1f41..33d5f5c4e1b5 100644 --- a/python/sglang/srt/models/qwen2_moe.py +++ b/python/sglang/srt/models/qwen2_moe.py @@ -23,7 +23,6 @@ import torch import torch.nn.functional as F from torch import nn -from transformers import PretrainedConfig from sglang.srt.batch_overlap.two_batch_overlap import model_forward_maybe_tbo from sglang.srt.distributed import ( @@ -82,6 +81,7 @@ make_layers, use_intel_amx_backend, ) +from transformers import PretrainedConfig logger = logging.getLogger(__name__) @@ -439,8 +439,8 @@ def __init__( super().__init__() self.config = config self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = 
getattr(config, "rope_scaling", None) + rope_theta = config.rope_parameters.get("rope_theta", 10000) + rope_scaling = config.rope_parameters.get("rope_scaling") max_position_embeddings = getattr(config, "max_position_embeddings", 8192) qkv_bias = getattr(config, "qkv_bias", True) dual_chunk_attention_config = getattr( diff --git a/python/sglang/srt/models/qwen3.py b/python/sglang/srt/models/qwen3.py index b056317e4ef4..5d26be96bacc 100644 --- a/python/sglang/srt/models/qwen3.py +++ b/python/sglang/srt/models/qwen3.py @@ -213,8 +213,8 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 1000000) - rope_scaling = getattr(config, "rope_scaling", None) + rope_theta = config.rope_parameters.get("rope_theta", 1000000) + rope_scaling = config.rope_parameters.get("rope_scaling") max_position_embeddings = getattr(config, "max_position_embeddings", 32768) head_dim = getattr(config, "head_dim", None) self.self_attn = Qwen3Attention( diff --git a/python/sglang/srt/models/qwen3_moe.py b/python/sglang/srt/models/qwen3_moe.py index 3fcf0cfa0cbb..c007a9daeea1 100644 --- a/python/sglang/srt/models/qwen3_moe.py +++ b/python/sglang/srt/models/qwen3_moe.py @@ -23,7 +23,6 @@ import torch from torch import nn -from transformers import PretrainedConfig from sglang.srt.distributed import ( get_moe_expert_parallel_world_size, @@ -78,6 +77,7 @@ is_non_idle_and_non_empty, is_npu, ) +from transformers import PretrainedConfig _is_cuda = is_cuda() @@ -116,11 +116,11 @@ def compute_yarn_parameters( """ # The config does not contain rope_scaling, which means the model is not using yarn - rope_scaling = getattr(config, "rope_scaling", None) + rope_scaling = config.rope_parameters.get("rope_scaling") if rope_scaling is None: return 1.0, 0, 0, 1.0 - base = config.rope_theta + base = config.rope_parameters.get("rope_theta", 10000) partial_rotary_factor = ( config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") @@ -559,7 +559,7 @@ def forward_prepare_native( def apply_qk_norm_rope(self, qkv, positions, forward_batch): use_fused = self.use_fused_qk_norm_rope and qkv.dtype == torch.bfloat16 if use_fused: - theta = getattr(self.config, "rope_theta", 10000.0) + theta = self.config.rope_parameters.get("rope_theta", 10000.0) positions = ( positions.view(-1).to(dtype=torch.int32, device=qkv.device).contiguous() ) @@ -681,8 +681,8 @@ def __init__( super().__init__() self.config = config self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) + rope_theta = config.rope_parameters.get("rope_theta", 10000) + rope_scaling = config.rope_parameters.get("rope_scaling") max_position_embeddings = getattr(config, "max_position_embeddings", 8192) head_dim = getattr( config, "head_dim", config.hidden_size // config.num_attention_heads diff --git a/python/sglang/srt/models/qwen3_next.py b/python/sglang/srt/models/qwen3_next.py index a013dca531e4..8f565b5af2e0 100644 --- a/python/sglang/srt/models/qwen3_next.py +++ b/python/sglang/srt/models/qwen3_next.py @@ -615,7 +615,7 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = getattr(config, "rope_theta", 10000) + self.rope_theta = config.rope_parameters.get("rope_theta", 10000) self.max_position_embeddings = getattr(config, "max_position_embeddings", 8192) if "rope_parameters" in config: self.rope_scaling = 
getattr(config, "rope_parameters", None) diff --git a/python/sglang/srt/models/solar.py b/python/sglang/srt/models/solar.py index 8f85ad587ab0..26c9e854fd3b 100644 --- a/python/sglang/srt/models/solar.py +++ b/python/sglang/srt/models/solar.py @@ -28,7 +28,6 @@ import torch from torch import nn -from transformers import PretrainedConfig from sglang.srt.distributed import get_pp_group, get_tensor_model_parallel_world_size from sglang.srt.distributed.parallel_state import get_tensor_model_parallel_rank @@ -55,6 +54,7 @@ kv_cache_scales_loader, ) from sglang.srt.utils import add_prefix, make_layers +from transformers import PretrainedConfig class SolarMLP(nn.Module): @@ -194,8 +194,8 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) + rope_theta = config.rope_parameters.get("rope_theta", 10000) + rope_scaling = config.rope_parameters.get("rope_scaling") if rope_scaling is not None and getattr( config, "original_max_position_embeddings", None diff --git a/python/sglang/srt/models/stablelm.py b/python/sglang/srt/models/stablelm.py index 2adcfe92ffc5..4d46c16d0c89 100644 --- a/python/sglang/srt/models/stablelm.py +++ b/python/sglang/srt/models/stablelm.py @@ -23,7 +23,6 @@ import torch from torch import nn -from transformers import PretrainedConfig from sglang.srt.distributed import get_tensor_model_parallel_world_size from sglang.srt.layers.activation import SiluAndMul @@ -43,6 +42,7 @@ from sglang.srt.model_executor.forward_batch_info import ForwardBatch from sglang.srt.model_loader.weight_utils import default_weight_loader from sglang.srt.utils import add_prefix, is_npu +from transformers import PretrainedConfig _is_npu = is_npu() @@ -144,14 +144,14 @@ def __init__( self.head_dim, rotary_dim=self.rotary_ndims, max_position=self.config.max_position_embeddings, - base=self.config.rope_theta, + base=self.config.rope_parameters.get("rope_theta", 10000), ) else: self.rotary_emb = get_rope( self.head_dim, rotary_dim=self.rotary_ndims, max_position=self.config.max_position_embeddings, - base=self.config.rope_theta, + base=self.config.rope_parameters.get("rope_theta", 10000), dtype=torch.float32, ) self.attn = RadixAttention( diff --git a/python/sglang/srt/models/starcoder2.py b/python/sglang/srt/models/starcoder2.py index bbbcf8aebec4..a413686ea398 100644 --- a/python/sglang/srt/models/starcoder2.py +++ b/python/sglang/srt/models/starcoder2.py @@ -26,7 +26,6 @@ import torch from torch import nn -from transformers import Starcoder2Config from sglang.srt.distributed import get_pp_group, get_tensor_model_parallel_world_size from sglang.srt.layers.activation import get_act_fn @@ -47,6 +46,7 @@ from sglang.srt.model_executor.forward_batch_info import ForwardBatch from sglang.srt.model_loader.weight_utils import default_weight_loader from sglang.srt.utils import add_prefix, make_layers +from transformers import Starcoder2Config class Starcoder2Attention(nn.Module): @@ -80,7 +80,7 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = config.rope_theta + self.rope_theta = config.rope_parameters.get("rope_theta", 10000) self.max_position_embeddings = config.max_position_embeddings self.use_bias = config.use_bias diff --git a/python/sglang/srt/models/step3_vl.py b/python/sglang/srt/models/step3_vl.py index 5ac9528f94dd..bab5bc610927 100644 --- 
a/python/sglang/srt/models/step3_vl.py +++ b/python/sglang/srt/models/step3_vl.py @@ -7,8 +7,6 @@ from torch import nn from torch.nn import LayerNorm from torch.nn import functional as F -from transformers import PretrainedConfig -from transformers.activations import ACT2FN from sglang.srt.configs.step3_vl import ( Step3TextConfig, @@ -60,6 +58,8 @@ from sglang.srt.model_executor.forward_batch_info import ForwardBatch from sglang.srt.model_loader.weight_utils import default_weight_loader from sglang.srt.utils import add_prefix, log_info_on_rank0, make_layers +from transformers import PretrainedConfig +from transformers.activations import ACT2FN logger = logging.getLogger(__name__) @@ -289,8 +289,8 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) + rope_theta = config.rope_parameters.get("rope_theta", 10000) + rope_scaling = config.rope_parameters.get("rope_scaling") max_position_embeddings = getattr(config, "max_position_embeddings", 8192) head_dim = getattr( config, "head_dim", config.hidden_size // config.num_attention_heads diff --git a/python/sglang/srt/models/torch_native_llama.py b/python/sglang/srt/models/torch_native_llama.py index 14b327bd1a2c..bf93a65d4598 100644 --- a/python/sglang/srt/models/torch_native_llama.py +++ b/python/sglang/srt/models/torch_native_llama.py @@ -46,7 +46,6 @@ import torch from torch import nn from torch.nn.parameter import Parameter -from transformers import LlamaConfig from sglang.srt.distributed import ( get_tensor_model_parallel_rank, @@ -65,6 +64,7 @@ from sglang.srt.model_executor.forward_batch_info import ForwardBatch from sglang.srt.model_loader.weight_utils import default_weight_loader from sglang.srt.utils import add_prefix +from transformers import LlamaConfig tp_size: Optional[int] = None tp_rank: Optional[int] = None @@ -274,8 +274,8 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) + rope_theta = config.rope_parameters.get("rope_theta", 10000) + rope_scaling = config.rope_parameters.get("rope_scaling") if rope_scaling is not None and getattr( config, "original_max_position_embeddings", None ): diff --git a/python/sglang/srt/models/xverse.py b/python/sglang/srt/models/xverse.py index f84755b03635..410f223d5e48 100644 --- a/python/sglang/srt/models/xverse.py +++ b/python/sglang/srt/models/xverse.py @@ -20,7 +20,6 @@ import torch from torch import nn -from transformers import LlamaConfig from sglang.srt.distributed import get_tensor_model_parallel_world_size from sglang.srt.layers.activation import SiluAndMul @@ -41,6 +40,7 @@ from sglang.srt.model_executor.model_runner import ForwardBatch from sglang.srt.model_loader.weight_utils import default_weight_loader from sglang.srt.utils import add_prefix +from transformers import LlamaConfig class XverseMLP(nn.Module): @@ -181,8 +181,8 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) + rope_theta = config.rope_parameters.get("rope_theta", 10000) + rope_scaling = config.rope_parameters.get("rope_scaling") if rope_scaling is not None and getattr( config, "original_max_position_embeddings", None ): diff --git a/python/sglang/srt/models/xverse_moe.py 
b/python/sglang/srt/models/xverse_moe.py
index 6067acec6f76..f53b6ae6e7d9 100644
--- a/python/sglang/srt/models/xverse_moe.py
+++ b/python/sglang/srt/models/xverse_moe.py
@@ -17,7 +17,6 @@

 import torch
 from torch import nn
-from transformers import PretrainedConfig

 from sglang.srt.distributed import (
     get_tensor_model_parallel_rank,
@@ -46,6 +45,7 @@
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_loader.weight_utils import default_weight_loader
 from sglang.srt.utils import add_prefix
+from transformers import PretrainedConfig


 class XverseMLP(nn.Module):
@@ -287,8 +287,8 @@ def __init__(
     ) -> None:
         super().__init__()
         self.hidden_size = config.hidden_size
-        rope_theta = getattr(config, "rope_theta", 10000)
-        rope_scaling = getattr(config, "rope_scaling", None)
+        rope_theta = config.rope_parameters.get("rope_theta", 10000)
+        rope_scaling = config.rope_parameters.get("rope_scaling")
         max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
         num_key_value_heads = getattr(
             config, "num_key_value_heads", config.num_attention_heads
diff --git a/python/sglang/srt/multimodal/processors/base_processor.py b/python/sglang/srt/multimodal/processors/base_processor.py
index c21339ef8a9e..8dc3a1a2fc82 100644
--- a/python/sglang/srt/multimodal/processors/base_processor.py
+++ b/python/sglang/srt/multimodal/processors/base_processor.py
@@ -209,6 +209,8 @@ def __init__(
             "aspect_ratio_mask": Modality.IMAGE,
             "num_patches": Modality.IMAGE,
             "patch_pixel_values": Modality.IMAGE,
+            "pixel_attention_mask": Modality.IMAGE,
+            "spatial_shapes": Modality.IMAGE,
             "block_sizes": Modality.IMAGE,
             "grid_thws": Modality.IMAGE,  # for kimi k2.5
             # Audio-related attributes
diff --git a/python/sglang/srt/multimodal/processors/lfm2_vl.py b/python/sglang/srt/multimodal/processors/lfm2_vl.py
new file mode 100644
index 000000000000..ad02275be4ca
--- /dev/null
+++ b/python/sglang/srt/multimodal/processors/lfm2_vl.py
@@ -0,0 +1,45 @@
+from typing import Dict, List, Union
+
+from sglang.srt.managers.multimodal_processor import (
+    BaseMultimodalProcessor as SGLangBaseProcessor,
+)
+from sglang.srt.models.lfm2_vl import Lfm2VlForConditionalGeneration
+from sglang.srt.multimodal.processors.base_processor import MultimodalSpecialTokens
+
+
+class Lfm2VlImageProcessor(SGLangBaseProcessor):
+    models = [Lfm2VlForConditionalGeneration]
+
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        super().__init__(hf_config, server_args, _processor, *args, **kwargs)
+
+        self.IMAGE_TOKEN_ID = hf_config.image_token_id
+
+        self.mm_tokens = MultimodalSpecialTokens(
+            image_token="<image>",
+            image_token_id=hf_config.image_token_id,
+        ).build(_processor)
+
+    async def process_mm_data_async(
+        self,
+        image_data: List[Union[str, bytes, Dict]],
+        input_text,
+        request_obj,
+        *args,
+        **kwargs,
+    ):
+        base_output = self.load_mm_data(
+            prompt=input_text,
+            image_data=image_data,
+            multimodal_tokens=self.mm_tokens,
+        )
+
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(
+            base_output, self.mm_tokens
+        )
+
+        return {
+            "input_ids": input_ids.tolist(),
+            "mm_items": mm_items,
+            "im_token_id": self.mm_tokens.image_token_id,
+        }
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index ebf312c170ba..89d4007c351c 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -1635,7 +1635,7 @@ def _handle_model_specific_adjustments(self):
                 sm100_default_attention_backend="triton",
             )
-        elif model_arch in ["Lfm2ForCausalLM"]:
+
elif model_arch in ["Lfm2ForCausalLM", "Lfm2VlForConditionalGeneration"]: self._handle_mamba_radix_cache( model_arch=model_arch, support_mamba_cache=True, diff --git a/python/sglang/test/runners.py b/python/sglang/test/runners.py index 95190904d82a..8cd8e2f6c1d7 100644 --- a/python/sglang/test/runners.py +++ b/python/sglang/test/runners.py @@ -20,22 +20,22 @@ import torch import torch.nn.functional as F + import transformers +from sglang.srt.entrypoints.engine import Engine +from sglang.srt.model_loader.ci_weight_validation import ci_validate_and_clean_hf_cache +from sglang.srt.utils import get_device, is_npu, load_image +from sglang.srt.utils.hf_transformers_utils import get_tokenizer +from sglang.test.test_utils import DEFAULT_PORT_FOR_SRT_TEST_RUNNER, calculate_rouge_l from transformers import ( AutoConfig, AutoModel, AutoModelForCausalLM, - AutoModelForVision2Seq, + AutoModelForImageTextToText, AutoProcessor, GenerationConfig, ) -from sglang.srt.entrypoints.engine import Engine -from sglang.srt.model_loader.ci_weight_validation import ci_validate_and_clean_hf_cache -from sglang.srt.utils import get_device, is_npu, load_image -from sglang.srt.utils.hf_transformers_utils import get_tokenizer -from sglang.test.test_utils import DEFAULT_PORT_FOR_SRT_TEST_RUNNER, calculate_rouge_l - if is_npu(): from sglang.srt.hardware_backend.npu.utils import init_npu_backend @@ -274,7 +274,7 @@ def start_model_process( ).to(get_device()) elif self.model_type == "embedding": if "gme-qwen2-vl" in model_path.lower(): - self.model = AutoModelForVision2Seq.from_pretrained( + self.model = AutoModelForImageTextToText.from_pretrained( model_path, torch_dtype=torch_dtype, trust_remote_code=False, diff --git a/test/registered/core/test_score_api.py b/test/registered/core/test_score_api.py index 465d9d233c12..7cbb128c154b 100644 --- a/test/registered/core/test_score_api.py +++ b/test/registered/core/test_score_api.py @@ -1,11 +1,11 @@ import unittest import torch -from transformers import AutoModelForCausalLM, AutoTokenizer from sglang.srt.entrypoints.engine import Engine from sglang.test.ci.ci_register import register_cuda_ci from sglang.test.test_utils import DEFAULT_SMALL_MODEL_NAME_FOR_TEST, CustomTestCase +from transformers import AutoModelForCausalLM, AutoTokenizer register_cuda_ci(est_time=260, suite="stage-b-test-large-1-gpu") @@ -85,7 +85,7 @@ def _get_token_ids(self, tokens): try: label_token_ids = [] for token in tokens: - encoding = tokenizer.encode_plus(token, add_special_tokens=False) + encoding = tokenizer(token, add_special_tokens=False) token_ids = encoding["input_ids"] label_token_ids.append(token_ids[0]) return label_token_ids
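
Reviewer note: the change repeated across the model files above is mechanical — attribute-style access to config.rope_theta and config.rope_scaling is replaced by lookups into the config.rope_parameters dict. The sketch below only summarizes that pattern, assuming a transformers-style config object that exposes rope_parameters; the helper name read_rope_settings and the shared 10000.0 fallback are illustrative, not part of this diff (each call site keeps its own model-specific default, e.g. 1000000 for Qwen-family layers).

from typing import Any, Dict, Optional, Tuple


def read_rope_settings(
    config, default_theta: float = 10000.0
) -> Tuple[float, Optional[Dict[str, Any]]]:
    # rope_parameters bundles what used to be two separate config attributes:
    # "rope_theta" (the RoPE base frequency) and "rope_scaling" (an optional
    # dict describing linear/yarn/mrope scaling; missing when no scaling is set).
    rope_parameters: Dict[str, Any] = config.rope_parameters
    rope_theta = rope_parameters.get("rope_theta", default_theta)
    rope_scaling = rope_parameters.get("rope_scaling")
    return rope_theta, rope_scaling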
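
Similarly, the test_score_api.py change swaps the deprecated tokenizer.encode_plus call for calling the tokenizer object directly; both return a BatchEncoding whose "input_ids" entry holds the encoded ids. A minimal sketch of the updated pattern, with an arbitrary tokenizer name chosen only for illustration:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # illustrative tokenizer, not the one used by the test

label_token_ids = []
for token in [" yes", " no"]:
    encoding = tokenizer(token, add_special_tokens=False)  # replaces encode_plus
    label_token_ids.append(encoding["input_ids"][0])  # keep the first sub-token id per label
print(label_token_ids)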