Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 0 additions & 16 deletions python/sglang/multimodal_gen/runtime/loader/weight_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
from pathlib import Path

import filelock
import huggingface_hub.constants
import torch
from safetensors.torch import safe_open
from tqdm.auto import tqdm
Expand All @@ -35,21 +34,6 @@
temp_dir = tempfile.gettempdir()


def enable_hf_transfer() -> None:
"""automatically activates hf_transfer"""
if "HF_HUB_ENABLE_HF_TRANSFER" not in os.environ:
try:
# enable hf hub transfer if available
import hf_transfer # type: ignore # noqa

huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER = True
except ImportError:
pass


enable_hf_transfer()


class DisabledTqdm(tqdm):

def __init__(self, *args, **kwargs):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -226,8 +226,8 @@ def __init__(
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
rope_theta = config.rope_parameters.get("rope_theta", 10000)
rope_scaling = config.rope_parameters.get("rope_scaling")
if rope_scaling is not None and getattr(
config, "original_max_position_embeddings", None
):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -204,8 +204,8 @@ def __init__(
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 1000000.0)
rope_scaling = getattr(config, "rope_scaling", None)
rope_theta = config.rope_parameters.get("rope_theta", 1000000.0)
rope_scaling = config.rope_parameters.get("rope_scaling")
max_position_embeddings = getattr(config, "max_position_embeddings", 40960)
attention_bias = getattr(config, "attention_bias", False)

Expand Down
2 changes: 2 additions & 0 deletions python/sglang/srt/configs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from sglang.srt.configs.kimi_vl_moonvit import MoonViTConfig
from sglang.srt.configs.lfm2 import Lfm2Config
from sglang.srt.configs.lfm2_moe import Lfm2MoeConfig
from sglang.srt.configs.lfm2_vl import Lfm2VlConfig
from sglang.srt.configs.longcat_flash import LongcatFlashConfig
from sglang.srt.configs.nano_nemotron_vl import NemotronH_Nano_VL_V2_Config
from sglang.srt.configs.nemotron_h import NemotronHConfig
Expand Down Expand Up @@ -54,6 +55,7 @@
"FalconH1Config",
"Lfm2Config",
"Lfm2MoeConfig",
"Lfm2VlConfig",
"NemotronHConfig",
"NemotronH_Nano_VL_V2_Config",
"JetNemotronConfig",
Expand Down
103 changes: 103 additions & 0 deletions python/sglang/srt/configs/lfm2_vl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
# Copyright 2025 the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch LFM2-VL model."""

from transformers.configuration_utils import PreTrainedConfig
# TODO: replace this with the sglang logger?
import logging
from transformers import CONFIG_MAPPING, AutoConfig


logger = logging.getLogger(__name__)


class Lfm2VlConfig(PreTrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Lfm2VlForConditionalGeneration`]. It is used to instantiate an
    Lfm2Vl model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the Lfm2-VL-1.6B.

    e.g. [LiquidAI/LFM2-VL-1.6B](https://huggingface.co/LiquidAI/LFM2-VL-1.6B)

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        vision_config (`AutoConfig | dict`, *optional*, defaults to `Siglip2VisionConfig`):
            The config object or dictionary of the vision backbone.
        text_config (`AutoConfig | dict`, *optional*, defaults to `Lfm2Config`):
            The config object or dictionary of the text backbone.
        image_token_id (`int`, *optional*, defaults to 396):
            The image token index to encode the image prompt.
        projector_hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The activation function used by the multimodal projector.
        projector_hidden_size (`int`, *optional*, defaults to 2560):
            The hidden size of the multimodal projector.
        projector_bias (`bool`, *optional*, defaults to `True`):
            Whether to use bias in the multimodal projector.
        projector_use_layernorm (`bool`, *optional*, defaults to `True`):
            Whether to use layernorm in the multimodal projector.
        downsample_factor (`int`, *optional*, defaults to 2):
            The downsample factor of the vision backbone.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie the word embeddings of the text backbone.
    """

    model_type = "lfm2_vl"
    sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig}

    def __init__(
        self,
        vision_config=None,
        text_config=None,
        image_token_id=396,
        projector_hidden_act="gelu",
        projector_hidden_size=2560,
        projector_bias=True,
        projector_use_layernorm=True,
        downsample_factor=2,
        tie_word_embeddings=True,
        **kwargs,
    ):
        self.image_token_id = image_token_id
        self.projector_hidden_act = projector_hidden_act
        self.projector_hidden_size = projector_hidden_size
        self.projector_bias = projector_bias
        self.projector_use_layernorm = projector_use_layernorm
        self.downsample_factor = downsample_factor

        # Normalize sub-configs to config objects. Copy incoming dicts first so
        # we never mutate the caller's dictionary when filling in the default
        # model_type.
        if isinstance(vision_config, dict):
            vision_config = dict(vision_config)
            vision_config.setdefault("model_type", "siglip2_vision_model")
            vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
        elif vision_config is None:
            vision_config = CONFIG_MAPPING["siglip2_vision_model"]()

        if isinstance(text_config, dict):
            text_config = dict(text_config)
            text_config.setdefault("model_type", "lfm2")
            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
        elif text_config is None:
            text_config = CONFIG_MAPPING["lfm2"]()

        self.vision_config = vision_config
        self.text_config = text_config
        # NOTE(review): the text backbone's attribute is looked up as
        # "tie_embedding" (not "tie_word_embeddings"); it takes precedence over
        # the tie_word_embeddings argument when present — confirm against
        # Lfm2Config's field name.
        self.tie_word_embeddings = getattr(text_config, "tie_embedding", tie_word_embeddings)

        # Let PreTrainedConfig consume the remaining keyword arguments.
        super().__init__(**kwargs)

# Override HuggingFace's Lfm2VlConfig with our version.
# Cannot use .register() because lfm2_vl may already be registered by transformers,
# and re-registering an existing key raises. Instead we write directly into the
# mapping's internal _extra_content dict.
# NOTE(review): _extra_content is a private transformers attribute — this may
# break on a transformers upgrade; verify when bumping the pinned version.
CONFIG_MAPPING._extra_content["lfm2_vl"] = Lfm2VlConfig

__all__ = ["Lfm2VlConfig"]
4 changes: 3 additions & 1 deletion python/sglang/srt/configs/model_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -1009,7 +1009,8 @@ def _verify_transformers_version(self):
# The vision config model type for GLM-4.5v is 'glm4v_moe',
# while for GLM-4.6v, it is 'glm4v_moe_vision'.
)
needs_tf_v5 = is_glm_46vmoe
is_lfm2_vl = getattr(self.hf_config, "model_type", None) == "lfm2_vl"
needs_tf_v5 = is_glm_46vmoe or is_lfm2_vl

tf_version = version.parse(tf_version_str)
required_version = version.parse("5.0.0dev0")
Expand Down Expand Up @@ -1231,6 +1232,7 @@ def is_generation_model(model_architectures: List[str], is_embedding: bool = Fal
"Mistral3ForConditionalGeneration",
"MultiModalityCausalLM",
"MllamaForConditionalGeneration",
"Lfm2VlForConditionalGeneration",
"NemotronH_Nano_VL_V2",
"PixtralForConditionalGeneration",
"Qwen2AudioForConditionalGeneration",
Expand Down
3 changes: 3 additions & 0 deletions python/sglang/srt/model_executor/model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
KimiLinearConfig,
Lfm2Config,
Lfm2MoeConfig,
Lfm2VlConfig,
NemotronH_Nano_VL_V2_Config,
NemotronHConfig,
Qwen3_5Config,
Expand Down Expand Up @@ -1594,6 +1595,8 @@ def mamba2_config(self):
return config
if isinstance(config, NemotronH_Nano_VL_V2_Config):
return config.llm_config
if isinstance(config, Lfm2VlConfig):
return config.text_config
return None

@property
Expand Down
15 changes: 0 additions & 15 deletions python/sglang/srt/model_loader/weight_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,21 +67,6 @@
logger = logging.getLogger(__name__)


def enable_hf_transfer():
"""automatically activates hf_transfer"""
if "HF_HUB_ENABLE_HF_TRANSFER" not in os.environ:
try:
# enable hf hub transfer if available
import hf_transfer # type: ignore # noqa

huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER = True
except ImportError:
pass


enable_hf_transfer()


# use system-level temp directory for file locks, so that multiple users
# can share the same lock without error.
# lock files in the temp directory will be automatically deleted when the
Expand Down
6 changes: 3 additions & 3 deletions python/sglang/srt/models/afmoe.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@
import torch
import torch.nn.functional as F
from torch import nn
from transformers import PretrainedConfig

from sglang.srt.distributed import (
get_tensor_model_parallel_rank,
Expand Down Expand Up @@ -59,6 +58,7 @@
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
from sglang.srt.model_loader.weight_utils import default_weight_loader
from sglang.srt.utils import add_prefix
from transformers import PretrainedConfig


def get_attention_sliding_window_size(config: PretrainedConfig) -> Optional[int]:
Expand Down Expand Up @@ -314,8 +314,8 @@ def __init__(
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5

rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
rope_theta = config.rope_parameters.get("rope_theta", 10000)
rope_scaling = config.rope_parameters.get("rope_scaling")
partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0)
self.rotary_dim = int(self.head_dim * partial_rotary_factor)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
Expand Down
6 changes: 3 additions & 3 deletions python/sglang/srt/models/apertus.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@

import torch
from torch import nn
from transformers import ApertusConfig

from sglang.srt.distributed import (
get_pp_group,
Expand Down Expand Up @@ -54,6 +53,7 @@
)
from sglang.srt.server_args import get_global_server_args
from sglang.srt.utils import add_prefix, make_layers
from transformers import ApertusConfig

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -217,8 +217,8 @@ def __init__(
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
rope_theta = config.rope_parameters.get("rope_theta", 10000)
rope_scaling = config.rope_parameters.get("rope_scaling")
if rope_scaling is not None and getattr(
config, "original_max_position_embeddings", None
):
Expand Down
6 changes: 3 additions & 3 deletions python/sglang/srt/models/arcee.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@

import torch
from torch import nn
from transformers import LlamaConfig

from sglang.srt.distributed import (
get_pp_group,
Expand Down Expand Up @@ -50,6 +49,7 @@
)
from sglang.srt.server_args import get_global_server_args
from sglang.srt.utils import add_prefix, make_layers
from transformers import LlamaConfig

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -199,8 +199,8 @@ def __init__(
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
rope_theta = config.rope_parameters.get("rope_theta", 10000)
rope_scaling = config.rope_parameters.get("rope_scaling")
if rope_scaling is not None and getattr(
config, "original_max_position_embeddings", None
):
Expand Down
4 changes: 2 additions & 2 deletions python/sglang/srt/models/baichuan.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@

import torch
from torch import nn
from transformers import PretrainedConfig

from sglang.srt.distributed import (
get_tensor_model_parallel_rank,
Expand All @@ -47,6 +46,7 @@
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
from sglang.srt.model_loader.weight_utils import default_weight_loader
from sglang.srt.utils import add_prefix, is_npu
from transformers import PretrainedConfig

_is_npu = is_npu()

Expand Down Expand Up @@ -228,7 +228,7 @@ def __init__(
):
super().__init__()
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_theta = config.rope_parameters.get("rope_theta", 10000)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
self.self_attn = BaiChuanAttention(
hidden_size=self.hidden_size,
Expand Down
6 changes: 3 additions & 3 deletions python/sglang/srt/models/bailing_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
import torch
import torch.nn.functional as F
from torch import nn
from transformers import PretrainedConfig

from sglang.srt.distributed import (
get_pp_group,
Expand Down Expand Up @@ -82,6 +81,7 @@
)
from sglang.srt.server_args import get_global_server_args
from sglang.srt.utils import add_prefix, is_cuda, is_non_idle_and_non_empty, make_layers
from transformers import PretrainedConfig

LoraConfig = None
logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -497,8 +497,8 @@ def __init__(
self.head_dim,
rotary_dim=self.rotary_dim,
max_position=config.max_position_embeddings,
base=config.rope_theta,
rope_scaling=config.rope_scaling,
base=config.rope_parameters.get("rope_theta", 10000),
rope_scaling=config.rope_parameters.get("rope_scaling"),
)

self.attn = RadixAttention(
Expand Down
6 changes: 3 additions & 3 deletions python/sglang/srt/models/commandr.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@
import torch.utils.checkpoint
from torch import nn
from torch.nn.parameter import Parameter
from transformers import Cohere2Config, CohereConfig, PretrainedConfig

from sglang.srt.distributed import (
get_tensor_model_parallel_rank,
Expand All @@ -66,6 +65,7 @@
maybe_remap_kv_scale_name,
)
from sglang.srt.utils import add_prefix, get_compiler_backend, set_weight_attrs
from transformers import Cohere2Config, CohereConfig, PretrainedConfig


@torch.compile(backend=get_compiler_backend())
Expand Down Expand Up @@ -171,8 +171,8 @@ def __init__(
self.max_position_embeddings = getattr(
config, "model_max_length", None
) or getattr(config, "max_position_embeddings", 8192)
self.rope_theta = config.rope_theta
self.rope_scaling = getattr(config, "rope_scaling", None)
self.rope_theta = config.rope_parameters.get("rope_theta", 10000)
self.rope_scaling = config.rope_parameters.get("rope_scaling")
self.use_qk_norm = getattr(config, "use_qk_norm", False)
self.qkv_proj = QKVParallelLinear(
self.hidden_size,
Expand Down
2 changes: 1 addition & 1 deletion python/sglang/srt/models/dbrx.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ def __init__(
self.head_dim = self.d_model // self.total_num_heads
self.total_num_kv_heads = config.attn_config.kv_n_heads
self.clip_qkv = config.attn_config.clip_qkv
self.rope_theta = config.attn_config.rope_theta
self.rope_theta = config.attn_config.rope_parameters.get("rope_theta", 10000)
self.max_position = config.max_seq_len

# pylint: disable=invalid-name
Expand Down
Loading