Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 0 additions & 16 deletions python/sglang/multimodal_gen/runtime/loader/weight_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
from pathlib import Path

import filelock
import huggingface_hub.constants
import torch
from safetensors.torch import safe_open
from tqdm.auto import tqdm
Expand All @@ -35,21 +34,6 @@
temp_dir = tempfile.gettempdir()


def enable_hf_transfer() -> None:
"""automatically activates hf_transfer"""
if "HF_HUB_ENABLE_HF_TRANSFER" not in os.environ:
try:
# enable hf hub transfer if available
import hf_transfer # type: ignore # noqa

huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER = True
except ImportError:
pass


enable_hf_transfer()


class DisabledTqdm(tqdm):

def __init__(self, *args, **kwargs):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -226,8 +226,8 @@ def __init__(
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
rope_theta = config.rope_parameters.get("rope_theta", 10000)
rope_scaling = config.rope_parameters.get("rope_scaling")
if rope_scaling is not None and getattr(
config, "original_max_position_embeddings", None
):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -204,8 +204,8 @@ def __init__(
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 1000000.0)
rope_scaling = getattr(config, "rope_scaling", None)
rope_theta = config.rope_parameters.get("rope_theta", 1000000.0)
rope_scaling = config.rope_parameters.get("rope_scaling")
max_position_embeddings = getattr(config, "max_position_embeddings", 40960)
attention_bias = getattr(config, "attention_bias", False)

Expand Down
2 changes: 2 additions & 0 deletions python/sglang/srt/configs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from sglang.srt.configs.kimi_vl_moonvit import MoonViTConfig
from sglang.srt.configs.lfm2 import Lfm2Config
from sglang.srt.configs.lfm2_moe import Lfm2MoeConfig
from sglang.srt.configs.lfm2_vl import Lfm2VlConfig
from sglang.srt.configs.longcat_flash import LongcatFlashConfig
from sglang.srt.configs.nano_nemotron_vl import NemotronH_Nano_VL_V2_Config
from sglang.srt.configs.nemotron_h import NemotronHConfig
Expand Down Expand Up @@ -54,6 +55,7 @@
"FalconH1Config",
"Lfm2Config",
"Lfm2MoeConfig",
"Lfm2VlConfig",
"NemotronHConfig",
"NemotronH_Nano_VL_V2_Config",
"JetNemotronConfig",
Expand Down
103 changes: 103 additions & 0 deletions python/sglang/srt/configs/lfm2_vl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
# Copyright 2025 the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch LFM2-VL model."""

from transformers.configuration_utils import PreTrainedConfig
# TODO: replace this with the sglang logger?
import logging
from transformers import CONFIG_MAPPING, AutoConfig


logger = logging.getLogger(__name__)


class Lfm2VlConfig(PreTrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Lfm2VlForConditionalGeneration`]. It is used to instantiate an
    Lfm2Vl model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the Lfm2-VL-1.6B.

    e.g. [LiquidAI/LFM2-VL-1.6B](https://huggingface.co/LiquidAI/LFM2-VL-1.6B)

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        vision_config (`AutoConfig | dict`, *optional*, defaults to `Siglip2VisionConfig`):
            The config object or dictionary of the vision backbone.
        text_config (`AutoConfig | dict`, *optional*, defaults to `Lfm2Config`):
            The config object or dictionary of the text backbone.
        image_token_id (`int`, *optional*, defaults to 396):
            The image token index to encode the image prompt.
        projector_hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The activation function used by the multimodal projector.
        projector_hidden_size (`int`, *optional*, defaults to 2560):
            The hidden size of the multimodal projector.
        projector_bias (`bool`, *optional*, defaults to `True`):
            Whether to use bias in the multimodal projector.
        projector_use_layernorm (`bool`, *optional*, defaults to `True`):
            Whether to use layernorm in the multimodal projector.
        downsample_factor (`int`, *optional*, defaults to 2):
            The downsample factor of the vision backbone.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie the word embeddings of the text backbone.
    """

    model_type = "lfm2_vl"
    sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig}

    def __init__(
        self,
        vision_config=None,
        text_config=None,
        image_token_id=396,
        projector_hidden_act="gelu",
        projector_hidden_size=2560,
        projector_bias=True,
        projector_use_layernorm=True,
        downsample_factor=2,
        tie_word_embeddings=True,
        **kwargs,
    ):
        self.image_token_id = image_token_id
        self.projector_hidden_act = projector_hidden_act
        self.projector_hidden_size = projector_hidden_size
        self.projector_bias = projector_bias
        self.projector_use_layernorm = projector_use_layernorm
        self.downsample_factor = downsample_factor

        # Normalize sub-configs to config objects. Copy incoming dicts first so
        # we never mutate the caller's dictionary when filling in the default
        # model_type.
        if isinstance(vision_config, dict):
            vision_config = dict(vision_config)
            vision_config.setdefault("model_type", "siglip2_vision_model")
            vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
        elif vision_config is None:
            vision_config = CONFIG_MAPPING["siglip2_vision_model"]()

        if isinstance(text_config, dict):
            text_config = dict(text_config)
            text_config.setdefault("model_type", "lfm2")
            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
        elif text_config is None:
            text_config = CONFIG_MAPPING["lfm2"]()

        self.vision_config = vision_config
        self.text_config = text_config
        # NOTE(review): the text backbone's attribute is looked up as
        # "tie_embedding" (not "tie_word_embeddings"); it takes precedence over
        # the tie_word_embeddings argument when present — confirm against
        # Lfm2Config's field name.
        self.tie_word_embeddings = getattr(text_config, "tie_embedding", tie_word_embeddings)

        # Let PreTrainedConfig consume the remaining keyword arguments.
        super().__init__(**kwargs)

# Override HuggingFace's Lfm2VlConfig with our version.
# Cannot use .register() because lfm2_vl may already be registered by transformers,
# and re-registering an existing key raises. Instead we write directly into the
# mapping's internal _extra_content dict.
# NOTE(review): _extra_content is a private transformers attribute — this may
# break on a transformers upgrade; verify when bumping the pinned version.
CONFIG_MAPPING._extra_content["lfm2_vl"] = Lfm2VlConfig

__all__ = ["Lfm2VlConfig"]
4 changes: 3 additions & 1 deletion python/sglang/srt/configs/model_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -1009,7 +1009,8 @@ def _verify_transformers_version(self):
# The vision config model type for GLM-4.5v is 'glm4v_moe',
# while for GLM-4.6v, it is 'glm4v_moe_vision'.
)
needs_tf_v5 = is_glm_46vmoe
is_lfm2_vl = getattr(self.hf_config, "model_type", None) == "lfm2_vl"
needs_tf_v5 = is_glm_46vmoe or is_lfm2_vl

tf_version = version.parse(tf_version_str)
required_version = version.parse("5.0.0dev0")
Expand Down Expand Up @@ -1231,6 +1232,7 @@ def is_generation_model(model_architectures: List[str], is_embedding: bool = Fal
"Mistral3ForConditionalGeneration",
"MultiModalityCausalLM",
"MllamaForConditionalGeneration",
"Lfm2VlForConditionalGeneration",
"NemotronH_Nano_VL_V2",
"PixtralForConditionalGeneration",
"Qwen2AudioForConditionalGeneration",
Expand Down
3 changes: 3 additions & 0 deletions python/sglang/srt/model_executor/model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
KimiLinearConfig,
Lfm2Config,
Lfm2MoeConfig,
Lfm2VlConfig,
NemotronH_Nano_VL_V2_Config,
NemotronHConfig,
Qwen3_5Config,
Expand Down Expand Up @@ -1594,6 +1595,8 @@ def mamba2_config(self):
return config
if isinstance(config, NemotronH_Nano_VL_V2_Config):
return config.llm_config
if isinstance(config, Lfm2VlConfig):
return config.text_config
return None

@property
Expand Down
15 changes: 0 additions & 15 deletions python/sglang/srt/model_loader/weight_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,21 +67,6 @@
logger = logging.getLogger(__name__)


def enable_hf_transfer():
"""automatically activates hf_transfer"""
if "HF_HUB_ENABLE_HF_TRANSFER" not in os.environ:
try:
# enable hf hub transfer if available
import hf_transfer # type: ignore # noqa

huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER = True
except ImportError:
pass


enable_hf_transfer()


# use system-level temp directory for file locks, so that multiple users
# can share the same lock without error.
# lock files in the temp directory will be automatically deleted when the
Expand Down
6 changes: 3 additions & 3 deletions python/sglang/srt/models/afmoe.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@
import torch
import torch.nn.functional as F
from torch import nn
from transformers import PretrainedConfig

from sglang.srt.distributed import (
get_tensor_model_parallel_rank,
Expand Down Expand Up @@ -59,6 +58,7 @@
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
from sglang.srt.model_loader.weight_utils import default_weight_loader
from sglang.srt.utils import add_prefix
from transformers import PretrainedConfig


def get_attention_sliding_window_size(config: PretrainedConfig) -> Optional[int]:
Expand Down Expand Up @@ -314,8 +314,8 @@ def __init__(
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5

rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
rope_theta = config.rope_parameters.get("rope_theta", 10000)
rope_scaling = config.rope_parameters.get("rope_scaling")
partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0)
self.rotary_dim = int(self.head_dim * partial_rotary_factor)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
Expand Down
6 changes: 3 additions & 3 deletions python/sglang/srt/models/apertus.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@

import torch
from torch import nn
from transformers import ApertusConfig

from sglang.srt.distributed import (
get_pp_group,
Expand Down Expand Up @@ -54,6 +53,7 @@
)
from sglang.srt.server_args import get_global_server_args
from sglang.srt.utils import add_prefix, make_layers
from transformers import ApertusConfig

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -217,8 +217,8 @@ def __init__(
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
rope_theta = config.rope_parameters.get("rope_theta", 10000)
rope_scaling = config.rope_parameters.get("rope_scaling")
if rope_scaling is not None and getattr(
config, "original_max_position_embeddings", None
):
Expand Down
6 changes: 3 additions & 3 deletions python/sglang/srt/models/arcee.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@

import torch
from torch import nn
from transformers import LlamaConfig

from sglang.srt.distributed import (
get_pp_group,
Expand Down Expand Up @@ -50,6 +49,7 @@
)
from sglang.srt.server_args import get_global_server_args
from sglang.srt.utils import add_prefix, make_layers
from transformers import LlamaConfig

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -199,8 +199,8 @@ def __init__(
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
rope_theta = config.rope_parameters.get("rope_theta", 10000)
rope_scaling = config.rope_parameters.get("rope_scaling")
if rope_scaling is not None and getattr(
config, "original_max_position_embeddings", None
):
Expand Down
4 changes: 2 additions & 2 deletions python/sglang/srt/models/baichuan.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@

import torch
from torch import nn
from transformers import PretrainedConfig

from sglang.srt.distributed import (
get_tensor_model_parallel_rank,
Expand All @@ -47,6 +46,7 @@
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
from sglang.srt.model_loader.weight_utils import default_weight_loader
from sglang.srt.utils import add_prefix, is_npu
from transformers import PretrainedConfig

_is_npu = is_npu()

Expand Down Expand Up @@ -228,7 +228,7 @@ def __init__(
):
super().__init__()
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_theta = config.rope_parameters.get("rope_theta", 10000)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
self.self_attn = BaiChuanAttention(
hidden_size=self.hidden_size,
Expand Down
6 changes: 3 additions & 3 deletions python/sglang/srt/models/bailing_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
import torch
import torch.nn.functional as F
from torch import nn
from transformers import PretrainedConfig

from sglang.srt.distributed import (
get_pp_group,
Expand Down Expand Up @@ -82,6 +81,7 @@
)
from sglang.srt.server_args import get_global_server_args
from sglang.srt.utils import add_prefix, is_cuda, is_non_idle_and_non_empty, make_layers
from transformers import PretrainedConfig

LoraConfig = None
logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -497,8 +497,8 @@ def __init__(
self.head_dim,
rotary_dim=self.rotary_dim,
max_position=config.max_position_embeddings,
base=config.rope_theta,
rope_scaling=config.rope_scaling,
base=config.rope_parameters.get("rope_theta", 10000),
rope_scaling=config.rope_parameters.get("rope_scaling"),
)

self.attn = RadixAttention(
Expand Down
6 changes: 3 additions & 3 deletions python/sglang/srt/models/commandr.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@
import torch.utils.checkpoint
from torch import nn
from torch.nn.parameter import Parameter
from transformers import Cohere2Config, CohereConfig, PretrainedConfig

from sglang.srt.distributed import (
get_tensor_model_parallel_rank,
Expand All @@ -66,6 +65,7 @@
maybe_remap_kv_scale_name,
)
from sglang.srt.utils import add_prefix, get_compiler_backend, set_weight_attrs
from transformers import Cohere2Config, CohereConfig, PretrainedConfig


@torch.compile(backend=get_compiler_backend())
Expand Down Expand Up @@ -171,8 +171,8 @@ def __init__(
self.max_position_embeddings = getattr(
config, "model_max_length", None
) or getattr(config, "max_position_embeddings", 8192)
self.rope_theta = config.rope_theta
self.rope_scaling = getattr(config, "rope_scaling", None)
self.rope_theta = config.rope_parameters.get("rope_theta", 10000)
self.rope_scaling = config.rope_parameters.get("rope_scaling")
self.use_qk_norm = getattr(config, "use_qk_norm", False)
self.qkv_proj = QKVParallelLinear(
self.hidden_size,
Expand Down
2 changes: 1 addition & 1 deletion python/sglang/srt/models/dbrx.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ def __init__(
self.head_dim = self.d_model // self.total_num_heads
self.total_num_kv_heads = config.attn_config.kv_n_heads
self.clip_qkv = config.attn_config.clip_qkv
self.rope_theta = config.attn_config.rope_theta
self.rope_theta = config.attn_config.rope_parameters.get("rope_theta", 10000)
self.max_position = config.max_seq_len

# pylint: disable=invalid-name
Expand Down
Loading