7 changes: 5 additions & 2 deletions python/sglang/srt/compilation/piecewise_context_manager.py
@@ -1,8 +1,11 @@
 from __future__ import annotations
 
 from contextlib import contextmanager
 from dataclasses import dataclass
-from typing import Any, List, Optional
+from typing import TYPE_CHECKING, Any, List, Optional
 
-from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+if TYPE_CHECKING:
+    from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+
+
 @dataclass
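This hunk shows the pattern the PR applies across the codebase: imports needed only for type annotations move under `typing.TYPE_CHECKING`, so static checkers still see them, but they are never executed at runtime and stop pulling heavy modules into the import graph. A minimal self-contained sketch of the idea (illustrative only, not code from this PR):

```python
from __future__ import annotations  # all annotations become lazy strings

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Seen by mypy/pyright, skipped at runtime: the heavy module is not
    # imported when this file loads, which also breaks import cycles.
    from sglang.srt.model_executor.forward_batch_info import ForwardBatch


def describe(batch: ForwardBatch) -> str:
    # Safe even though ForwardBatch is not bound at runtime here: with
    # `from __future__ import annotations`, annotations are never evaluated.
    return type(batch).__name__
```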
44 changes: 24 additions & 20 deletions python/sglang/srt/configs/__init__.py
@@ -1,23 +1,27 @@
-from sglang.srt.configs.chatglm import ChatGLMConfig
-from sglang.srt.configs.dbrx import DbrxConfig
-from sglang.srt.configs.deepseekvl2 import DeepseekVL2Config
-from sglang.srt.configs.dots_ocr import DotsOCRConfig
-from sglang.srt.configs.dots_vlm import DotsVLMConfig
-from sglang.srt.configs.exaone import ExaoneConfig
-from sglang.srt.configs.falcon_h1 import FalconH1Config
-from sglang.srt.configs.janus_pro import MultiModalityConfig
-from sglang.srt.configs.jet_nemotron import JetNemotronConfig
-from sglang.srt.configs.kimi_linear import KimiLinearConfig
-from sglang.srt.configs.kimi_vl import KimiVLConfig
-from sglang.srt.configs.kimi_vl_moonvit import MoonViTConfig
-from sglang.srt.configs.longcat_flash import LongcatFlashConfig
-from sglang.srt.configs.nemotron_h import NemotronHConfig
-from sglang.srt.configs.olmo3 import Olmo3Config
-from sglang.srt.configs.qwen3_next import Qwen3NextConfig
-from sglang.srt.configs.step3_vl import (
-    Step3TextConfig,
-    Step3VisionEncoderConfig,
-    Step3VLConfig,
+from sglang.utils import LazyImport
+
+ChatGLMConfig = LazyImport("sglang.srt.configs.chatglm", "ChatGLMConfig")
+DbrxConfig = LazyImport("sglang.srt.configs.dbrx", "DbrxConfig")
+DeepseekVL2Config = LazyImport("sglang.srt.configs.deepseekvl2", "DeepseekVL2Config")
+DotsOCRConfig = LazyImport("sglang.srt.configs.dots_ocr", "DotsOCRConfig")
+DotsVLMConfig = LazyImport("sglang.srt.configs.dots_vlm", "DotsVLMConfig")
+ExaoneConfig = LazyImport("sglang.srt.configs.exaone", "ExaoneConfig")
+FalconH1Config = LazyImport("sglang.srt.configs.falcon_h1", "FalconH1Config")
+MultiModalityConfig = LazyImport("sglang.srt.configs.janus_pro", "MultiModalityConfig")
+JetNemotronConfig = LazyImport("sglang.srt.configs.jet_nemotron", "JetNemotronConfig")
+KimiLinearConfig = LazyImport("sglang.srt.configs.kimi_linear", "KimiLinearConfig")
+KimiVLConfig = LazyImport("sglang.srt.configs.kimi_vl", "KimiVLConfig")
+MoonViTConfig = LazyImport("sglang.srt.configs.kimi_vl_moonvit", "MoonViTConfig")
+LongcatFlashConfig = LazyImport(
+    "sglang.srt.configs.longcat_flash", "LongcatFlashConfig"
+)
+NemotronHConfig = LazyImport("sglang.srt.configs.nemotron_h", "NemotronHConfig")
+Olmo3Config = LazyImport("sglang.srt.configs.olmo3", "Olmo3Config")
+Qwen3NextConfig = LazyImport("sglang.srt.configs.qwen3_next", "Qwen3NextConfig")
+Step3VLConfig = LazyImport("sglang.srt.configs.step3_vl", "Step3VLConfig")
+Step3TextConfig = LazyImport("sglang.srt.configs.step3_vl", "Step3TextConfig")
+Step3VisionEncoderConfig = LazyImport(
+    "sglang.srt.configs.step3_vl", "Step3VisionEncoderConfig"
 )
 
 __all__ = [
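The `LazyImport` helper used above wraps a `(module, name)` pair and only performs the real import when the symbol is first touched. A rough sketch of how such a proxy can be built (an assumption about the shape of `sglang.utils.LazyImport`, not its actual source):

```python
import importlib
from typing import Any


class LazyImport:
    """Proxy for `from module_name import class_name`, resolved on first use."""

    def __init__(self, module_name: str, class_name: str):
        self.module_name = module_name
        self.class_name = class_name
        self._resolved: Any = None

    def _load(self) -> Any:
        # importlib caches modules in sys.modules, so the import cost is
        # paid exactly once.
        if self._resolved is None:
            module = importlib.import_module(self.module_name)
            self._resolved = getattr(module, self.class_name)
        return self._resolved

    def __getattr__(self, name: str) -> Any:
        # Called only for attributes not found on the proxy itself.
        return getattr(self._load(), name)

    def __call__(self, *args: Any, **kwargs: Any) -> Any:
        # Lets the proxy be instantiated as if it were the real class.
        return self._load()(*args, **kwargs)


# Usage mirroring the hunk above: nothing is imported until first use.
ChatGLMConfig = LazyImport("sglang.srt.configs.chatglm", "ChatGLMConfig")
```

The win is that importing `sglang.srt.configs` no longer eagerly drags in every model's config module; the cost moves to the first call site that actually needs a given class.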
11 changes: 8 additions & 3 deletions python/sglang/srt/configs/model_config.py
@@ -12,18 +12,18 @@
 # limitations under the License.
 # ==============================================================================
 
+from __future__ import annotations
+
 import json
 import logging
 import math
 import os
 from enum import Enum, IntEnum, auto
-from typing import Any, List, Optional, Set, Union
+from typing import TYPE_CHECKING, Any, List, Optional, Set, Union
 
 import torch
-from transformers import PretrainedConfig
 
 from sglang.srt.environ import envs
-from sglang.srt.layers.quantization import QUANTIZATION_METHODS
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import is_hip, retry
 from sglang.srt.utils.hf_transformers_utils import (
@@ -37,6 +37,9 @@
 
 logger = logging.getLogger(__name__)
 
+if TYPE_CHECKING:
+    from transformers import PretrainedConfig
+
 
 class AttentionArch(IntEnum):
     MLA = auto()
@@ -631,6 +634,8 @@ def _validate_quantize_and_serve_config(self):
 
     # adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/config.py
     def _verify_quantization(self) -> None:
+        from sglang.srt.layers.quantization import QUANTIZATION_METHODS
+
         supported_quantization = [*QUANTIZATION_METHODS]
         rocm_supported_quantization = [
             "awq",
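Note the counterpart pattern here: `QUANTIZATION_METHODS` is used at runtime, not just in annotations, so it cannot live under `TYPE_CHECKING`; instead the import is deferred into the method body. The first call pays the import cost, and every later call resolves from the `sys.modules` cache. Condensed from the hunk above (surrounding class details elided):

```python
class ModelConfig:
    def _verify_quantization(self) -> None:
        # Deferred runtime import: sglang.srt.layers.quantization is loaded on
        # the first call rather than when model_config.py is imported, keeping
        # it out of the module's import-time dependency graph.
        from sglang.srt.layers.quantization import QUANTIZATION_METHODS

        supported_quantization = [*QUANTIZATION_METHODS]
        # ... validation continues as in the original method ...
```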
9 changes: 6 additions & 3 deletions python/sglang/srt/disaggregation/decode.py
@@ -46,8 +46,7 @@
     poll_and_all_reduce,
     prepare_abort,
 )
-from sglang.srt.layers.dp_attention import get_attention_tp_size
-from sglang.srt.managers.schedule_batch import FINISH_ABORT, RequestStage, ScheduleBatch
+from sglang.srt.managers.request_types import FINISH_ABORT, RequestStage
 from sglang.srt.managers.utils import GenerationBatchResult
 from sglang.srt.mem_cache.allocator import BaseTokenToKVPoolAllocator
 from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache
@@ -67,7 +66,7 @@
 logger = logging.getLogger(__name__)
 
 if TYPE_CHECKING:
-    from sglang.srt.managers.schedule_batch import Req
+    from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
     from sglang.srt.managers.scheduler import Scheduler
 
 CLIP_MAX_NEW_TOKEN = get_int_env_var("SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION", 4096)
@@ -225,6 +224,8 @@ def __init__(
         self.kv_manager = self._init_kv_manager()
 
     def _init_kv_manager(self) -> BaseKVManager:
+        from sglang.srt.layers.dp_attention import get_attention_tp_size
+
         kv_args_class = get_kv_class(self.transfer_backend, KVClassType.KVARGS)
         kv_args = kv_args_class()
 
@@ -884,6 +885,8 @@ def get_next_disagg_decode_batch_to_run(
 
 def get_new_prebuilt_batch(self: Scheduler) -> Optional[ScheduleBatch]:
     """Create a schedulebatch for fake completed prefill"""
+    from sglang.srt.managers.schedule_batch import ScheduleBatch
Collaborator: You shouldn't do this... You imported ScheduleBatch twice here.

Author: iiuc, this is because we import it at the top for type checking, and then here we use it at runtime (ScheduleBatch.init_new is the only use in this file that isn't purely a type annotation), so we need to import it for use in this function. But I can leave it as a top-level import and remove it from the TYPE_CHECKING block if you think that's better.
+
     if self.grammar_queue:
         self.move_ready_grammar_requests()
 
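Taken together, the two imports the author describes look like this in schematic form (only the relevant lines, not the full file):

```python
from __future__ import annotations

from typing import TYPE_CHECKING, Optional

if TYPE_CHECKING:
    # Annotation-only import: costs nothing at runtime.
    from sglang.srt.managers.schedule_batch import ScheduleBatch


def get_new_prebuilt_batch(self) -> Optional[ScheduleBatch]:
    # Runtime import: this function actually calls into ScheduleBatch
    # (e.g. ScheduleBatch.init_new), so the real class is needed here.
    from sglang.srt.managers.schedule_batch import ScheduleBatch
    ...
```

Python caches the module after the first call, so the function-local import is cheap on the hot path; the open question in the thread is about readability, not cost.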
9 changes: 2 additions & 7 deletions python/sglang/srt/disaggregation/prefill.py
@@ -42,12 +42,7 @@
     poll_and_all_reduce,
     prepare_abort,
 )
-from sglang.srt.managers.schedule_batch import (
-    FINISH_LENGTH,
-    Req,
-    RequestStage,
-    ScheduleBatch,
-)
+from sglang.srt.managers.request_types import FINISH_LENGTH, RequestStage
 from sglang.srt.mem_cache.common import release_kv_cache
 from sglang.srt.mem_cache.memory_pool import (
     HybridLinearKVPool,
@@ -60,6 +55,7 @@
 if TYPE_CHECKING:
     from torch.distributed import ProcessGroup
 
+    from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
     from sglang.srt.managers.scheduler import GenerationBatchResult, Scheduler
     from sglang.srt.mem_cache.memory_pool import KVCache
 
@@ -232,7 +228,6 @@ def pop_bootstrapped(
         return_failed_reqs: For PP, on rank 0, also return the failed reqs to notify the next rank
         rids_to_check: For PP, on rank > 0, check the rids from the previous rank has consensus with the current rank.
         """
-
         bootstrapped_reqs = []
         failed_reqs = []
         indices_to_remove = set()
3 changes: 1 addition & 2 deletions python/sglang/srt/disaggregation/utils.py
@@ -11,6 +11,7 @@
 import torch
 import torch.distributed as dist
 
+from sglang.srt.managers.request_types import FINISH_ABORT
 from sglang.srt.utils import is_npu
 
 if TYPE_CHECKING:
@@ -346,8 +347,6 @@ def is_mla_backend(target_kv_pool) -> bool:
 
 
 def prepare_abort(req: Req, error_message: str, status_code=None):
-    from sglang.srt.managers.schedule_batch import FINISH_ABORT
-
     # populate finish metadata and stream output
     req.finished_reason = FINISH_ABORT(error_message, status_code)
 
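The `request_types` module introduced by this PR is the other half of the cycle-breaking strategy: finish reasons and request stages move into a leaf module with no heavyweight dependencies, so files like this one can import `FINISH_ABORT` at the top level instead of inside `prepare_abort`. A hypothetical condensed layout (the real module contains more, and its exact contents are not shown in this diff):

```python
# sglang/srt/managers/request_types.py -- a leaf module: it must not import
# schedule_batch, the scheduler, or anything else heavyweight, otherwise the
# cycle this PR removes would simply reappear.


class BaseFinishReason:
    pass


class FINISH_ABORT(BaseFinishReason):
    # Shape inferred from the call site above:
    #   req.finished_reason = FINISH_ABORT(error_message, status_code)
    def __init__(self, message: str = "", status_code=None):
        self.message = message
        self.status_code = status_code
```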
2 changes: 1 addition & 1 deletion python/sglang/srt/eplb/expert_distribution.py
@@ -29,14 +29,14 @@
 
 from sglang.srt.environ import envs
 from sglang.srt.metrics.collector import ExpertDispatchCollector
-from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import Withable, get_int_env_var, is_npu
 
 _is_npu = is_npu()
 
 if TYPE_CHECKING:
     from sglang.srt.eplb.expert_location import ExpertLocationMetadata
+    from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 
 logger = logging.getLogger(__name__)
 
5 changes: 4 additions & 1 deletion python/sglang/srt/layers/moe/__init__.py
@@ -1,4 +1,3 @@
-from sglang.srt.layers.moe.moe_runner import MoeRunner, MoeRunnerConfig
 from sglang.srt.layers.moe.utils import (
     DeepEPMode,
     MoeA2ABackend,
@@ -12,6 +11,10 @@
     is_tbo_enabled,
     should_use_flashinfer_cutlass_moe_fp4_allgather,
 )
+from sglang.utils import LazyImport
+
+MoeRunner = LazyImport("sglang.srt.layers.moe.moe_runner.runner", "MoeRunner")
Author: I think this (LazyImport(...)) is something we can apply to many other files as well. I'm not sure if there's a downside?

Collaborator: Not sure. @merrymercy @fzyzcjy, what are your opinions?
+MoeRunnerConfig = LazyImport("sglang.srt.layers.moe.moe_runner.base", "MoeRunnerConfig")
 
 __all__ = [
     "DeepEPMode",
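On the downside question: with proxy-based lazy imports in general (independent of sglang's exact implementation), two costs are worth noting. Import errors surface at first use instead of at startup, and the exported name is a proxy object rather than the class itself, so `isinstance`/`issubclass` checks written against it can break unless the proxy forwards them. A toy demonstration using the importlib-based sketch shown earlier:

```python
# Hypothetical stand-in: proxy a stdlib class through the LazyImport sketch.
JSONDecoder = LazyImport("json", "JSONDecoder")

decoder = JSONDecoder()  # fine: resolves the module, then instantiates
try:
    isinstance(decoder, JSONDecoder)  # the proxy is not a class...
except TypeError as exc:
    print(f"isinstance against the proxy fails: {exc}")
```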
21 changes: 14 additions & 7 deletions python/sglang/srt/layers/vocab_parallel_embedding.py
@@ -1,8 +1,10 @@
 # Adapted from https://github.com/vllm-project/vllm/blob/v0.6.3.post1/vllm/model_executor/layers/vocab_parallel_embedding.py
 
+from __future__ import annotations
+
 import logging
 from dataclasses import dataclass
-from typing import List, Optional, Sequence, Tuple
+from typing import TYPE_CHECKING, List, Optional, Sequence, Tuple
 
 import torch
 from torch.nn.parameter import Parameter, UninitializedParameter
@@ -20,19 +22,19 @@
 from sglang.srt.layers.amx_utils import PackWeightMethod
 from sglang.srt.layers.dp_attention import get_attention_tp_rank, get_attention_tp_size
 from sglang.srt.layers.parameter import BasevLLMParameter
-from sglang.srt.layers.quantization.base_config import (
-    QuantizationConfig,
-    QuantizeMethodBase,
-    method_has_implemented_embedding,
-)
-from sglang.srt.layers.quantization.unquant import UnquantizedEmbeddingMethod
 from sglang.srt.utils import (
     cpu_has_amx_support,
     get_compiler_backend,
     is_cpu,
     set_weight_attrs,
 )
 
+if TYPE_CHECKING:
+    from sglang.srt.layers.quantization.base_config import (
+        QuantizationConfig,
+        QuantizeMethodBase,
+    )
+
 DEFAULT_VOCAB_PADDING_SIZE = 64
 
 _is_cpu_amx_available = cpu_has_amx_support()
@@ -255,6 +257,11 @@ def __init__(
         )
         self.embedding_dim = embedding_dim
 
+        from sglang.srt.layers.quantization.base_config import (
+            method_has_implemented_embedding,
+        )
+        from sglang.srt.layers.quantization.unquant import UnquantizedEmbeddingMethod
+
         quant_method = None
         if quant_config is not None:
             quant_method = quant_config.get_quant_method(self, prefix=prefix)
9 changes: 7 additions & 2 deletions python/sglang/srt/managers/data_parallel_controller.py
@@ -13,6 +13,8 @@
 # ==============================================================================
 """A controller that dispatches requests to multiple data parallel workers."""
 
+from __future__ import annotations
+
 import faulthandler
 import logging
 import multiprocessing as mp
@@ -21,7 +23,7 @@
 import time
 from collections import deque
 from enum import Enum, auto
-from typing import List, Optional
+from typing import TYPE_CHECKING, List, Optional
 
 import psutil
 import setproctitle
@@ -34,7 +36,7 @@
     TokenizedGenerateReqInput,
     WatchLoadUpdateReq,
 )
-from sglang.srt.managers.schedule_batch import Req, RequestStage
+from sglang.srt.managers.request_types import RequestStage
 from sglang.srt.managers.scheduler import run_scheduler_process
 from sglang.srt.server_args import (
     DP_ATTENTION_HANDSHAKE_PORT_DELTA,
@@ -60,6 +62,9 @@
 from sglang.srt.utils.torch_memory_saver_adapter import TorchMemorySaverAdapter
 from sglang.utils import TypeBasedDispatcher, get_exception_traceback
 
+if TYPE_CHECKING:
+    from sglang.srt.managers.schedule_batch import Req
+
 logger = logging.getLogger(__name__)
 
 
5 changes: 4 additions & 1 deletion python/sglang/srt/managers/io_struct.py
@@ -16,6 +16,8 @@
 processes (TokenizerManager, DetokenizerManager, Scheduler).
 """
 
+from __future__ import annotations
+
 import copy
 import uuid
 from abc import ABC
@@ -24,14 +26,15 @@
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
 
 from sglang.srt.lora.lora_registry import LoRARef
-from sglang.srt.managers.schedule_batch import BaseFinishReason
 from sglang.srt.multimodal.mm_utils import has_valid_data
 from sglang.srt.sampling.sampling_params import SamplingParams
 from sglang.srt.utils import ImageData
 
 # Handle serialization of Image for pydantic
 if TYPE_CHECKING:
     from PIL.Image import Image
+
+    from sglang.srt.managers.request_types import BaseFinishReason
 else:
     Image = Any
 
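The `else: Image = Any` branch above is a small pattern of its own: when a type appears in pydantic models (or anything else that inspects annotations at runtime), the `TYPE_CHECKING` import needs a runtime fallback so the name still resolves. In isolation (illustrative, mirroring the hunk above):

```python
from __future__ import annotations

from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
    from PIL.Image import Image  # precise type for static checkers only
else:
    Image = Any  # runtime placeholder: keeps PIL off the startup import path
```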