7 changes: 5 additions & 2 deletions python/sglang/srt/compilation/piecewise_context_manager.py
@@ -1,8 +1,11 @@
 from __future__ import annotations
 
 from contextlib import contextmanager
 from dataclasses import dataclass
-from typing import Any, List, Optional
+from typing import TYPE_CHECKING, Any, List, Optional
 
-from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+if TYPE_CHECKING:
+    from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+
+
 @dataclass
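This hunk shows the pattern the PR applies across the codebase: imports needed only for type annotations move under `typing.TYPE_CHECKING`, so static checkers still see them, but they are never executed at runtime and stop pulling heavy modules into the import graph. A minimal self-contained sketch of the idea (illustrative only, not code from this PR):

```python
from __future__ import annotations  # all annotations become lazy strings

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Seen by mypy/pyright, skipped at runtime: the heavy module is not
    # imported when this file loads, which also breaks import cycles.
    from sglang.srt.model_executor.forward_batch_info import ForwardBatch


def describe(batch: ForwardBatch) -> str:
    # Safe even though ForwardBatch is not bound at runtime here: with
    # `from __future__ import annotations`, annotations are never evaluated.
    return type(batch).__name__
```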
44 changes: 24 additions & 20 deletions python/sglang/srt/configs/__init__.py
@@ -1,23 +1,27 @@
-from sglang.srt.configs.chatglm import ChatGLMConfig
-from sglang.srt.configs.dbrx import DbrxConfig
-from sglang.srt.configs.deepseekvl2 import DeepseekVL2Config
-from sglang.srt.configs.dots_ocr import DotsOCRConfig
-from sglang.srt.configs.dots_vlm import DotsVLMConfig
-from sglang.srt.configs.exaone import ExaoneConfig
-from sglang.srt.configs.falcon_h1 import FalconH1Config
-from sglang.srt.configs.janus_pro import MultiModalityConfig
-from sglang.srt.configs.jet_nemotron import JetNemotronConfig
-from sglang.srt.configs.kimi_linear import KimiLinearConfig
-from sglang.srt.configs.kimi_vl import KimiVLConfig
-from sglang.srt.configs.kimi_vl_moonvit import MoonViTConfig
-from sglang.srt.configs.longcat_flash import LongcatFlashConfig
-from sglang.srt.configs.nemotron_h import NemotronHConfig
-from sglang.srt.configs.olmo3 import Olmo3Config
-from sglang.srt.configs.qwen3_next import Qwen3NextConfig
-from sglang.srt.configs.step3_vl import (
-    Step3TextConfig,
-    Step3VisionEncoderConfig,
-    Step3VLConfig,
+from sglang.utils import LazyImport
+
+ChatGLMConfig = LazyImport("sglang.srt.configs.chatglm", "ChatGLMConfig")
+DbrxConfig = LazyImport("sglang.srt.configs.dbrx", "DbrxConfig")
+DeepseekVL2Config = LazyImport("sglang.srt.configs.deepseekvl2", "DeepseekVL2Config")
+DotsOCRConfig = LazyImport("sglang.srt.configs.dots_ocr", "DotsOCRConfig")
+DotsVLMConfig = LazyImport("sglang.srt.configs.dots_vlm", "DotsVLMConfig")
+ExaoneConfig = LazyImport("sglang.srt.configs.exaone", "ExaoneConfig")
+FalconH1Config = LazyImport("sglang.srt.configs.falcon_h1", "FalconH1Config")
+MultiModalityConfig = LazyImport("sglang.srt.configs.janus_pro", "MultiModalityConfig")
+JetNemotronConfig = LazyImport("sglang.srt.configs.jet_nemotron", "JetNemotronConfig")
+KimiLinearConfig = LazyImport("sglang.srt.configs.kimi_linear", "KimiLinearConfig")
+KimiVLConfig = LazyImport("sglang.srt.configs.kimi_vl", "KimiVLConfig")
+MoonViTConfig = LazyImport("sglang.srt.configs.kimi_vl_moonvit", "MoonViTConfig")
+LongcatFlashConfig = LazyImport(
+    "sglang.srt.configs.longcat_flash", "LongcatFlashConfig"
+)
+NemotronHConfig = LazyImport("sglang.srt.configs.nemotron_h", "NemotronHConfig")
+Olmo3Config = LazyImport("sglang.srt.configs.olmo3", "Olmo3Config")
+Qwen3NextConfig = LazyImport("sglang.srt.configs.qwen3_next", "Qwen3NextConfig")
+Step3VLConfig = LazyImport("sglang.srt.configs.step3_vl", "Step3VLConfig")
+Step3TextConfig = LazyImport("sglang.srt.configs.step3_vl", "Step3TextConfig")
+Step3VisionEncoderConfig = LazyImport(
+    "sglang.srt.configs.step3_vl", "Step3VisionEncoderConfig"
 )
 
 __all__ = [
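The `LazyImport` helper used above wraps a `(module, name)` pair and only performs the real import when the symbol is first touched. A rough sketch of how such a proxy can be built (an assumption about the shape of `sglang.utils.LazyImport`, not its actual source):

```python
import importlib
from typing import Any


class LazyImport:
    """Proxy for `from module_name import class_name`, resolved on first use."""

    def __init__(self, module_name: str, class_name: str):
        self.module_name = module_name
        self.class_name = class_name
        self._resolved: Any = None

    def _load(self) -> Any:
        # importlib caches modules in sys.modules, so the import cost is
        # paid exactly once.
        if self._resolved is None:
            module = importlib.import_module(self.module_name)
            self._resolved = getattr(module, self.class_name)
        return self._resolved

    def __getattr__(self, name: str) -> Any:
        # Called only for attributes not found on the proxy itself.
        return getattr(self._load(), name)

    def __call__(self, *args: Any, **kwargs: Any) -> Any:
        # Lets the proxy be instantiated as if it were the real class.
        return self._load()(*args, **kwargs)


# Usage mirroring the hunk above: nothing is imported until first use.
ChatGLMConfig = LazyImport("sglang.srt.configs.chatglm", "ChatGLMConfig")
```

The win is that importing `sglang.srt.configs` no longer eagerly drags in every model's config module; the cost moves to the first call site that actually needs a given class.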
11 changes: 8 additions & 3 deletions python/sglang/srt/configs/model_config.py
@@ -12,18 +12,18 @@
 # limitations under the License.
 # ==============================================================================
 
+from __future__ import annotations
+
 import json
 import logging
 import math
 import os
 from enum import Enum, IntEnum, auto
-from typing import Any, List, Optional, Set, Union
+from typing import TYPE_CHECKING, Any, List, Optional, Set, Union
 
 import torch
-from transformers import PretrainedConfig
 
 from sglang.srt.environ import envs
-from sglang.srt.layers.quantization import QUANTIZATION_METHODS
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import is_hip, retry
 from sglang.srt.utils.hf_transformers_utils import (
@@ -37,6 +37,9 @@
 
 logger = logging.getLogger(__name__)
 
+if TYPE_CHECKING:
+    from transformers import PretrainedConfig
+
 
 class AttentionArch(IntEnum):
     MLA = auto()
@@ -631,6 +634,8 @@ def _validate_quantize_and_serve_config(self):
 
     # adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/config.py
     def _verify_quantization(self) -> None:
+        from sglang.srt.layers.quantization import QUANTIZATION_METHODS
+
         supported_quantization = [*QUANTIZATION_METHODS]
         rocm_supported_quantization = [
             "awq",
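Note the counterpart pattern here: `QUANTIZATION_METHODS` is used at runtime, not just in annotations, so it cannot live under `TYPE_CHECKING`; instead the import is deferred into the method body. The first call pays the import cost, and every later call resolves from the `sys.modules` cache. Condensed from the hunk above (surrounding class details elided):

```python
class ModelConfig:
    def _verify_quantization(self) -> None:
        # Deferred runtime import: sglang.srt.layers.quantization is loaded on
        # the first call rather than when model_config.py is imported, keeping
        # it out of the module's import-time dependency graph.
        from sglang.srt.layers.quantization import QUANTIZATION_METHODS

        supported_quantization = [*QUANTIZATION_METHODS]
        # ... validation continues as in the original method ...
```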
9 changes: 6 additions & 3 deletions python/sglang/srt/disaggregation/decode.py
@@ -46,8 +46,7 @@
     poll_and_all_reduce,
     prepare_abort,
 )
-from sglang.srt.layers.dp_attention import get_attention_tp_size
-from sglang.srt.managers.schedule_batch import FINISH_ABORT, RequestStage, ScheduleBatch
+from sglang.srt.managers.request_types import FINISH_ABORT, RequestStage
 from sglang.srt.managers.utils import GenerationBatchResult
 from sglang.srt.mem_cache.allocator import BaseTokenToKVPoolAllocator
 from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache
@@ -67,7 +66,7 @@
 logger = logging.getLogger(__name__)
 
 if TYPE_CHECKING:
-    from sglang.srt.managers.schedule_batch import Req
+    from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
     from sglang.srt.managers.scheduler import Scheduler
 
 CLIP_MAX_NEW_TOKEN = get_int_env_var("SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION", 4096)
@@ -225,6 +224,8 @@ def __init__(
         self.kv_manager = self._init_kv_manager()
 
     def _init_kv_manager(self) -> BaseKVManager:
+        from sglang.srt.layers.dp_attention import get_attention_tp_size
+
         kv_args_class = get_kv_class(self.transfer_backend, KVClassType.KVARGS)
         kv_args = kv_args_class()
 
@@ -884,6 +885,8 @@ def get_next_disagg_decode_batch_to_run(
 
 def get_new_prebuilt_batch(self: Scheduler) -> Optional[ScheduleBatch]:
     """Create a schedulebatch for fake completed prefill"""
+    from sglang.srt.managers.schedule_batch import ScheduleBatch
Collaborator: You shouldn't do this... You imported ScheduleBatch twice here.

Author: iiuc, this is because we import it at the top for type checking, and then here we use it at runtime (ScheduleBatch.init_new is the only use in this file that isn't purely a type annotation), so we need to import it for use in this function. But I can leave it as a top-level import and remove it from the TYPE_CHECKING block if you think that's better.
+
     if self.grammar_queue:
         self.move_ready_grammar_requests()
 
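Taken together, the two imports the author describes look like this in schematic form (only the relevant lines, not the full file):

```python
from __future__ import annotations

from typing import TYPE_CHECKING, Optional

if TYPE_CHECKING:
    # Annotation-only import: costs nothing at runtime.
    from sglang.srt.managers.schedule_batch import ScheduleBatch


def get_new_prebuilt_batch(self) -> Optional[ScheduleBatch]:
    # Runtime import: this function actually calls into ScheduleBatch
    # (e.g. ScheduleBatch.init_new), so the real class is needed here.
    from sglang.srt.managers.schedule_batch import ScheduleBatch
    ...
```

Python caches the module after the first call, so the function-local import is cheap on the hot path; the open question in the thread is about readability, not cost.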
9 changes: 2 additions & 7 deletions python/sglang/srt/disaggregation/prefill.py
@@ -42,12 +42,7 @@
     poll_and_all_reduce,
     prepare_abort,
 )
-from sglang.srt.managers.schedule_batch import (
-    FINISH_LENGTH,
-    Req,
-    RequestStage,
-    ScheduleBatch,
-)
+from sglang.srt.managers.request_types import FINISH_LENGTH, RequestStage
 from sglang.srt.mem_cache.common import release_kv_cache
 from sglang.srt.mem_cache.memory_pool import (
     HybridLinearKVPool,
@@ -60,6 +55,7 @@
 if TYPE_CHECKING:
     from torch.distributed import ProcessGroup
 
+    from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
     from sglang.srt.managers.scheduler import GenerationBatchResult, Scheduler
     from sglang.srt.mem_cache.memory_pool import KVCache
 
@@ -232,7 +228,6 @@ def pop_bootstrapped(
         return_failed_reqs: For PP, on rank 0, also return the failed reqs to notify the next rank
         rids_to_check: For PP, on rank > 0, check the rids from the previous rank has consensus with the current rank.
         """
-
         bootstrapped_reqs = []
         failed_reqs = []
         indices_to_remove = set()
3 changes: 1 addition & 2 deletions python/sglang/srt/disaggregation/utils.py
@@ -11,6 +11,7 @@
 import torch
 import torch.distributed as dist
 
+from sglang.srt.managers.request_types import FINISH_ABORT
 from sglang.srt.utils import is_npu
 
 if TYPE_CHECKING:
@@ -346,8 +347,6 @@ def is_mla_backend(target_kv_pool) -> bool:
 
 
 def prepare_abort(req: Req, error_message: str, status_code=None):
-    from sglang.srt.managers.schedule_batch import FINISH_ABORT
-
     # populate finish metadata and stream output
     req.finished_reason = FINISH_ABORT(error_message, status_code)
 
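The `request_types` module introduced by this PR is the other half of the cycle-breaking strategy: finish reasons and request stages move into a leaf module with no heavyweight dependencies, so files like this one can import `FINISH_ABORT` at the top level instead of inside `prepare_abort`. A hypothetical condensed layout (the real module contains more, and its exact contents are not shown in this diff):

```python
# sglang/srt/managers/request_types.py -- a leaf module: it must not import
# schedule_batch, the scheduler, or anything else heavyweight, otherwise the
# cycle this PR removes would simply reappear.


class BaseFinishReason:
    pass


class FINISH_ABORT(BaseFinishReason):
    # Shape inferred from the call site above:
    #   req.finished_reason = FINISH_ABORT(error_message, status_code)
    def __init__(self, message: str = "", status_code=None):
        self.message = message
        self.status_code = status_code
```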
2 changes: 1 addition & 1 deletion python/sglang/srt/eplb/expert_distribution.py
@@ -29,14 +29,14 @@
 
 from sglang.srt.environ import envs
 from sglang.srt.metrics.collector import ExpertDispatchCollector
-from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import Withable, get_int_env_var, is_npu
 
 _is_npu = is_npu()
 
 if TYPE_CHECKING:
     from sglang.srt.eplb.expert_location import ExpertLocationMetadata
+    from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 
 logger = logging.getLogger(__name__)
 
5 changes: 4 additions & 1 deletion python/sglang/srt/layers/moe/__init__.py
@@ -1,4 +1,3 @@
-from sglang.srt.layers.moe.moe_runner import MoeRunner, MoeRunnerConfig
 from sglang.srt.layers.moe.utils import (
     DeepEPMode,
     MoeA2ABackend,
@@ -12,6 +11,10 @@
     is_tbo_enabled,
     should_use_flashinfer_cutlass_moe_fp4_allgather,
 )
+from sglang.utils import LazyImport
+
+MoeRunner = LazyImport("sglang.srt.layers.moe.moe_runner.runner", "MoeRunner")
Author: I think this (LazyImport(...)) is something we can apply to many other files as well. I'm not sure if there's a downside?

Collaborator: Not sure. @merrymercy @fzyzcjy, what are your opinions?
+MoeRunnerConfig = LazyImport("sglang.srt.layers.moe.moe_runner.base", "MoeRunnerConfig")
 
 __all__ = [
     "DeepEPMode",
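On the downside question: with proxy-based lazy imports in general (independent of sglang's exact implementation), two costs are worth noting. Import errors surface at first use instead of at startup, and the exported name is a proxy object rather than the class itself, so `isinstance`/`issubclass` checks written against it can break unless the proxy forwards them. A toy demonstration using the importlib-based sketch shown earlier:

```python
# Hypothetical stand-in: proxy a stdlib class through the LazyImport sketch.
JSONDecoder = LazyImport("json", "JSONDecoder")

decoder = JSONDecoder()  # fine: resolves the module, then instantiates
try:
    isinstance(decoder, JSONDecoder)  # the proxy is not a class...
except TypeError as exc:
    print(f"isinstance against the proxy fails: {exc}")
```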
21 changes: 14 additions & 7 deletions python/sglang/srt/layers/vocab_parallel_embedding.py
@@ -1,8 +1,10 @@
 # Adapted from https://github.com/vllm-project/vllm/blob/v0.6.3.post1/vllm/model_executor/layers/vocab_parallel_embedding.py
 
+from __future__ import annotations
+
 import logging
 from dataclasses import dataclass
-from typing import List, Optional, Sequence, Tuple
+from typing import TYPE_CHECKING, List, Optional, Sequence, Tuple
 
 import torch
 from torch.nn.parameter import Parameter, UninitializedParameter
@@ -20,19 +22,19 @@
 from sglang.srt.layers.amx_utils import PackWeightMethod
 from sglang.srt.layers.dp_attention import get_attention_tp_rank, get_attention_tp_size
 from sglang.srt.layers.parameter import BasevLLMParameter
-from sglang.srt.layers.quantization.base_config import (
-    QuantizationConfig,
-    QuantizeMethodBase,
-    method_has_implemented_embedding,
-)
-from sglang.srt.layers.quantization.unquant import UnquantizedEmbeddingMethod
 from sglang.srt.utils import (
     cpu_has_amx_support,
     get_compiler_backend,
     is_cpu,
     set_weight_attrs,
 )
 
+if TYPE_CHECKING:
+    from sglang.srt.layers.quantization.base_config import (
+        QuantizationConfig,
+        QuantizeMethodBase,
+    )
+
 DEFAULT_VOCAB_PADDING_SIZE = 64
 
 _is_cpu_amx_available = cpu_has_amx_support()
@@ -255,6 +257,11 @@ def __init__(
         )
         self.embedding_dim = embedding_dim
 
+        from sglang.srt.layers.quantization.base_config import (
+            method_has_implemented_embedding,
+        )
+        from sglang.srt.layers.quantization.unquant import UnquantizedEmbeddingMethod
+
         quant_method = None
         if quant_config is not None:
             quant_method = quant_config.get_quant_method(self, prefix=prefix)
9 changes: 7 additions & 2 deletions python/sglang/srt/managers/data_parallel_controller.py
@@ -13,6 +13,8 @@
 # ==============================================================================
 """A controller that dispatches requests to multiple data parallel workers."""
 
+from __future__ import annotations
+
 import faulthandler
 import logging
 import multiprocessing as mp
@@ -21,7 +23,7 @@
 import time
 from collections import deque
 from enum import Enum, auto
-from typing import List, Optional
+from typing import TYPE_CHECKING, List, Optional
 
 import psutil
 import setproctitle
@@ -34,7 +36,7 @@
     TokenizedGenerateReqInput,
     WatchLoadUpdateReq,
 )
-from sglang.srt.managers.schedule_batch import Req, RequestStage
+from sglang.srt.managers.request_types import RequestStage
 from sglang.srt.managers.scheduler import run_scheduler_process
 from sglang.srt.server_args import (
     DP_ATTENTION_HANDSHAKE_PORT_DELTA,
@@ -60,6 +62,9 @@
 from sglang.srt.utils.torch_memory_saver_adapter import TorchMemorySaverAdapter
 from sglang.utils import TypeBasedDispatcher, get_exception_traceback
 
+if TYPE_CHECKING:
+    from sglang.srt.managers.schedule_batch import Req
+
 logger = logging.getLogger(__name__)
 
 
5 changes: 4 additions & 1 deletion python/sglang/srt/managers/io_struct.py
@@ -16,6 +16,8 @@
 processes (TokenizerManager, DetokenizerManager, Scheduler).
 """
 
+from __future__ import annotations
+
 import copy
 import uuid
 from abc import ABC
@@ -24,14 +26,15 @@
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
 
 from sglang.srt.lora.lora_registry import LoRARef
-from sglang.srt.managers.schedule_batch import BaseFinishReason
 from sglang.srt.multimodal.mm_utils import has_valid_data
 from sglang.srt.sampling.sampling_params import SamplingParams
 from sglang.srt.utils import ImageData
 
 # Handle serialization of Image for pydantic
 if TYPE_CHECKING:
     from PIL.Image import Image
+
+    from sglang.srt.managers.request_types import BaseFinishReason
 else:
     Image = Any
 
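The `else: Image = Any` branch above is a small pattern of its own: when a type appears in pydantic models (or anything else that inspects annotations at runtime), the `TYPE_CHECKING` import needs a runtime fallback so the name still resolves. In isolation (illustrative, mirroring the hunk above):

```python
from __future__ import annotations

from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
    from PIL.Image import Image  # precise type for static checkers only
else:
    Image = Any  # runtime placeholder: keeps PIL off the startup import path
```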