14 changes: 6 additions & 8 deletions python/pyproject.toml
@@ -29,6 +29,7 @@ runtime_common = [
"msgspec",
"ninja",
"orjson",
"outlines==0.1.11",
Contributor comment (medium):
Adding outlines as a direct dependency here might cause conflicts with the version ranges specified later. Consider removing it to avoid potential issues.

"packaging",
"partial_json_parser",
"pillow",
@@ -50,13 +51,12 @@ runtime_common = [
srt = [
"sglang[runtime_common]",
"sgl-kernel==0.1.9",
"flashinfer_python==0.2.6.post1",
"torch==2.7.1",
"torchaudio==2.7.1",
"torchvision==0.22.1",
"cuda-python",
"outlines>=0.0.44,<=0.1.11",
"einops",
"flashinfer_python==0.2.6.post1",
Comment on lines 58 to +59
Contributor comment (medium):
Moving flashinfer_python to the end of the list doesn't seem to have a clear purpose. Is there a specific reason for this change?

]

blackwell = [
@@ -66,7 +66,6 @@ blackwell = [
"torchaudio==2.7.1",
"torchvision==0.22.1",
"cuda-python",
"outlines>=0.0.44,<=0.1.11",
"einops",
"flashinfer_python==0.2.6.post1",
Comment on lines 69 to 70
Contributor comment (medium):
Moving flashinfer_python to the end of the list doesn't seem to have a clear purpose. Is there a specific reason for this change?

]
@@ -77,23 +76,22 @@ srt_hip = [
"sglang[runtime_common]",
"torch",
"vllm==0.6.7.dev2",
"outlines==0.1.11"
]

# xpu is not enabled in public vllm and torch whl,
# need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.htmlinstall vllm
srt_xpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11"]
srt_xpu = ["sglang[runtime_common]"]

# For Intel Gaudi(device : hpu) follow the installation guide
# https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html
srt_hpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11"]
srt_hpu = ["sglang[runtime_common]"]

# CPU: currently, there are no pre-built vllm wheels for CPU.
# To install vllm for CPU, please follow the instruction here:
# https://docs.vllm.ai/en/latest/getting_started/installation/cpu/index.html
srt_cpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11", "einops"]
srt_cpu = ["sglang[runtime_common]", "einops"]
# https://vllm-ascend.readthedocs.io/en/latest/installation.html
srt_npu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11"]
srt_npu = ["sglang[runtime_common]"]

openai = ["openai>=1.0", "tiktoken"]
anthropic = ["anthropic>=0.20.0"]
1 change: 1 addition & 0 deletions python/sglang/srt/managers/schedule_batch.py
@@ -786,6 +786,7 @@ def set_finish_with_abort(self, error_msg: str):
self.multimodal_inputs = None
self.grammar = None
self.origin_input_ids = [0] # set it to one token to skip the long prefill
self.return_logprob = False
Contributor comment (medium):
Setting return_logprob to False here might inadvertently disable logprob calculation for requests that originally required it. Verify that this is the intended behavior.
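
If the goal is only to skip wasted work on aborted prefills, a more conservative variant could keep the caller's setting when output already exists. This is a hypothetical sketch, assuming the request tracks `output_ids` and that callers accept an empty logprob payload on abort:

```python
# Hypothetical: only drop logprob output when nothing was generated yet,
# so requests that asked for logprobs still receive a well-formed result.
if not self.output_ids:          # assumption: no tokens produced before the abort
    self.return_logprob = False  # nothing to report, safe to disable
```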

self.finished_reason = FINISH_ABORT(
error_msg, HTTPStatus.BAD_REQUEST, "BadRequestError"
)
9 changes: 8 additions & 1 deletion python/sglang/srt/managers/scheduler.py
@@ -1335,7 +1335,14 @@ def check_memory(self):
)
raise ValueError(msg)

if len(self.req_to_token_pool.free_slots) != self.req_to_token_pool.size:
if self.disaggregation_mode == DisaggregationMode.DECODE:
req_total_size = (
self.req_to_token_pool.size + self.req_to_token_pool.pre_alloc_size
)
Comment on lines +1338 to +1341
Contributor comment (medium):
This conditional logic based on disaggregation_mode affects how req_total_size is calculated. Ensure that this change aligns with the intended memory management strategy for different disaggregation modes.
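
Put differently, the invariant being checked is that an idle pool exposes every slot as free, and in decode mode "every slot" includes the pre-allocated ones. A minimal sketch, under the assumption that pre_alloc_size counts slots the decode worker reserves up front for incoming prefill-to-decode transfers:

```python
# Hypothetical helper expressing the leak check as a predicate.
def pool_is_fully_free(pool, mode) -> bool:
    expected = pool.size
    if mode == DisaggregationMode.DECODE:
        expected += pool.pre_alloc_size  # pre-allocated slots count toward capacity
    return len(pool.free_slots) == expected
```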

else:
req_total_size = self.req_to_token_pool.size

if len(self.req_to_token_pool.free_slots) != req_total_size:
msg = (
"req_to_token_pool memory leak detected!"
f"available_size={len(self.req_to_token_pool.free_slots)}, "
2 changes: 1 addition & 1 deletion python/sglang/srt/managers/tokenizer_manager.py
@@ -1231,7 +1231,7 @@ def _handle_batch_output(
state.last_output_offset = len(state.output_ids)
else:
state.output_ids.extend(recv_obj.output_ids[i])
output_token_ids = state.output_ids
output_token_ids = state.output_ids.copy()
Contributor comment (medium):
Creating a copy of state.output_ids might introduce a performance overhead, especially if the list is large. Consider whether a copy is truly necessary here, or if the original list can be used without modification.
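
For context on the trade-off (a standalone toy example, not the project's code): without the copy, the dict handed downstream aliases the live list, so ids appended for the next chunk would retroactively appear in output that was already emitted; the copy costs O(n) per emission but freezes the snapshot.

```python
output_ids = [1, 2]

aliased = {"output_ids": output_ids}          # shares the same list object
output_ids.extend([3, 4])                     # next streaming step
print(aliased["output_ids"])                  # [1, 2, 3, 4] -- already-emitted output mutated

snapshot = {"output_ids": output_ids.copy()}  # O(n) copy, frozen at emission time
output_ids.extend([5])
print(snapshot["output_ids"])                 # [1, 2, 3, 4, 5]? no: [1, 2, 3, 4] -- unaffected
```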


out_dict = {
"output_ids": output_token_ids,
5 changes: 2 additions & 3 deletions python/sglang/srt/server_args.py
@@ -1713,9 +1713,8 @@ def init_new(server_args, dp_rank: Optional[int] = None) -> "PortArgs":
dist_init_host, dist_init_port = dist_init_addr
port_base = int(dist_init_port) + 1
if dp_rank is None:
scheduler_input_port = (
port_base + 3
) # TokenizerManager to DataParallelController
# TokenizerManager to DataParallelController
scheduler_input_port = port_base + 3
else:
scheduler_input_port = port_base + 3 + 1 + dp_rank

10 changes: 3 additions & 7 deletions python/sglang/srt/utils.py
@@ -1917,20 +1917,16 @@ def configure_ipv6(dist_init_addr):
return port, host


def rank0_log(msg: str):
from sglang.srt.distributed import get_tensor_model_parallel_rank

if get_tensor_model_parallel_rank() == 0:
logger.info(msg)


def rank0_print(msg: str):
from sglang.srt.distributed import get_tensor_model_parallel_rank

if get_tensor_model_parallel_rank() == 0:
print(msg, flush=True)


rank0_log = rank0_print
Contributor comment (medium):
Reassigning rank0_log to rank0_print effectively removes the logger.info functionality. Verify that this change is intentional and that logging is no longer needed on rank 0.
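
If logger-based output on rank 0 is still wanted anywhere, a sketch that keeps both entry points (essentially restoring the removed definition alongside the kept one, assuming the module-level logger is still defined) would be:

```python
def rank0_log(msg: str):
    # Route rank-0 messages through the logging pipeline (level, handlers, format).
    from sglang.srt.distributed import get_tensor_model_parallel_rank

    if get_tensor_model_parallel_rank() == 0:
        logger.info(msg)


def rank0_print(msg: str):
    # Plain stdout output for rank 0, flushed immediately.
    from sglang.srt.distributed import get_tensor_model_parallel_rank

    if get_tensor_model_parallel_rank() == 0:
        print(msg, flush=True)
```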



def get_cuda_version():
if torch.version.cuda:
return tuple(map(int, torch.version.cuda.split(".")))
20 changes: 10 additions & 10 deletions sgl-kernel/CMakeLists.txt
@@ -39,14 +39,6 @@ find_package(Torch REQUIRED)
# clean Torch Flag
clear_cuda_arches(CMAKE_FLAG)

if("${CUDA_VERSION}" VERSION_EQUAL "12.8")
set(DeepGEMM_REPO "https://github.com/sgl-project/DeepGEMM")
set(DeepGEMM_TAG "blackwell")
else()
set(DeepGEMM_REPO "https://github.com/deepseek-ai/DeepGEMM")
set(DeepGEMM_TAG "8dfa3298274bfe6b242f6f8a3e6f3eff2707dd9f")
endif()

include(FetchContent)

# cutlass
@@ -57,7 +49,16 @@ FetchContent_Declare(
GIT_SHALLOW OFF
)
FetchContent_Populate(repo-cutlass)

# DeepGEMM
if("${CUDA_VERSION}" VERSION_EQUAL "12.8")
set(DeepGEMM_REPO "https://github.com/sgl-project/DeepGEMM")
set(DeepGEMM_TAG "blackwell")
else()
set(DeepGEMM_REPO "https://github.com/deepseek-ai/DeepGEMM")
set(DeepGEMM_TAG "8dfa3298274bfe6b242f6f8a3e6f3eff2707dd9f")
endif()

FetchContent_Declare(
repo-deepgemm
GIT_REPOSITORY ${DeepGEMM_REPO}
@@ -107,7 +108,6 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
message(STATUS "For aarch64, disable gencode below SM90 by default")
endif()


include_directories(
${PROJECT_SOURCE_DIR}/include
${PROJECT_SOURCE_DIR}/csrc
@@ -247,8 +247,8 @@ set(SOURCES
"csrc/moe/ep_moe_reorder_kernel.cu"
"csrc/moe/ep_moe_silu_and_mul_kernel.cu"
"csrc/speculative/eagle_utils.cu"
"csrc/speculative/speculative_sampling.cu"
"csrc/speculative/packbit.cu"
"csrc/speculative/speculative_sampling.cu"
Comment on lines 250 to +251
Contributor comment (medium):
The order of these two source files has been swapped. Verify that this change doesn't introduce any unintended consequences due to dependency issues or compilation order.

"csrc/grammar/apply_token_bitmask_inplace_cuda.cu"
"csrc/common_extension.cc"
"${repo-flashinfer_SOURCE_DIR}/csrc/norm.cu"
2 changes: 1 addition & 1 deletion sgl-kernel/python/sgl_kernel/sampling.py
@@ -1,7 +1,7 @@
from typing import Optional, Union

import torch
from sgl_kernel.utils import _to_tensor_scalar_tuple, get_cuda_stream
from sgl_kernel.utils import _to_tensor_scalar_tuple
Contributor comment (medium):
The removal of get_cuda_stream might affect the intended behavior of the code if it relies on a specific CUDA stream. Verify that this change doesn't introduce any issues related to stream synchronization or data dependencies.



def _top_k_renorm_probs_internal(