14 changes: 6 additions & 8 deletions python/pyproject.toml
@@ -29,6 +29,7 @@ runtime_common = [
"msgspec",
"ninja",
"orjson",
"outlines==0.1.11",
Contributor comment (medium):
Adding outlines as a direct dependency here might cause conflicts with the version ranges specified later. Consider removing it to avoid potential issues.

"packaging",
"partial_json_parser",
"pillow",
@@ -50,13 +51,12 @@ runtime_common = [
srt = [
"sglang[runtime_common]",
"sgl-kernel==0.1.9",
"flashinfer_python==0.2.6.post1",
"torch==2.7.1",
"torchaudio==2.7.1",
"torchvision==0.22.1",
"cuda-python",
"outlines>=0.0.44,<=0.1.11",
"einops",
"flashinfer_python==0.2.6.post1",
Comment on lines 58 to +59
Contributor comment (medium):
Moving flashinfer_python to the end of the list doesn't seem to have a clear purpose. Is there a specific reason for this change?

]

blackwell = [
@@ -66,7 +66,6 @@ blackwell = [
"torchaudio==2.7.1",
"torchvision==0.22.1",
"cuda-python",
"outlines>=0.0.44,<=0.1.11",
"einops",
"flashinfer_python==0.2.6.post1",
Comment on lines 69 to 70
Contributor comment (medium):
Moving flashinfer_python to the end of the list doesn't seem to have a clear purpose. Is there a specific reason for this change?

]
@@ -77,23 +76,22 @@ srt_hip = [
"sglang[runtime_common]",
"torch",
"vllm==0.6.7.dev2",
"outlines==0.1.11"
]

# xpu is not enabled in public vllm and torch whl,
# need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.htmlinstall vllm
srt_xpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11"]
srt_xpu = ["sglang[runtime_common]"]

# For Intel Gaudi(device : hpu) follow the installation guide
# https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html
srt_hpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11"]
srt_hpu = ["sglang[runtime_common]"]

# CPU: currently, there are no pre-built vllm wheels for CPU.
# To install vllm for CPU, please follow the instruction here:
# https://docs.vllm.ai/en/latest/getting_started/installation/cpu/index.html
srt_cpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11", "einops"]
srt_cpu = ["sglang[runtime_common]", "einops"]
# https://vllm-ascend.readthedocs.io/en/latest/installation.html
srt_npu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11"]
srt_npu = ["sglang[runtime_common]"]

openai = ["openai>=1.0", "tiktoken"]
anthropic = ["anthropic>=0.20.0"]
1 change: 1 addition & 0 deletions python/sglang/srt/managers/schedule_batch.py
@@ -786,6 +786,7 @@ def set_finish_with_abort(self, error_msg: str):
self.multimodal_inputs = None
self.grammar = None
self.origin_input_ids = [0] # set it to one token to skip the long prefill
self.return_logprob = False
Contributor comment (medium):
Setting return_logprob to False here might inadvertently disable logprob calculation for requests that originally required it. Verify that this is the intended behavior.
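
If the goal is only to skip wasted work on aborted prefills, a more conservative variant could keep the caller's setting when output already exists. This is a hypothetical sketch, assuming the request tracks `output_ids` and that callers accept an empty logprob payload on abort:

```python
# Hypothetical: only drop logprob output when nothing was generated yet,
# so requests that asked for logprobs still receive a well-formed result.
if not self.output_ids:          # assumption: no tokens produced before the abort
    self.return_logprob = False  # nothing to report, safe to disable
```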

self.finished_reason = FINISH_ABORT(
error_msg, HTTPStatus.BAD_REQUEST, "BadRequestError"
)
9 changes: 8 additions & 1 deletion python/sglang/srt/managers/scheduler.py
@@ -1335,7 +1335,14 @@ def check_memory(self):
)
raise ValueError(msg)

if len(self.req_to_token_pool.free_slots) != self.req_to_token_pool.size:
if self.disaggregation_mode == DisaggregationMode.DECODE:
req_total_size = (
self.req_to_token_pool.size + self.req_to_token_pool.pre_alloc_size
)
Comment on lines +1338 to +1341
Contributor comment (medium):
This conditional logic based on disaggregation_mode affects how req_total_size is calculated. Ensure that this change aligns with the intended memory management strategy for different disaggregation modes.
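
Put differently, the invariant being checked is that an idle pool exposes every slot as free, and in decode mode "every slot" includes the pre-allocated ones. A minimal sketch, under the assumption that pre_alloc_size counts slots the decode worker reserves up front for incoming prefill-to-decode transfers:

```python
# Hypothetical helper expressing the leak check as a predicate.
def pool_is_fully_free(pool, mode) -> bool:
    expected = pool.size
    if mode == DisaggregationMode.DECODE:
        expected += pool.pre_alloc_size  # pre-allocated slots count toward capacity
    return len(pool.free_slots) == expected
```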

else:
req_total_size = self.req_to_token_pool.size

if len(self.req_to_token_pool.free_slots) != req_total_size:
msg = (
"req_to_token_pool memory leak detected!"
f"available_size={len(self.req_to_token_pool.free_slots)}, "
2 changes: 1 addition & 1 deletion python/sglang/srt/managers/tokenizer_manager.py
@@ -1231,7 +1231,7 @@ def _handle_batch_output(
state.last_output_offset = len(state.output_ids)
else:
state.output_ids.extend(recv_obj.output_ids[i])
output_token_ids = state.output_ids
output_token_ids = state.output_ids.copy()
Contributor comment (medium):
Creating a copy of state.output_ids might introduce a performance overhead, especially if the list is large. Consider whether a copy is truly necessary here, or if the original list can be used without modification.
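
For context on the trade-off (a standalone toy example, not the project's code): without the copy, the dict handed downstream aliases the live list, so ids appended for the next chunk would retroactively appear in output that was already emitted; the copy costs O(n) per emission but freezes the snapshot.

```python
output_ids = [1, 2]

aliased = {"output_ids": output_ids}          # shares the same list object
output_ids.extend([3, 4])                     # next streaming step
print(aliased["output_ids"])                  # [1, 2, 3, 4] -- already-emitted output mutated

snapshot = {"output_ids": output_ids.copy()}  # O(n) copy, frozen at emission time
output_ids.extend([5])
print(snapshot["output_ids"])                 # [1, 2, 3, 4, 5]? no: [1, 2, 3, 4] -- unaffected
```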


out_dict = {
"output_ids": output_token_ids,
5 changes: 2 additions & 3 deletions python/sglang/srt/server_args.py
@@ -1713,9 +1713,8 @@ def init_new(server_args, dp_rank: Optional[int] = None) -> "PortArgs":
dist_init_host, dist_init_port = dist_init_addr
port_base = int(dist_init_port) + 1
if dp_rank is None:
scheduler_input_port = (
port_base + 3
) # TokenizerManager to DataParallelController
# TokenizerManager to DataParallelController
scheduler_input_port = port_base + 3
else:
scheduler_input_port = port_base + 3 + 1 + dp_rank

10 changes: 3 additions & 7 deletions python/sglang/srt/utils.py
@@ -1917,20 +1917,16 @@ def configure_ipv6(dist_init_addr):
return port, host


def rank0_log(msg: str):
from sglang.srt.distributed import get_tensor_model_parallel_rank

if get_tensor_model_parallel_rank() == 0:
logger.info(msg)


def rank0_print(msg: str):
from sglang.srt.distributed import get_tensor_model_parallel_rank

if get_tensor_model_parallel_rank() == 0:
print(msg, flush=True)


rank0_log = rank0_print
Contributor comment (medium):
Reassigning rank0_log to rank0_print effectively removes the logger.info functionality. Verify that this change is intentional and that logging is no longer needed on rank 0.
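
If logger-based output on rank 0 is still wanted anywhere, a sketch that keeps both entry points (essentially restoring the removed definition alongside the kept one, assuming the module-level logger is still defined) would be:

```python
def rank0_log(msg: str):
    # Route rank-0 messages through the logging pipeline (level, handlers, format).
    from sglang.srt.distributed import get_tensor_model_parallel_rank

    if get_tensor_model_parallel_rank() == 0:
        logger.info(msg)


def rank0_print(msg: str):
    # Plain stdout output for rank 0, flushed immediately.
    from sglang.srt.distributed import get_tensor_model_parallel_rank

    if get_tensor_model_parallel_rank() == 0:
        print(msg, flush=True)
```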



def get_cuda_version():
if torch.version.cuda:
return tuple(map(int, torch.version.cuda.split(".")))
20 changes: 10 additions & 10 deletions sgl-kernel/CMakeLists.txt
@@ -39,14 +39,6 @@ find_package(Torch REQUIRED)
# clean Torch Flag
clear_cuda_arches(CMAKE_FLAG)

if("${CUDA_VERSION}" VERSION_EQUAL "12.8")
set(DeepGEMM_REPO "https://github.com/sgl-project/DeepGEMM")
set(DeepGEMM_TAG "blackwell")
else()
set(DeepGEMM_REPO "https://github.com/deepseek-ai/DeepGEMM")
set(DeepGEMM_TAG "8dfa3298274bfe6b242f6f8a3e6f3eff2707dd9f")
endif()

include(FetchContent)

# cutlass
@@ -57,7 +49,16 @@ FetchContent_Declare(
GIT_SHALLOW OFF
)
FetchContent_Populate(repo-cutlass)

# DeepGEMM
if("${CUDA_VERSION}" VERSION_EQUAL "12.8")
set(DeepGEMM_REPO "https://github.com/sgl-project/DeepGEMM")
set(DeepGEMM_TAG "blackwell")
else()
set(DeepGEMM_REPO "https://github.com/deepseek-ai/DeepGEMM")
set(DeepGEMM_TAG "8dfa3298274bfe6b242f6f8a3e6f3eff2707dd9f")
endif()

FetchContent_Declare(
repo-deepgemm
GIT_REPOSITORY ${DeepGEMM_REPO}
@@ -107,7 +108,6 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
message(STATUS "For aarch64, disable gencode below SM90 by default")
endif()


include_directories(
${PROJECT_SOURCE_DIR}/include
${PROJECT_SOURCE_DIR}/csrc
@@ -247,8 +247,8 @@ set(SOURCES
"csrc/moe/ep_moe_reorder_kernel.cu"
"csrc/moe/ep_moe_silu_and_mul_kernel.cu"
"csrc/speculative/eagle_utils.cu"
"csrc/speculative/speculative_sampling.cu"
"csrc/speculative/packbit.cu"
"csrc/speculative/speculative_sampling.cu"
Comment on lines 250 to +251
Contributor comment (medium):
The order of these two source files has been swapped. Verify that this change doesn't introduce any unintended consequences due to dependency issues or compilation order.

"csrc/grammar/apply_token_bitmask_inplace_cuda.cu"
"csrc/common_extension.cc"
"${repo-flashinfer_SOURCE_DIR}/csrc/norm.cu"
2 changes: 1 addition & 1 deletion sgl-kernel/python/sgl_kernel/sampling.py
@@ -1,7 +1,7 @@
from typing import Optional, Union

import torch
from sgl_kernel.utils import _to_tensor_scalar_tuple, get_cuda_stream
from sgl_kernel.utils import _to_tensor_scalar_tuple
Contributor comment (medium):
The removal of get_cuda_stream might affect the intended behavior of the code if it relies on a specific CUDA stream. Verify that this change doesn't introduce any issues related to stream synchronization or data dependencies.



def _top_k_renorm_probs_internal(