diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
index d2afa4729bb..ac93dc18370 100644
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -113,6 +113,10 @@ jobs:
       github.event.pull_request.draft == false
     needs: [unit-test-frontend, unit-test-backend-2-gpu]
     runs-on: 8-gpu-runner
+    strategy:
+      fail-fast: false
+      matrix:
+        part: [0, 1]
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
@@ -125,7 +129,7 @@
         timeout-minutes: 20
         run: |
           cd test/srt
-          python3 run_suite.py --suite per-commit-8-gpu
+          python3 run_suite.py --suite per-commit-8-gpu --auto-partition-id ${{ matrix.part }} --auto-partition-size 2
 
   performance-test-1-gpu-part-1:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
diff --git a/python/pyproject.toml b/python/pyproject.toml
index ff3e4486f7f..61ee9b7517e 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -29,6 +29,7 @@ runtime_common = [
     "msgspec",
     "ninja",
     "orjson",
+    "outlines==0.1.11",
     "packaging",
     "partial_json_parser",
     "pillow",
@@ -50,13 +51,12 @@ runtime_common = [
 srt = [
     "sglang[runtime_common]",
     "sgl-kernel==0.1.9",
-    "flashinfer_python==0.2.6.post1",
     "torch==2.7.1",
     "torchaudio==2.7.1",
     "torchvision==0.22.1",
     "cuda-python",
-    "outlines>=0.0.44,<=0.1.11",
     "einops",
+    "flashinfer_python==0.2.6.post1",
 ]
 
 blackwell = [
@@ -66,7 +66,6 @@ blackwell = [
     "torchaudio==2.7.1",
     "torchvision==0.22.1",
     "cuda-python",
-    "outlines>=0.0.44,<=0.1.11",
     "einops",
     "flashinfer_python==0.2.6.post1",
 ]
@@ -77,23 +76,22 @@ srt_hip = [
     "sglang[runtime_common]",
     "torch",
     "vllm==0.6.7.dev2",
-    "outlines==0.1.11"
 ]
 
 # xpu is not enabled in public vllm and torch whl,
 # need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.htmlinstall vllm
-srt_xpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11"]
+srt_xpu = ["sglang[runtime_common]"]
 
 # For Intel Gaudi(device : hpu) follow the installation guide
 # https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html
-srt_hpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11"]
+srt_hpu = ["sglang[runtime_common]"]
 
 # CPU: currently, there are no pre-built vllm wheels for CPU.
 # To install vllm for CPU, please follow the instruction here:
 # https://docs.vllm.ai/en/latest/getting_started/installation/cpu/index.html
-srt_cpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11", "einops"]
+srt_cpu = ["sglang[runtime_common]", "einops"]
 # https://vllm-ascend.readthedocs.io/en/latest/installation.html
-srt_npu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11"]
+srt_npu = ["sglang[runtime_common]"]
 
 openai = ["openai>=1.0", "tiktoken"]
 anthropic = ["anthropic>=0.20.0"]
diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py
index 6143c5575ce..c2f30335810 100644
--- a/python/sglang/srt/managers/schedule_batch.py
+++ b/python/sglang/srt/managers/schedule_batch.py
@@ -786,6 +786,7 @@ def set_finish_with_abort(self, error_msg: str):
         self.multimodal_inputs = None
         self.grammar = None
         self.origin_input_ids = [0]  # set it to one token to skip the long prefill
+        self.return_logprob = False
         self.finished_reason = FINISH_ABORT(
             error_msg, HTTPStatus.BAD_REQUEST, "BadRequestError"
         )
diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index a0c2997fe00..de55f1f2c4c 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -1335,7 +1335,14 @@ def check_memory(self):
             )
             raise ValueError(msg)
 
-        if len(self.req_to_token_pool.free_slots) != self.req_to_token_pool.size:
+        if self.disaggregation_mode == DisaggregationMode.DECODE:
+            req_total_size = (
+                self.req_to_token_pool.size + self.req_to_token_pool.pre_alloc_size
+            )
+        else:
+            req_total_size = self.req_to_token_pool.size
+
+        if len(self.req_to_token_pool.free_slots) != req_total_size:
             msg = (
                 "req_to_token_pool memory leak detected!"
                 f"available_size={len(self.req_to_token_pool.free_slots)}, "
diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py
index fbab668a47c..f46bc8cdcfc 100644
--- a/python/sglang/srt/managers/tokenizer_manager.py
+++ b/python/sglang/srt/managers/tokenizer_manager.py
@@ -1231,7 +1231,7 @@ def _handle_batch_output(
                     state.last_output_offset = len(state.output_ids)
                 else:
                     state.output_ids.extend(recv_obj.output_ids[i])
-                    output_token_ids = state.output_ids
+                    output_token_ids = state.output_ids.copy()
 
                 out_dict = {
                     "output_ids": output_token_ids,
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index 54e92d0bf78..f151b54a6f6 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -1713,9 +1713,8 @@ def init_new(server_args, dp_rank: Optional[int] = None) -> "PortArgs":
             dist_init_host, dist_init_port = dist_init_addr
             port_base = int(dist_init_port) + 1
             if dp_rank is None:
-                scheduler_input_port = (
-                    port_base + 3
-                )  # TokenizerManager to DataParallelController
+                # TokenizerManager to DataParallelController
+                scheduler_input_port = port_base + 3
             else:
                 scheduler_input_port = port_base + 3 + 1 + dp_rank
 
diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py
index 048f1f4933a..9eb6f23ffbd 100644
--- a/python/sglang/srt/utils.py
+++ b/python/sglang/srt/utils.py
@@ -1917,13 +1917,6 @@ def configure_ipv6(dist_init_addr):
     return port, host
 
 
-def rank0_log(msg: str):
-    from sglang.srt.distributed import get_tensor_model_parallel_rank
-
-    if get_tensor_model_parallel_rank() == 0:
-        logger.info(msg)
-
-
 def rank0_print(msg: str):
     from sglang.srt.distributed import get_tensor_model_parallel_rank
 
@@ -1931,6 +1924,9 @@
         print(msg, flush=True)
 
 
+rank0_log = rank0_print
+
+
 def get_cuda_version():
     if torch.version.cuda:
         return tuple(map(int, torch.version.cuda.split(".")))
diff --git a/sgl-kernel/CMakeLists.txt b/sgl-kernel/CMakeLists.txt
index 4714ce4c88d..ebba9498202 100644
--- a/sgl-kernel/CMakeLists.txt
+++ b/sgl-kernel/CMakeLists.txt
@@ -39,14 +39,6 @@ find_package(Torch REQUIRED)
 # clean Torch Flag
 clear_cuda_arches(CMAKE_FLAG)
 
-if("${CUDA_VERSION}" VERSION_EQUAL "12.8")
-    set(DeepGEMM_REPO "https://github.com/sgl-project/DeepGEMM")
-    set(DeepGEMM_TAG "blackwell")
-else()
-    set(DeepGEMM_REPO "https://github.com/deepseek-ai/DeepGEMM")
-    set(DeepGEMM_TAG "8dfa3298274bfe6b242f6f8a3e6f3eff2707dd9f")
-endif()
-
 include(FetchContent)
 
 # cutlass
@@ -57,7 +49,16 @@ FetchContent_Declare(
     GIT_SHALLOW OFF
 )
 FetchContent_Populate(repo-cutlass)
+
 # DeepGEMM
+if("${CUDA_VERSION}" VERSION_EQUAL "12.8")
+    set(DeepGEMM_REPO "https://github.com/sgl-project/DeepGEMM")
+    set(DeepGEMM_TAG "blackwell")
+else()
+    set(DeepGEMM_REPO "https://github.com/deepseek-ai/DeepGEMM")
+    set(DeepGEMM_TAG "8dfa3298274bfe6b242f6f8a3e6f3eff2707dd9f")
+endif()
+
 FetchContent_Declare(
     repo-deepgemm
     GIT_REPOSITORY ${DeepGEMM_REPO}
@@ -107,7 +108,6 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
     message(STATUS "For aarch64, disable gencode below SM90 by default")
 endif()
 
-
 include_directories(
     ${PROJECT_SOURCE_DIR}/include
     ${PROJECT_SOURCE_DIR}/csrc
@@ -247,8 +247,8 @@ set(SOURCES
     "csrc/moe/ep_moe_reorder_kernel.cu"
     "csrc/moe/ep_moe_silu_and_mul_kernel.cu"
     "csrc/speculative/eagle_utils.cu"
-    "csrc/speculative/speculative_sampling.cu"
     "csrc/speculative/packbit.cu"
+    "csrc/speculative/speculative_sampling.cu"
     "csrc/grammar/apply_token_bitmask_inplace_cuda.cu"
     "csrc/common_extension.cc"
     "${repo-flashinfer_SOURCE_DIR}/csrc/norm.cu"
diff --git a/sgl-kernel/python/sgl_kernel/sampling.py b/sgl-kernel/python/sgl_kernel/sampling.py
index 59bc8c35184..d4856e52cb4 100644
--- a/sgl-kernel/python/sgl_kernel/sampling.py
+++ b/sgl-kernel/python/sgl_kernel/sampling.py
@@ -1,7 +1,7 @@
 from typing import Optional, Union
 
 import torch
-from sgl_kernel.utils import _to_tensor_scalar_tuple, get_cuda_stream
+from sgl_kernel.utils import _to_tensor_scalar_tuple
 
 
 def _top_k_renorm_probs_internal(