-
Notifications
You must be signed in to change notification settings - Fork 3.4k
Fix a bug in BatchTokenIDOut & Misc style and dependency updates #7457
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 5 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -29,6 +29,7 @@ runtime_common = [ | |
| "msgspec", | ||
| "ninja", | ||
| "orjson", | ||
| "outlines==0.1.11", | ||
| "packaging", | ||
| "partial_json_parser", | ||
| "pillow", | ||
|
|
@@ -50,13 +51,12 @@ runtime_common = [ | |
| srt = [ | ||
| "sglang[runtime_common]", | ||
| "sgl-kernel==0.1.9", | ||
| "flashinfer_python==0.2.6.post1", | ||
| "torch==2.7.1", | ||
| "torchaudio==2.7.1", | ||
| "torchvision==0.22.1", | ||
| "cuda-python", | ||
| "outlines>=0.0.44,<=0.1.11", | ||
| "einops", | ||
| "flashinfer_python==0.2.6.post1", | ||
|
Comment on lines
58
to
+59
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. |
||
| ] | ||
|
|
||
| blackwell = [ | ||
|
|
@@ -66,7 +66,6 @@ blackwell = [ | |
| "torchaudio==2.7.1", | ||
| "torchvision==0.22.1", | ||
| "cuda-python", | ||
| "outlines>=0.0.44,<=0.1.11", | ||
| "einops", | ||
| "flashinfer_python==0.2.6.post1", | ||
|
Comment on lines
69
to
70
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. |
||
| ] | ||
|
|
@@ -77,23 +76,22 @@ srt_hip = [ | |
| "sglang[runtime_common]", | ||
| "torch", | ||
| "vllm==0.6.7.dev2", | ||
| "outlines==0.1.11" | ||
| ] | ||
|
|
||
| # xpu is not enabled in public vllm and torch whl, | ||
| # need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.htmlinstall vllm | ||
| srt_xpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11"] | ||
| srt_xpu = ["sglang[runtime_common]"] | ||
|
|
||
| # For Intel Gaudi(device : hpu) follow the installation guide | ||
| # https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html | ||
| srt_hpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11"] | ||
| srt_hpu = ["sglang[runtime_common]"] | ||
|
|
||
| # CPU: currently, there are no pre-built vllm wheels for CPU. | ||
| # To install vllm for CPU, please follow the instruction here: | ||
| # https://docs.vllm.ai/en/latest/getting_started/installation/cpu/index.html | ||
| srt_cpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11", "einops"] | ||
| srt_cpu = ["sglang[runtime_common]", "einops"] | ||
| # https://vllm-ascend.readthedocs.io/en/latest/installation.html | ||
| srt_npu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11"] | ||
| srt_npu = ["sglang[runtime_common]"] | ||
|
|
||
| openai = ["openai>=1.0", "tiktoken"] | ||
| anthropic = ["anthropic>=0.20.0"] | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -786,6 +786,7 @@ def set_finish_with_abort(self, error_msg: str): | |
| self.multimodal_inputs = None | ||
| self.grammar = None | ||
| self.origin_input_ids = [0] # set it to one token to skip the long prefill | ||
| self.return_logprob = False | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. |
||
| self.finished_reason = FINISH_ABORT( | ||
| error_msg, HTTPStatus.BAD_REQUEST, "BadRequestError" | ||
| ) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1335,7 +1335,14 @@ def check_memory(self): | |
| ) | ||
| raise ValueError(msg) | ||
|
|
||
| if len(self.req_to_token_pool.free_slots) != self.req_to_token_pool.size: | ||
| if self.disaggregation_mode == DisaggregationMode.DECODE: | ||
| req_total_size = ( | ||
| self.req_to_token_pool.size + self.req_to_token_pool.pre_alloc_size | ||
| ) | ||
|
Comment on lines
+1338
to
+1341
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. |
||
| else: | ||
| req_total_size = self.req_to_token_pool.size | ||
|
|
||
| if len(self.req_to_token_pool.free_slots) != req_total_size: | ||
| msg = ( | ||
| "req_to_token_pool memory leak detected!" | ||
| f"available_size={len(self.req_to_token_pool.free_slots)}, " | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1231,7 +1231,7 @@ def _handle_batch_output( | |
| state.last_output_offset = len(state.output_ids) | ||
| else: | ||
| state.output_ids.extend(recv_obj.output_ids[i]) | ||
| output_token_ids = state.output_ids | ||
| output_token_ids = state.output_ids.copy() | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. |
||
|
|
||
| out_dict = { | ||
| "output_ids": output_token_ids, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1917,20 +1917,16 @@ def configure_ipv6(dist_init_addr): | |
| return port, host | ||
|
|
||
|
|
||
| def rank0_log(msg: str): | ||
| from sglang.srt.distributed import get_tensor_model_parallel_rank | ||
|
|
||
| if get_tensor_model_parallel_rank() == 0: | ||
| logger.info(msg) | ||
|
|
||
|
|
||
| def rank0_print(msg: str): | ||
| from sglang.srt.distributed import get_tensor_model_parallel_rank | ||
|
|
||
| if get_tensor_model_parallel_rank() == 0: | ||
| print(msg, flush=True) | ||
|
|
||
|
|
||
| rank0_log = rank0_print | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. |
||
|
|
||
|
|
||
| def get_cuda_version(): | ||
| if torch.version.cuda: | ||
| return tuple(map(int, torch.version.cuda.split("."))) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -39,14 +39,6 @@ find_package(Torch REQUIRED) | |
| # clean Torch Flag | ||
| clear_cuda_arches(CMAKE_FLAG) | ||
|
|
||
| if("${CUDA_VERSION}" VERSION_EQUAL "12.8") | ||
| set(DeepGEMM_REPO "https://github.com/sgl-project/DeepGEMM") | ||
| set(DeepGEMM_TAG "blackwell") | ||
| else() | ||
| set(DeepGEMM_REPO "https://github.com/deepseek-ai/DeepGEMM") | ||
| set(DeepGEMM_TAG "8dfa3298274bfe6b242f6f8a3e6f3eff2707dd9f") | ||
| endif() | ||
|
|
||
| include(FetchContent) | ||
|
|
||
| # cutlass | ||
|
|
@@ -57,7 +49,16 @@ FetchContent_Declare( | |
| GIT_SHALLOW OFF | ||
| ) | ||
| FetchContent_Populate(repo-cutlass) | ||
|
|
||
| # DeepGEMM | ||
| if("${CUDA_VERSION}" VERSION_EQUAL "12.8") | ||
| set(DeepGEMM_REPO "https://github.com/sgl-project/DeepGEMM") | ||
| set(DeepGEMM_TAG "blackwell") | ||
| else() | ||
| set(DeepGEMM_REPO "https://github.com/deepseek-ai/DeepGEMM") | ||
| set(DeepGEMM_TAG "8dfa3298274bfe6b242f6f8a3e6f3eff2707dd9f") | ||
| endif() | ||
|
|
||
| FetchContent_Declare( | ||
| repo-deepgemm | ||
| GIT_REPOSITORY ${DeepGEMM_REPO} | ||
|
|
@@ -107,7 +108,6 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") | |
| message(STATUS "For aarch64, disable gencode below SM90 by default") | ||
| endif() | ||
|
|
||
|
|
||
| include_directories( | ||
| ${PROJECT_SOURCE_DIR}/include | ||
| ${PROJECT_SOURCE_DIR}/csrc | ||
|
|
@@ -247,8 +247,8 @@ set(SOURCES | |
| "csrc/moe/ep_moe_reorder_kernel.cu" | ||
| "csrc/moe/ep_moe_silu_and_mul_kernel.cu" | ||
| "csrc/speculative/eagle_utils.cu" | ||
| "csrc/speculative/speculative_sampling.cu" | ||
| "csrc/speculative/packbit.cu" | ||
| "csrc/speculative/speculative_sampling.cu" | ||
|
Comment on lines
250
to
+251
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. |
||
| "csrc/grammar/apply_token_bitmask_inplace_cuda.cu" | ||
| "csrc/common_extension.cc" | ||
| "${repo-flashinfer_SOURCE_DIR}/csrc/norm.cu" | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,7 +1,7 @@ | ||
| from typing import Optional, Union | ||
|
|
||
| import torch | ||
| from sgl_kernel.utils import _to_tensor_scalar_tuple, get_cuda_stream | ||
| from sgl_kernel.utils import _to_tensor_scalar_tuple | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. |
||
|
|
||
|
|
||
| def _top_k_renorm_probs_internal( | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Adding `outlines` as a direct dependency here might cause conflicts with the version ranges specified later. Consider removing it to avoid potential issues.