From 7b7b1117e14da493aa8b08443b492b2a43191fc8 Mon Sep 17 00:00:00 2001
From: yhyang201
Date: Fri, 20 Jun 2025 14:36:15 +0800
Subject: [PATCH 1/5] [OAI Server Refactor] [ChatCompletions & Completions]
 Implement UsageInfo

---
 .../srt/entrypoints/openai/api_server.py      | 11 +++
 .../srt/entrypoints/openai/serving_base.py    | 27 --------
 .../srt/entrypoints/openai/serving_chat.py    |  6 +-
 .../entrypoints/openai/serving_completions.py |  6 +-
 .../srt/entrypoints/openai/usage_processor.py | 69 +++++++++++++++++++
 5 files changed, 88 insertions(+), 31 deletions(-)
 create mode 100644 python/sglang/srt/entrypoints/openai/usage_processor.py

diff --git a/python/sglang/srt/entrypoints/openai/api_server.py b/python/sglang/srt/entrypoints/openai/api_server.py
index b575275aec2..a3164339563 100644
--- a/python/sglang/srt/entrypoints/openai/api_server.py
+++ b/python/sglang/srt/entrypoints/openai/api_server.py
@@ -192,6 +192,17 @@ async def v1_score_request(raw_request: Request):
     pass
 
 
+@app.api_route("/v1/models/{model_id}", methods=["GET"])
+async def show_model_detail(model_id: str):
+    served_model_name = app.state.tokenizer_manager.served_model_name
+
+    return ModelCard(
+        id=served_model_name,
+        root=served_model_name,
+        max_model_len=app.state.tokenizer_manager.model_config.context_len,
+    )
+
+
 # Additional API endpoints will be implemented in separate serving_*.py modules
 # and mounted as APIRouters in future PRs
 
diff --git a/python/sglang/srt/entrypoints/openai/serving_base.py b/python/sglang/srt/entrypoints/openai/serving_base.py
index 7d26d1707a2..8e22c26c485 100644
--- a/python/sglang/srt/entrypoints/openai/serving_base.py
+++ b/python/sglang/srt/entrypoints/openai/serving_base.py
@@ -114,33 +114,6 @@ def _validate_request(self, request: OpenAIServingRequest) -> Optional[str]:
         """Validate request"""
         pass
 
-    def _calculate_streaming_usage_base(
-        self,
-        prompt_tokens: Dict[int, int],
-        completion_tokens: Dict[int, int],
-        cached_tokens: Dict[int, int],
-        n_choices: int,
-    ) -> UsageInfo:
-        """Calculate usage information for streaming responses (common logic)"""
-        total_prompt_tokens = sum(
-            tokens for i, tokens in prompt_tokens.items() if i % n_choices == 0
-        )
-        total_completion_tokens = sum(tokens for tokens in completion_tokens.values())
-
-        cache_report = self.tokenizer_manager.server_args.enable_cache_report
-        prompt_tokens_details = None
-        if cache_report:
-            cached_tokens_sum = sum(tokens for tokens in cached_tokens.values())
-            if cached_tokens_sum > 0:
-                prompt_tokens_details = {"cached_tokens": cached_tokens_sum}
-
-        return UsageInfo(
-            prompt_tokens=total_prompt_tokens,
-            completion_tokens=total_completion_tokens,
-            total_tokens=total_prompt_tokens + total_completion_tokens,
-            prompt_tokens_details=prompt_tokens_details,
-        )
-
     def create_error_response(
         self,
         message: str,
diff --git a/python/sglang/srt/entrypoints/openai/serving_chat.py b/python/sglang/srt/entrypoints/openai/serving_chat.py
index 0465b59e9ce..49757a8a1d9 100644
--- a/python/sglang/srt/entrypoints/openai/serving_chat.py
+++ b/python/sglang/srt/entrypoints/openai/serving_chat.py
@@ -26,8 +26,8 @@
     TopLogprob,
 )
 from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
+from sglang.srt.entrypoints.openai.usage_processor import UsageProcessor
 from sglang.srt.entrypoints.openai.utils import (
-    aggregate_token_usage,
     detect_template_content_format,
     process_content_for_template_format,
     to_openai_style_logprobs,
@@ -658,7 +658,9 @@ def _build_chat_response(
 
         # Calculate usage
        cache_report = self.tokenizer_manager.server_args.enable_cache_report
-        usage = aggregate_token_usage(ret, request.n, cache_report)
+        usage = UsageProcessor.calculate_response_usage(
+            ret, n_choices=request.n, enable_cache_report=cache_report
+        )
 
         return ChatCompletionResponse(
             id=ret[0]["meta_info"]["id"],
diff --git a/python/sglang/srt/entrypoints/openai/serving_completions.py b/python/sglang/srt/entrypoints/openai/serving_completions.py
index 20725987bc2..126d7cb6743 100644
--- a/python/sglang/srt/entrypoints/openai/serving_completions.py
+++ b/python/sglang/srt/entrypoints/openai/serving_completions.py
@@ -18,8 +18,8 @@
     ErrorResponse,
 )
 from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
+from sglang.srt.entrypoints.openai.usage_processor import UsageProcessor
 from sglang.srt.entrypoints.openai.utils import (
-    aggregate_token_usage,
     to_openai_style_logprobs,
 )
 from sglang.srt.managers.io_struct import GenerateReqInput
@@ -322,7 +322,9 @@ def _build_completion_response(
 
         # Calculate usage
         cache_report = self.tokenizer_manager.server_args.enable_cache_report
-        usage = aggregate_token_usage(ret, request.n, cache_report)
+        usage = UsageProcessor.calculate_response_usage(
+            ret, n_choices=request.n, enable_cache_report=cache_report
+        )
 
         return CompletionResponse(
             id=ret[0]["meta_info"]["id"],
diff --git a/python/sglang/srt/entrypoints/openai/usage_processor.py b/python/sglang/srt/entrypoints/openai/usage_processor.py
new file mode 100644
index 00000000000..36b8547a56b
--- /dev/null
+++ b/python/sglang/srt/entrypoints/openai/usage_processor.py
@@ -0,0 +1,69 @@
+from __future__ import annotations
+
+from typing import Any, Dict, List, Mapping, Optional
+
+from sglang.srt.entrypoints.openai.utils import calculate_token_usage
+
+
+def _details_if_cached(count: int) -> Optional[Dict[str, int]]:
+    """Return {"cached_tokens": N} only when N > 0 (keeps JSON slim)."""
+    return {"cached_tokens": count} if count > 0 else None
+
+
+class UsageProcessor:
+    """Stateless helpers that turn raw token counts into a UsageInfo."""
+
+    @classmethod
+    def calculate_response_usage(
+        cls,
+        responses: List[Dict[str, Any]],
+        *,
+        n_choices: int = 1,
+        enable_cache_report: bool = False,
+    ):
+        completion = sum(r["meta_info"]["completion_tokens"] for r in responses)
+
+        prompt = sum(
+            responses[i]["meta_info"]["prompt_tokens"]
+            for i in range(0, len(responses), n_choices)
+        )
+
+        cached_details = None
+        if enable_cache_report:
+            cached_total = sum(
+                r["meta_info"].get("cached_tokens", 0) for r in responses
+            )
+            cached_details = _details_if_cached(cached_total)
+
+        return calculate_token_usage(
+            prompt_tokens=prompt,
+            completion_tokens=completion,
+            cached_tokens=cached_details,
+        )
+
+    @classmethod
+    def calculate_streaming_usage(
+        cls,
+        prompt_tokens: Mapping[int, int],
+        completion_tokens: Mapping[int, int],
+        cached_tokens: Mapping[int, int],
+        *,
+        n_choices: int,
+        enable_cache_report: bool = False,
+    ):
+        # index % n_choices==0 marks the first choice of a prompt
+        prompt = sum(tok for idx, tok in prompt_tokens.items() if idx % n_choices == 0)
+
+        completion = sum(completion_tokens.values())
+
+        cached_details = (
+            _details_if_cached(sum(cached_tokens.values()))
+            if enable_cache_report
+            else None
+        )
+
+        return calculate_token_usage(
+            prompt_tokens=prompt,
+            completion_tokens=completion,
+            cached_tokens=cached_details,
+        )

From c773424e46211b6dfde9d84dbf133e2192ba7666 Mon Sep 17 00:00:00 2001
From: yhyang201
Date: Fri, 20 Jun 2025 22:30:30 +0800
Subject: [PATCH 2/5] fix lint

---
 python/sglang/srt/entrypoints/openai/serving_completions.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/python/sglang/srt/entrypoints/openai/serving_completions.py b/python/sglang/srt/entrypoints/openai/serving_completions.py
index 126d7cb6743..9002f8dd2b5 100644
--- a/python/sglang/srt/entrypoints/openai/serving_completions.py
+++ b/python/sglang/srt/entrypoints/openai/serving_completions.py
@@ -19,9 +19,7 @@
 )
 from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
 from sglang.srt.entrypoints.openai.usage_processor import UsageProcessor
-from sglang.srt.entrypoints.openai.utils import (
-    to_openai_style_logprobs,
-)
+from sglang.srt.entrypoints.openai.utils import to_openai_style_logprobs
 from sglang.srt.managers.io_struct import GenerateReqInput
 
 logger = logging.getLogger(__name__)

From 4333c585794340bc64c9a15f0fa118170cd44429 Mon Sep 17 00:00:00 2001
From: yhyang201
Date: Sat, 21 Jun 2025 01:50:11 +0800
Subject: [PATCH 3/5] rename variables for improved readability

---
 .../srt/entrypoints/openai/usage_processor.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/python/sglang/srt/entrypoints/openai/usage_processor.py b/python/sglang/srt/entrypoints/openai/usage_processor.py
index 36b8547a56b..bacfaa78a66 100644
--- a/python/sglang/srt/entrypoints/openai/usage_processor.py
+++ b/python/sglang/srt/entrypoints/openai/usage_processor.py
@@ -21,9 +21,9 @@ def calculate_response_usage(
         n_choices: int = 1,
         enable_cache_report: bool = False,
     ):
-        completion = sum(r["meta_info"]["completion_tokens"] for r in responses)
+        completion_tokens = sum(r["meta_info"]["completion_tokens"] for r in responses)
 
-        prompt = sum(
+        prompt_tokens = sum(
             responses[i]["meta_info"]["prompt_tokens"]
             for i in range(0, len(responses), n_choices)
         )
@@ -36,8 +36,8 @@ def calculate_response_usage(
             cached_details = _details_if_cached(cached_total)
 
         return calculate_token_usage(
-            prompt_tokens=prompt,
-            completion_tokens=completion,
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
             cached_tokens=cached_details,
         )
 
@@ -52,9 +52,11 @@ def calculate_streaming_usage(
         enable_cache_report: bool = False,
     ):
         # index % n_choices==0 marks the first choice of a prompt
-        prompt = sum(tok for idx, tok in prompt_tokens.items() if idx % n_choices == 0)
+        total_prompt_tokens = sum(
+            tok for idx, tok in prompt_tokens.items() if idx % n_choices == 0
+        )
 
-        completion = sum(completion_tokens.values())
+        total_completion_tokens = sum(completion_tokens.values())
 
         cached_details = (
             _details_if_cached(sum(cached_tokens.values()))
@@ -63,7 +65,7 @@ def calculate_streaming_usage(
         )
 
         return calculate_token_usage(
-            prompt_tokens=prompt,
-            completion_tokens=completion,
+            prompt_tokens=total_prompt_tokens,
+            completion_tokens=total_completion_tokens,
             cached_tokens=cached_details,
         )

From a14375308c92cc9f88f6f0436ee822e59fa37cd5 Mon Sep 17 00:00:00 2001
From: Chang Su
Date: Fri, 20 Jun 2025 19:34:05 +0000
Subject: [PATCH 4/5] Remove unused function in utils.py and move
 `calculate_token_usage` to UsageProcessor

---
 .../srt/entrypoints/openai/usage_processor.py | 52 +++++++++-------
 python/sglang/srt/entrypoints/openai/utils.py | 59 +------------------
 2 files changed, 32 insertions(+), 79 deletions(-)

diff --git a/python/sglang/srt/entrypoints/openai/usage_processor.py b/python/sglang/srt/entrypoints/openai/usage_processor.py
index bacfaa78a66..c8136829416 100644
--- a/python/sglang/srt/entrypoints/openai/usage_processor.py
+++ b/python/sglang/srt/entrypoints/openai/usage_processor.py
@@ -1,26 +1,25 @@
 from __future__ import annotations
 
-from typing import Any, Dict, List, Mapping, Optional
+from typing import Any, Dict, List, Mapping, Optional, final
 
-from sglang.srt.entrypoints.openai.utils import calculate_token_usage
-
-
-def _details_if_cached(count: int) -> Optional[Dict[str, int]]:
-    """Return {"cached_tokens": N} only when N > 0 (keeps JSON slim)."""
-    return {"cached_tokens": count} if count > 0 else None
+from sglang.srt.entrypoints.openai.protocol import UsageInfo
 
 
+@final
 class UsageProcessor:
     """Stateless helpers that turn raw token counts into a UsageInfo."""
 
-    @classmethod
+    @staticmethod
+    def _details_if_cached(count: int) -> Optional[Dict[str, int]]:
+        """Return {"cached_tokens": N} only when N > 0 (keeps JSON slim)."""
+        return {"cached_tokens": count} if count > 0 else None
+
+    @staticmethod
     def calculate_response_usage(
-        cls,
         responses: List[Dict[str, Any]],
-        *,
         n_choices: int = 1,
         enable_cache_report: bool = False,
-    ):
+    ) -> UsageInfo:
         completion_tokens = sum(r["meta_info"]["completion_tokens"] for r in responses)
 
         prompt_tokens = sum(
@@ -33,39 +32,50 @@ def calculate_response_usage(
             cached_total = sum(
                 r["meta_info"].get("cached_tokens", 0) for r in responses
             )
-            cached_details = _details_if_cached(cached_total)
+            cached_details = UsageProcessor._details_if_cached(cached_total)
 
-        return calculate_token_usage(
+        return UsageProcessor.calculate_token_usage(
             prompt_tokens=prompt_tokens,
             completion_tokens=completion_tokens,
             cached_tokens=cached_details,
         )
 
-    @classmethod
+    @staticmethod
     def calculate_streaming_usage(
-        cls,
         prompt_tokens: Mapping[int, int],
         completion_tokens: Mapping[int, int],
         cached_tokens: Mapping[int, int],
-        *,
         n_choices: int,
         enable_cache_report: bool = False,
-    ):
+    ) -> UsageInfo:
-        # index % n_choices==0 marks the first choice of a prompt
+        # index % n_choices == 0 marks the first choice of a prompt
         total_prompt_tokens = sum(
             tok for idx, tok in prompt_tokens.items() if idx % n_choices == 0
         )
-
         total_completion_tokens = sum(completion_tokens.values())
 
         cached_details = (
-            _details_if_cached(sum(cached_tokens.values()))
+            UsageProcessor._details_if_cached(sum(cached_tokens.values()))
             if enable_cache_report
             else None
         )
 
-        return calculate_token_usage(
+        return UsageProcessor.calculate_token_usage(
             prompt_tokens=total_prompt_tokens,
             completion_tokens=total_completion_tokens,
             cached_tokens=cached_details,
         )
+
+    @staticmethod
+    def calculate_token_usage(
+        prompt_tokens: int,
+        completion_tokens: int,
+        cached_tokens: Optional[Dict[str, int]] = None,
+    ) -> UsageInfo:
+        """Calculate token usage information"""
+        return UsageInfo(
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
+            total_tokens=prompt_tokens + completion_tokens,
+            prompt_tokens_details=cached_tokens,
+        )
diff --git a/python/sglang/srt/entrypoints/openai/utils.py b/python/sglang/srt/entrypoints/openai/utils.py
index 53c67831cdb..06e5e4dee10 100644
--- a/python/sglang/srt/entrypoints/openai/utils.py
+++ b/python/sglang/srt/entrypoints/openai/utils.py
@@ -1,10 +1,9 @@
 import logging
-from typing import Any, Dict, List, Optional
 
 import jinja2.nodes
 import transformers.utils.chat_template_utils as hf_chat_utils
 
-from sglang.srt.entrypoints.openai.protocol import LogProbs, UsageInfo
+from sglang.srt.entrypoints.openai.protocol import LogProbs
 
 logger = logging.getLogger(__name__)
 
@@ -171,62 +170,6 @@ def process_content_for_template_format(
     return new_msg
 
 
-def calculate_token_usage(
-    prompt_tokens: int,
-    completion_tokens: int,
-    cached_tokens: Optional[Dict[str, int]] = None,
-) -> UsageInfo:
-    """Calculate token usage information"""
-    return UsageInfo(
-        prompt_tokens=prompt_tokens,
-        completion_tokens=completion_tokens,
-        total_tokens=prompt_tokens + completion_tokens,
-        prompt_tokens_details=cached_tokens,
-    )
-
-
-def aggregate_token_usage(
-    responses: List[Dict[str, Any]],
-    n_choices: int = 1,
-    enable_cache_report: bool = False,
-) -> UsageInfo:
-    """Aggregate token usage from multiple responses
-
-    Args:
-        responses: List of response dictionaries with meta_info
-        n_choices: Number of choices per request (for prompt token counting)
-        enable_cache_report: Whether to include cached token details
-
-    Returns:
-        Aggregated UsageInfo
-    """
-    # Sum completion tokens from all responses
-    completion_tokens = sum(
-        response["meta_info"]["completion_tokens"] for response in responses
-    )
-
-    # For prompt tokens, only count every n_choices-th response to avoid double counting
-    prompt_tokens = sum(
-        responses[i]["meta_info"]["prompt_tokens"]
-        for i in range(0, len(responses), n_choices)
-    )
-
-    # Handle cached tokens if cache reporting is enabled
-    cached_tokens_details = None
-    if enable_cache_report:
-        cached_tokens_sum = sum(
-            response["meta_info"].get("cached_tokens", 0) for response in responses
-        )
-        if cached_tokens_sum > 0:
-            cached_tokens_details = {"cached_tokens": cached_tokens_sum}
-
-    return calculate_token_usage(
-        prompt_tokens=prompt_tokens,
-        completion_tokens=completion_tokens,
-        cached_tokens=cached_tokens_details,
-    )
-
-
 def to_openai_style_logprobs(
     input_token_logprobs=None,
     output_token_logprobs=None,

From 4c5be59ec07211c2f358f693259713acc0015b44 Mon Sep 17 00:00:00 2001
From: Chang Su
Date: Fri, 20 Jun 2025 21:38:13 +0000
Subject: [PATCH 5/5] Resolving rebase error - streaming usage should use
 UsageProcessor as well

---
 python/sglang/srt/entrypoints/openai/serving_chat.py        | 5 +++--
 python/sglang/srt/entrypoints/openai/serving_completions.py | 5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/python/sglang/srt/entrypoints/openai/serving_chat.py b/python/sglang/srt/entrypoints/openai/serving_chat.py
index 49757a8a1d9..98e622819e3 100644
--- a/python/sglang/srt/entrypoints/openai/serving_chat.py
+++ b/python/sglang/srt/entrypoints/openai/serving_chat.py
@@ -546,11 +546,12 @@ async def _generate_chat_stream(
 
         # Additional usage chunk
         if request.stream_options and request.stream_options.include_usage:
-            usage = self._calculate_streaming_usage_base(
+            usage = UsageProcessor.calculate_streaming_usage(
                 prompt_tokens,
                 completion_tokens,
                 cached_tokens,
-                request.n,
+                n_choices=request.n,
+                enable_cache_report=self.tokenizer_manager.server_args.enable_cache_report,
             )
             usage_chunk = ChatCompletionStreamResponse(
                 id=content["meta_info"]["id"],
diff --git a/python/sglang/srt/entrypoints/openai/serving_completions.py b/python/sglang/srt/entrypoints/openai/serving_completions.py
index 9002f8dd2b5..eea6dbccc1b 100644
--- a/python/sglang/srt/entrypoints/openai/serving_completions.py
+++ b/python/sglang/srt/entrypoints/openai/serving_completions.py
@@ -212,11 +212,12 @@ async def _generate_completion_stream(
 
         # Handle final usage chunk
         if request.stream_options and request.stream_options.include_usage:
-            usage = self._calculate_streaming_usage_base(
+            usage = UsageProcessor.calculate_streaming_usage(
                 prompt_tokens,
                 completion_tokens,
                 cached_tokens,
-                request.n,
+                n_choices=request.n,
+                enable_cache_report=self.tokenizer_manager.server_args.enable_cache_report,
             )
             final_usage_chunk = CompletionStreamResponse(
                 id=content["meta_info"]["id"],
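
Reviewer note (not part of the series): below is a minimal usage sketch of the final UsageProcessor API, assuming the five patches above are applied to an sglang checkout. The meta_info payloads are fabricated purely for illustration.

from sglang.srt.entrypoints.openai.usage_processor import UsageProcessor

# Illustrative input: two choices (n=2) generated for a single 12-token prompt.
responses = [
    {"meta_info": {"prompt_tokens": 12, "completion_tokens": 7, "cached_tokens": 4}},
    {"meta_info": {"prompt_tokens": 12, "completion_tokens": 9, "cached_tokens": 4}},
]

usage = UsageProcessor.calculate_response_usage(
    responses, n_choices=2, enable_cache_report=True
)
# The prompt is counted once per request (12), completions are summed across
# choices (7 + 9 = 16), and prompt_tokens_details carries {"cached_tokens": 8}
# because cache reporting is enabled.
assert usage.prompt_tokens == 12
assert usage.completion_tokens == 16
assert usage.total_tokens == 28

The streaming variant, calculate_streaming_usage, takes per-index token dicts instead of a response list and applies the same de-duplication: only indices with idx % n_choices == 0 contribute prompt tokens.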