From 7b7b1117e14da493aa8b08443b492b2a43191fc8 Mon Sep 17 00:00:00 2001
From: yhyang201
Date: Fri, 20 Jun 2025 14:36:15 +0800
Subject: [PATCH 1/5] [OAI Server Refactor] [ChatCompletions & Completions]
 Implement UsageInfo

---
 .../srt/entrypoints/openai/api_server.py      | 11 +++
 .../srt/entrypoints/openai/serving_base.py    | 27 --------
 .../srt/entrypoints/openai/serving_chat.py    |  6 +-
 .../entrypoints/openai/serving_completions.py |  6 +-
 .../srt/entrypoints/openai/usage_processor.py | 69 +++++++++++++++++++
 5 files changed, 88 insertions(+), 31 deletions(-)
 create mode 100644 python/sglang/srt/entrypoints/openai/usage_processor.py

diff --git a/python/sglang/srt/entrypoints/openai/api_server.py b/python/sglang/srt/entrypoints/openai/api_server.py
index b575275aec2..a3164339563 100644
--- a/python/sglang/srt/entrypoints/openai/api_server.py
+++ b/python/sglang/srt/entrypoints/openai/api_server.py
@@ -192,6 +192,17 @@ async def v1_score_request(raw_request: Request):
     pass
 
 
+@app.api_route("/v1/models/{model_id}", methods=["GET"])
+async def show_model_detail(model_id: str):
+    served_model_name = app.state.tokenizer_manager.served_model_name
+
+    return ModelCard(
+        id=served_model_name,
+        root=served_model_name,
+        max_model_len=app.state.tokenizer_manager.model_config.context_len,
+    )
+
+
 # Additional API endpoints will be implemented in separate serving_*.py modules
 # and mounted as APIRouters in future PRs
 
diff --git a/python/sglang/srt/entrypoints/openai/serving_base.py b/python/sglang/srt/entrypoints/openai/serving_base.py
index 7d26d1707a2..8e22c26c485 100644
--- a/python/sglang/srt/entrypoints/openai/serving_base.py
+++ b/python/sglang/srt/entrypoints/openai/serving_base.py
@@ -114,33 +114,6 @@ def _validate_request(self, request: OpenAIServingRequest) -> Optional[str]:
         """Validate request"""
         pass
 
-    def _calculate_streaming_usage_base(
-        self,
-        prompt_tokens: Dict[int, int],
-        completion_tokens: Dict[int, int],
-        cached_tokens: Dict[int, int],
-        n_choices: int,
-    ) -> UsageInfo:
-        """Calculate usage information for streaming responses (common logic)"""
-        total_prompt_tokens = sum(
-            tokens for i, tokens in prompt_tokens.items() if i % n_choices == 0
-        )
-        total_completion_tokens = sum(tokens for tokens in completion_tokens.values())
-
-        cache_report = self.tokenizer_manager.server_args.enable_cache_report
-        prompt_tokens_details = None
-        if cache_report:
-            cached_tokens_sum = sum(tokens for tokens in cached_tokens.values())
-            if cached_tokens_sum > 0:
-                prompt_tokens_details = {"cached_tokens": cached_tokens_sum}
-
-        return UsageInfo(
-            prompt_tokens=total_prompt_tokens,
-            completion_tokens=total_completion_tokens,
-            total_tokens=total_prompt_tokens + total_completion_tokens,
-            prompt_tokens_details=prompt_tokens_details,
-        )
-
     def create_error_response(
         self,
         message: str,
diff --git a/python/sglang/srt/entrypoints/openai/serving_chat.py b/python/sglang/srt/entrypoints/openai/serving_chat.py
index 0465b59e9ce..49757a8a1d9 100644
--- a/python/sglang/srt/entrypoints/openai/serving_chat.py
+++ b/python/sglang/srt/entrypoints/openai/serving_chat.py
@@ -26,8 +26,8 @@
     TopLogprob,
 )
 from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
+from sglang.srt.entrypoints.openai.usage_processor import UsageProcessor
 from sglang.srt.entrypoints.openai.utils import (
-    aggregate_token_usage,
     detect_template_content_format,
     process_content_for_template_format,
     to_openai_style_logprobs,
@@ -658,7 +658,9 @@ def _build_chat_response(
 
         # Calculate usage
        cache_report = self.tokenizer_manager.server_args.enable_cache_report
-        usage = aggregate_token_usage(ret, request.n, cache_report)
+        usage = UsageProcessor.calculate_response_usage(
+            ret, n_choices=request.n, enable_cache_report=cache_report
+        )
 
         return ChatCompletionResponse(
             id=ret[0]["meta_info"]["id"],
diff --git a/python/sglang/srt/entrypoints/openai/serving_completions.py b/python/sglang/srt/entrypoints/openai/serving_completions.py
index 20725987bc2..126d7cb6743 100644
--- a/python/sglang/srt/entrypoints/openai/serving_completions.py
+++ b/python/sglang/srt/entrypoints/openai/serving_completions.py
@@ -18,8 +18,8 @@
     ErrorResponse,
 )
 from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
+from sglang.srt.entrypoints.openai.usage_processor import UsageProcessor
 from sglang.srt.entrypoints.openai.utils import (
-    aggregate_token_usage,
     to_openai_style_logprobs,
 )
 from sglang.srt.managers.io_struct import GenerateReqInput
@@ -322,7 +322,9 @@ def _build_completion_response(
 
         # Calculate usage
         cache_report = self.tokenizer_manager.server_args.enable_cache_report
-        usage = aggregate_token_usage(ret, request.n, cache_report)
+        usage = UsageProcessor.calculate_response_usage(
+            ret, n_choices=request.n, enable_cache_report=cache_report
+        )
 
         return CompletionResponse(
             id=ret[0]["meta_info"]["id"],
diff --git a/python/sglang/srt/entrypoints/openai/usage_processor.py b/python/sglang/srt/entrypoints/openai/usage_processor.py
new file mode 100644
index 00000000000..36b8547a56b
--- /dev/null
+++ b/python/sglang/srt/entrypoints/openai/usage_processor.py
@@ -0,0 +1,69 @@
+from __future__ import annotations
+
+from typing import Any, Dict, List, Mapping, Optional
+
+from sglang.srt.entrypoints.openai.utils import calculate_token_usage
+
+
+def _details_if_cached(count: int) -> Optional[Dict[str, int]]:
+    """Return {"cached_tokens": N} only when N > 0 (keeps JSON slim)."""
+    return {"cached_tokens": count} if count > 0 else None
+
+
+class UsageProcessor:
+    """Stateless helpers that turn raw token counts into a UsageInfo."""
+
+    @classmethod
+    def calculate_response_usage(
+        cls,
+        responses: List[Dict[str, Any]],
+        *,
+        n_choices: int = 1,
+        enable_cache_report: bool = False,
+    ):
+        completion = sum(r["meta_info"]["completion_tokens"] for r in responses)
+
+        prompt = sum(
+            responses[i]["meta_info"]["prompt_tokens"]
+            for i in range(0, len(responses), n_choices)
+        )
+
+        cached_details = None
+        if enable_cache_report:
+            cached_total = sum(
+                r["meta_info"].get("cached_tokens", 0) for r in responses
+            )
+            cached_details = _details_if_cached(cached_total)
+
+        return calculate_token_usage(
+            prompt_tokens=prompt,
+            completion_tokens=completion,
+            cached_tokens=cached_details,
+        )
+
+    @classmethod
+    def calculate_streaming_usage(
+        cls,
+        prompt_tokens: Mapping[int, int],
+        completion_tokens: Mapping[int, int],
+        cached_tokens: Mapping[int, int],
+        *,
+        n_choices: int,
+        enable_cache_report: bool = False,
+    ):
+        # index % n_choices==0 marks the first choice of a prompt
+        prompt = sum(tok for idx, tok in prompt_tokens.items() if idx % n_choices == 0)
+
+        completion = sum(completion_tokens.values())
+
+        cached_details = (
+            _details_if_cached(sum(cached_tokens.values()))
+            if enable_cache_report
+            else None
+        )
+
+        return calculate_token_usage(
+            prompt_tokens=prompt,
+            completion_tokens=completion,
+            cached_tokens=cached_details,
+        )

From c773424e46211b6dfde9d84dbf133e2192ba7666 Mon Sep 17 00:00:00 2001
From: yhyang201
Date: Fri, 20 Jun 2025 22:30:30 +0800
Subject: [PATCH 2/5] fix lint

---
 python/sglang/srt/entrypoints/openai/serving_completions.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/python/sglang/srt/entrypoints/openai/serving_completions.py b/python/sglang/srt/entrypoints/openai/serving_completions.py
index 126d7cb6743..9002f8dd2b5 100644
--- a/python/sglang/srt/entrypoints/openai/serving_completions.py
+++ b/python/sglang/srt/entrypoints/openai/serving_completions.py
@@ -19,9 +19,7 @@
 )
 from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
 from sglang.srt.entrypoints.openai.usage_processor import UsageProcessor
-from sglang.srt.entrypoints.openai.utils import (
-    to_openai_style_logprobs,
-)
+from sglang.srt.entrypoints.openai.utils import to_openai_style_logprobs
 from sglang.srt.managers.io_struct import GenerateReqInput
 
 logger = logging.getLogger(__name__)

From 4333c585794340bc64c9a15f0fa118170cd44429 Mon Sep 17 00:00:00 2001
From: yhyang201
Date: Sat, 21 Jun 2025 01:50:11 +0800
Subject: [PATCH 3/5] rename variables for improved readability

---
 .../srt/entrypoints/openai/usage_processor.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/python/sglang/srt/entrypoints/openai/usage_processor.py b/python/sglang/srt/entrypoints/openai/usage_processor.py
index 36b8547a56b..bacfaa78a66 100644
--- a/python/sglang/srt/entrypoints/openai/usage_processor.py
+++ b/python/sglang/srt/entrypoints/openai/usage_processor.py
@@ -21,9 +21,9 @@ def calculate_response_usage(
         n_choices: int = 1,
         enable_cache_report: bool = False,
     ):
-        completion = sum(r["meta_info"]["completion_tokens"] for r in responses)
+        completion_tokens = sum(r["meta_info"]["completion_tokens"] for r in responses)
 
-        prompt = sum(
+        prompt_tokens = sum(
             responses[i]["meta_info"]["prompt_tokens"]
             for i in range(0, len(responses), n_choices)
         )
@@ -36,8 +36,8 @@ def calculate_response_usage(
             cached_details = _details_if_cached(cached_total)
 
         return calculate_token_usage(
-            prompt_tokens=prompt,
-            completion_tokens=completion,
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
             cached_tokens=cached_details,
         )
 
@@ -52,9 +52,11 @@ def calculate_streaming_usage(
         enable_cache_report: bool = False,
     ):
         # index % n_choices==0 marks the first choice of a prompt
-        prompt = sum(tok for idx, tok in prompt_tokens.items() if idx % n_choices == 0)
+        total_prompt_tokens = sum(
+            tok for idx, tok in prompt_tokens.items() if idx % n_choices == 0
+        )
 
-        completion = sum(completion_tokens.values())
+        total_completion_tokens = sum(completion_tokens.values())
 
         cached_details = (
             _details_if_cached(sum(cached_tokens.values()))
@@ -63,7 +65,7 @@ def calculate_streaming_usage(
         )
 
         return calculate_token_usage(
-            prompt_tokens=prompt,
-            completion_tokens=completion,
+            prompt_tokens=total_prompt_tokens,
+            completion_tokens=total_completion_tokens,
             cached_tokens=cached_details,
         )

From a14375308c92cc9f88f6f0436ee822e59fa37cd5 Mon Sep 17 00:00:00 2001
From: Chang Su
Date: Fri, 20 Jun 2025 19:34:05 +0000
Subject: [PATCH 4/5] Remove unused function in utils.py and move
 `calculate_token_usage` to UsageProcessor

---
 .../srt/entrypoints/openai/usage_processor.py | 52 +++++++++-------
 python/sglang/srt/entrypoints/openai/utils.py | 59 +------------------
 2 files changed, 32 insertions(+), 79 deletions(-)

diff --git a/python/sglang/srt/entrypoints/openai/usage_processor.py b/python/sglang/srt/entrypoints/openai/usage_processor.py
index bacfaa78a66..c8136829416 100644
--- a/python/sglang/srt/entrypoints/openai/usage_processor.py
+++ b/python/sglang/srt/entrypoints/openai/usage_processor.py
@@ -1,26 +1,25 @@
 from __future__ import annotations
 
-from typing import Any, Dict, List, Mapping, Optional
+from typing import Any, Dict, List, Mapping, Optional, final
 
-from sglang.srt.entrypoints.openai.utils import calculate_token_usage
-
-
-def _details_if_cached(count: int) -> Optional[Dict[str, int]]:
-    """Return {"cached_tokens": N} only when N > 0 (keeps JSON slim)."""
-    return {"cached_tokens": count} if count > 0 else None
+from sglang.srt.entrypoints.openai.protocol import UsageInfo
 
 
+@final
 class UsageProcessor:
     """Stateless helpers that turn raw token counts into a UsageInfo."""
 
-    @classmethod
+    @staticmethod
+    def _details_if_cached(count: int) -> Optional[Dict[str, int]]:
+        """Return {"cached_tokens": N} only when N > 0 (keeps JSON slim)."""
+        return {"cached_tokens": count} if count > 0 else None
+
+    @staticmethod
     def calculate_response_usage(
-        cls,
         responses: List[Dict[str, Any]],
-        *,
         n_choices: int = 1,
         enable_cache_report: bool = False,
-    ):
+    ) -> UsageInfo:
         completion_tokens = sum(r["meta_info"]["completion_tokens"] for r in responses)
 
         prompt_tokens = sum(
@@ -33,39 +32,50 @@ def calculate_response_usage(
             cached_total = sum(
                 r["meta_info"].get("cached_tokens", 0) for r in responses
             )
-            cached_details = _details_if_cached(cached_total)
+            cached_details = UsageProcessor._details_if_cached(cached_total)
 
-        return calculate_token_usage(
+        return UsageProcessor.calculate_token_usage(
             prompt_tokens=prompt_tokens,
             completion_tokens=completion_tokens,
             cached_tokens=cached_details,
         )
 
-    @classmethod
+    @staticmethod
     def calculate_streaming_usage(
-        cls,
         prompt_tokens: Mapping[int, int],
         completion_tokens: Mapping[int, int],
         cached_tokens: Mapping[int, int],
-        *,
         n_choices: int,
         enable_cache_report: bool = False,
-    ):
+    ) -> UsageInfo:
-        # index % n_choices==0 marks the first choice of a prompt
+        # index % n_choices == 0 marks the first choice of a prompt
         total_prompt_tokens = sum(
             tok for idx, tok in prompt_tokens.items() if idx % n_choices == 0
         )
-
         total_completion_tokens = sum(completion_tokens.values())
 
         cached_details = (
-            _details_if_cached(sum(cached_tokens.values()))
+            UsageProcessor._details_if_cached(sum(cached_tokens.values()))
             if enable_cache_report
             else None
         )
 
-        return calculate_token_usage(
+        return UsageProcessor.calculate_token_usage(
             prompt_tokens=total_prompt_tokens,
             completion_tokens=total_completion_tokens,
             cached_tokens=cached_details,
         )
+
+    @staticmethod
+    def calculate_token_usage(
+        prompt_tokens: int,
+        completion_tokens: int,
+        cached_tokens: Optional[Dict[str, int]] = None,
+    ) -> UsageInfo:
+        """Calculate token usage information"""
+        return UsageInfo(
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
+            total_tokens=prompt_tokens + completion_tokens,
+            prompt_tokens_details=cached_tokens,
+        )
diff --git a/python/sglang/srt/entrypoints/openai/utils.py b/python/sglang/srt/entrypoints/openai/utils.py
index 53c67831cdb..06e5e4dee10 100644
--- a/python/sglang/srt/entrypoints/openai/utils.py
+++ b/python/sglang/srt/entrypoints/openai/utils.py
@@ -1,10 +1,9 @@
 import logging
-from typing import Any, Dict, List, Optional
 
 import jinja2.nodes
 import transformers.utils.chat_template_utils as hf_chat_utils
 
-from sglang.srt.entrypoints.openai.protocol import LogProbs, UsageInfo
+from sglang.srt.entrypoints.openai.protocol import LogProbs
 
 logger = logging.getLogger(__name__)
 
@@ -171,62 +170,6 @@ def process_content_for_template_format(
     return new_msg
 
 
-def calculate_token_usage(
-    prompt_tokens: int,
-    completion_tokens: int,
-    cached_tokens: Optional[Dict[str, int]] = None,
-) -> UsageInfo:
-    """Calculate token usage information"""
-    return UsageInfo(
-        prompt_tokens=prompt_tokens,
-        completion_tokens=completion_tokens,
-        total_tokens=prompt_tokens + completion_tokens,
-        prompt_tokens_details=cached_tokens,
-    )
-
-
-def aggregate_token_usage(
-    responses: List[Dict[str, Any]],
-    n_choices: int = 1,
-    enable_cache_report: bool = False,
-) -> UsageInfo:
-    """Aggregate token usage from multiple responses
-
-    Args:
-        responses: List of response dictionaries with meta_info
-        n_choices: Number of choices per request (for prompt token counting)
-        enable_cache_report: Whether to include cached token details
-
-    Returns:
-        Aggregated UsageInfo
-    """
-    # Sum completion tokens from all responses
-    completion_tokens = sum(
-        response["meta_info"]["completion_tokens"] for response in responses
-    )
-
-    # For prompt tokens, only count every n_choices-th response to avoid double counting
-    prompt_tokens = sum(
-        responses[i]["meta_info"]["prompt_tokens"]
-        for i in range(0, len(responses), n_choices)
-    )
-
-    # Handle cached tokens if cache reporting is enabled
-    cached_tokens_details = None
-    if enable_cache_report:
-        cached_tokens_sum = sum(
-            response["meta_info"].get("cached_tokens", 0) for response in responses
-        )
-        if cached_tokens_sum > 0:
-            cached_tokens_details = {"cached_tokens": cached_tokens_sum}
-
-    return calculate_token_usage(
-        prompt_tokens=prompt_tokens,
-        completion_tokens=completion_tokens,
-        cached_tokens=cached_tokens_details,
-    )
-
-
 def to_openai_style_logprobs(
     input_token_logprobs=None,
     output_token_logprobs=None,

From 4c5be59ec07211c2f358f693259713acc0015b44 Mon Sep 17 00:00:00 2001
From: Chang Su
Date: Fri, 20 Jun 2025 21:38:13 +0000
Subject: [PATCH 5/5] Resolving rebase error - streaming usage should use
 UsageProcessor as well

---
 python/sglang/srt/entrypoints/openai/serving_chat.py        | 5 +++--
 python/sglang/srt/entrypoints/openai/serving_completions.py | 5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/python/sglang/srt/entrypoints/openai/serving_chat.py b/python/sglang/srt/entrypoints/openai/serving_chat.py
index 49757a8a1d9..98e622819e3 100644
--- a/python/sglang/srt/entrypoints/openai/serving_chat.py
+++ b/python/sglang/srt/entrypoints/openai/serving_chat.py
@@ -546,11 +546,12 @@ async def _generate_chat_stream(
 
         # Additional usage chunk
         if request.stream_options and request.stream_options.include_usage:
-            usage = self._calculate_streaming_usage_base(
+            usage = UsageProcessor.calculate_streaming_usage(
                 prompt_tokens,
                 completion_tokens,
                 cached_tokens,
-                request.n,
+                n_choices=request.n,
+                enable_cache_report=self.tokenizer_manager.server_args.enable_cache_report,
             )
             usage_chunk = ChatCompletionStreamResponse(
                 id=content["meta_info"]["id"],
diff --git a/python/sglang/srt/entrypoints/openai/serving_completions.py b/python/sglang/srt/entrypoints/openai/serving_completions.py
index 9002f8dd2b5..eea6dbccc1b 100644
--- a/python/sglang/srt/entrypoints/openai/serving_completions.py
+++ b/python/sglang/srt/entrypoints/openai/serving_completions.py
@@ -212,11 +212,12 @@ async def _generate_completion_stream(
 
         # Handle final usage chunk
         if request.stream_options and request.stream_options.include_usage:
-            usage = self._calculate_streaming_usage_base(
+            usage = UsageProcessor.calculate_streaming_usage(
                 prompt_tokens,
                 completion_tokens,
                 cached_tokens,
-                request.n,
+                n_choices=request.n,
+                enable_cache_report=self.tokenizer_manager.server_args.enable_cache_report,
             )
             final_usage_chunk = CompletionStreamResponse(
                 id=content["meta_info"]["id"],
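
Reviewer note (not part of the series): below is a minimal usage sketch of the final UsageProcessor API, assuming the five patches above are applied to an sglang checkout. The meta_info payloads are fabricated purely for illustration.

from sglang.srt.entrypoints.openai.usage_processor import UsageProcessor

# Illustrative input: two choices (n=2) generated for a single 12-token prompt.
responses = [
    {"meta_info": {"prompt_tokens": 12, "completion_tokens": 7, "cached_tokens": 4}},
    {"meta_info": {"prompt_tokens": 12, "completion_tokens": 9, "cached_tokens": 4}},
]

usage = UsageProcessor.calculate_response_usage(
    responses, n_choices=2, enable_cache_report=True
)
# The prompt is counted once per request (12), completions are summed across
# choices (7 + 9 = 16), and prompt_tokens_details carries {"cached_tokens": 8}
# because cache reporting is enabled.
assert usage.prompt_tokens == 12
assert usage.completion_tokens == 16
assert usage.total_tokens == 28

The streaming variant, calculate_streaming_usage, takes per-index token dicts instead of a response list and applies the same de-duplication: only indices with idx % n_choices == 0 contribute prompt tokens.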