diff --git a/python/sglang/srt/entrypoints/openai/serving_base.py b/python/sglang/srt/entrypoints/openai/serving_base.py index 669aed7b0462..097e02f66e28 100644 --- a/python/sglang/srt/entrypoints/openai/serving_base.py +++ b/python/sglang/srt/entrypoints/openai/serving_base.py @@ -88,6 +88,9 @@ async def handle_request( """Handle the specific request type with common pattern If you want to override this method, you should be careful to record the validation time. """ + received_time = time.time() + received_time_perf = time.perf_counter() + try: # Validate request validation_start = time.perf_counter() @@ -103,6 +106,12 @@ async def handle_request( if hasattr(adapted_request, "validation_time"): adapted_request.validation_time = validation_time + if hasattr(adapted_request, "received_time"): + adapted_request.received_time = received_time + + if hasattr(adapted_request, "received_time_perf"): + adapted_request.received_time_perf = received_time_perf + # Note(Xinyuan): raw_request below is only used for detecting the connection of the client if hasattr(request, "stream") and request.stream: return await self._handle_streaming_request( diff --git a/python/sglang/srt/managers/io_struct.py b/python/sglang/srt/managers/io_struct.py index b4f9d3335924..9253aed09b6d 100644 --- a/python/sglang/srt/managers/io_struct.py +++ b/python/sglang/srt/managers/io_struct.py @@ -204,6 +204,12 @@ class GenerateReqInput(BaseReq): # Validation step duration validation_time: Optional[float] = None + # For metrics + received_time: Optional[float] = None + + # Perf_counter equivalents for accurate time calculations + received_time_perf: Optional[float] = None + # For data parallel rank routing data_parallel_rank: Optional[int] = None diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index 787de125728c..8278d65466f1 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -431,7 +431,7 @@ async def generate_request( obj: Union[GenerateReqInput, EmbeddingReqInput], request: Optional[fastapi.Request] = None, ): - created_time = time.time() + created_time = obj.received_time if obj.received_time else time.time() self.auto_create_handle_loop() obj.normalize_batch_and_arguments()