diff --git a/python/sglang/test/run_eval.py b/python/sglang/test/run_eval.py
index fba61e3eb569..a467d3fb409f 100644
--- a/python/sglang/test/run_eval.py
+++ b/python/sglang/test/run_eval.py
@@ -179,9 +179,16 @@ def run_eval(args):
     if getattr(args, "repeat", 1) == 1:
         result, latency, sampler = run_eval_once(args, base_url, eval_obj)
         metrics = result.metrics | {"score": result.score}
+        metrics["latency"] = latency
         print(f"Total latency: {latency:.3f} s")
         print(f"Score: {metrics['score']:.3f}")
 
+        # Compute output throughput from accumulated completion tokens
+        total_completion_tokens = sum(sampler._completion_tokens)
+        if total_completion_tokens > 0 and latency > 0:
+            metrics["output_throughput"] = total_completion_tokens / latency
+            print(f"Output throughput: {metrics['output_throughput']:.3f} token/s")
+
         # Report metrics to unified collection framework
         dump_metric(
             f"{args.eval_name}_score",
@@ -204,19 +204,31 @@ def run_eval(args):
         ]
 
         scores_repeat = []
+        latencies = []
+        total_completion_tokens = 0
         for f in futures:
             result, latency, sampler = f.result()
             scores_repeat.append(result.score)
+            latencies.append(latency)
+            total_completion_tokens += sum(sampler._completion_tokens)
 
         mean_score = sum(scores_repeat) / len(scores_repeat)
+        mean_latency = sum(latencies) / len(latencies)
+        total_latency = sum(latencies)
         scores_repeat = [f"{s:.3f}" for s in scores_repeat]
 
         print("=" * 20)
         print(f"Repeat: {args.repeat}, mean: {mean_score:.3f}")
         print(f"Scores: {scores_repeat}")
+        print(f"Mean latency: {mean_latency:.3f} s")
         print("=" * 20)
         metrics = result.metrics | {"scores": scores_repeat}
         metrics = metrics | {"mean_score": mean_score}
+        metrics["latency"] = mean_latency
+
+        if total_completion_tokens > 0 and total_latency > 0:
+            metrics["output_throughput"] = total_completion_tokens / total_latency
+            print(f"Output throughput: {metrics['output_throughput']:.3f} token/s")
 
         # Report metrics to unified collection framework
         dump_metric(
diff --git a/python/sglang/test/simple_eval_common.py b/python/sglang/test/simple_eval_common.py
index b7ac713e5c71..e3b96ef81363 100644
--- a/python/sglang/test/simple_eval_common.py
+++ b/python/sglang/test/simple_eval_common.py
@@ -109,6 +109,7 @@ def __init__(
         self.reasoning_effort = reasoning_effort
         self.extra_body = extra_body
         self.image_format = "url"
+        self._completion_tokens: list[int] = []
         print(
             f"ChatCompletionSampler initialized with {self.system_message=} {self.temperature=} {self.max_tokens=} {self.reasoning_effort=} {self.extra_body=}"
         )
@@ -151,6 +152,8 @@ def __call__(self, message_list: MessageList) -> str:
                 reasoning_effort=self.reasoning_effort,
                 extra_body=self.extra_body,
             )
+            if response.usage and response.usage.completion_tokens is not None:
+                self._completion_tokens.append(response.usage.completion_tokens)
             return response.choices[0].message.content or ""
        # NOTE: BadRequestError is triggered once for MMMU, please uncomment if you are rerunning MMMU
        except openai.BadRequestError as e: