19 changes: 19 additions & 0 deletions python/sglang/test/run_eval.py
@@ -179,9 +179,16 @@ def run_eval(args):
     if getattr(args, "repeat", 1) == 1:
         result, latency, sampler = run_eval_once(args, base_url, eval_obj)
         metrics = result.metrics | {"score": result.score}
+        metrics["latency"] = latency
         print(f"Total latency: {latency:.3f} s")
         print(f"Score: {metrics['score']:.3f}")
+
+        # Compute output throughput from accumulated completion tokens
+        total_completion_tokens = sum(sampler._completion_tokens)
+        if total_completion_tokens > 0 and latency > 0:
+            metrics["output_throughput"] = total_completion_tokens / latency
+            print(f"Output throughput: {metrics['output_throughput']:.3f} token/s")

         # Report metrics to unified collection framework
         dump_metric(
             f"{args.eval_name}_score",
@@ -204,19 +211,31 @@ def run_eval(args):
         ]

         scores_repeat = []
+        latencies = []
+        total_completion_tokens = 0

         for f in futures:
             result, latency, sampler = f.result()
             scores_repeat.append(result.score)
+            latencies.append(latency)
+            total_completion_tokens += sum(sampler._completion_tokens)

         mean_score = sum(scores_repeat) / len(scores_repeat)
+        mean_latency = sum(latencies) / len(latencies)
+        total_latency = sum(latencies)
         scores_repeat = [f"{s:.3f}" for s in scores_repeat]
         print("=" * 20)
         print(f"Repeat: {args.repeat}, mean: {mean_score:.3f}")
         print(f"Scores: {scores_repeat}")
+        print(f"Mean latency: {mean_latency:.3f} s")
         print("=" * 20)
         metrics = result.metrics | {"scores": scores_repeat}
         metrics = metrics | {"mean_score": mean_score}
+        metrics["latency"] = mean_latency
+
+        if total_completion_tokens > 0 and total_latency > 0:
+            metrics["output_throughput"] = total_completion_tokens / total_latency
+            print(f"Output throughput: {metrics['output_throughput']:.3f} token/s")

         # Report metrics to unified collection framework
         dump_metric(
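For context, a minimal standalone sketch of the aggregation the repeated-run branch above performs: per-run latencies are averaged for the reported latency metric but summed to form the throughput denominator, while each run's per-request completion-token counts are summed to form the numerator. The run values here are hypothetical stand-ins, not actual run_eval_once() outputs.

# Hypothetical sketch of the repeated-run aggregation; only the arithmetic
# mirrors the change above, the numbers are invented.
runs = [
    # (latency in seconds, per-request completion token counts for that run)
    (10.0, [120, 80, 200]),
    (12.0, [150, 90, 210]),
]

latencies = []
total_completion_tokens = 0
for latency, completion_tokens in runs:
    latencies.append(latency)
    total_completion_tokens += sum(completion_tokens)

mean_latency = sum(latencies) / len(latencies)  # reported as the latency metric
total_latency = sum(latencies)  # denominator for output throughput

metrics = {"latency": mean_latency}
if total_completion_tokens > 0 and total_latency > 0:
    # tokens generated per second of summed run time
    metrics["output_throughput"] = total_completion_tokens / total_latency

print(metrics)  # {'latency': 11.0, 'output_throughput': 38.63...}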
3 changes: 3 additions & 0 deletions python/sglang/test/simple_eval_common.py
@@ -109,6 +109,7 @@ def __init__(
         self.reasoning_effort = reasoning_effort
         self.extra_body = extra_body
         self.image_format = "url"
+        self._completion_tokens: list[int] = []
         print(
             f"ChatCompletionSampler initialized with {self.system_message=} {self.temperature=} {self.max_tokens=} {self.reasoning_effort=} {self.extra_body=}"
         )
@@ -151,6 +152,8 @@ def __call__(self, message_list: MessageList) -> str:
                     reasoning_effort=self.reasoning_effort,
                     extra_body=self.extra_body,
                 )
+                if response.usage and response.usage.completion_tokens is not None:
+                    self._completion_tokens.append(response.usage.completion_tokens)
                 return response.choices[0].message.content or ""
             # NOTE: BadRequestError is triggered once for MMMU, please uncomment if you are rerunning MMMU
             except openai.BadRequestError as e:
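The sampler-side half of the change is sketched below: after every chat completion, the response's usage.completion_tokens (guarded, since usage can be absent on some responses) is appended to a per-sampler list that run_eval later sums into the throughput numerator. This is an illustrative, self-contained approximation; the class name, endpoint, and model are assumptions, not the real ChatCompletionSampler.

# Illustrative sketch only; class name, base_url, and model are assumptions.
import openai


class TokenTrackingSampler:
    """Minimal chat sampler that records completion tokens per request."""

    def __init__(self, base_url: str = "http://localhost:30000/v1", model: str = "my-model"):
        self.client = openai.OpenAI(base_url=base_url, api_key="EMPTY")
        self.model = model
        self._completion_tokens: list[int] = []

    def __call__(self, message_list) -> str:
        response = self.client.chat.completions.create(
            model=self.model,
            messages=message_list,
        )
        # usage may be missing (e.g. on some streamed responses), so guard before recording
        if response.usage and response.usage.completion_tokens is not None:
            self._completion_tokens.append(response.usage.completion_tokens)
        return response.choices[0].message.content or ""


# After an eval run, sum(sampler._completion_tokens) is the token count that
# run_eval.py divides by latency to report output_throughput.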