From bae69f48af924341dcd0e3890832fae4b341613f Mon Sep 17 00:00:00 2001 From: shuxiguo Date: Tue, 9 Dec 2025 21:18:20 +0800 Subject: [PATCH 1/3] support batch compare --- .../multimodal_gen/benchmarks/compare_perf.py | 205 ++++++++++++------ .../multimodal_gen/docs/contributing.md | 2 +- 2 files changed, 136 insertions(+), 71 deletions(-) diff --git a/python/sglang/multimodal_gen/benchmarks/compare_perf.py b/python/sglang/multimodal_gen/benchmarks/compare_perf.py index 2dfb087c79d2..84681e653d11 100644 --- a/python/sglang/multimodal_gen/benchmarks/compare_perf.py +++ b/python/sglang/multimodal_gen/benchmarks/compare_perf.py @@ -1,5 +1,6 @@ import argparse import json +import os import re from datetime import datetime from typing import Any, Dict, List, Tuple @@ -109,108 +110,172 @@ def _load_benchmark_file(file_path: str) -> Dict[str, Any]: def compare_benchmarks( - baseline_path: str, new_path: str, output_format: str = "markdown" + file_paths: List[str], output_format: str = "markdown" ): """ - Compares two benchmark JSON files and prints a report. + Compares benchmark JSON files and prints a report. + First file is baseline, others are compared against it. """ + if len(file_paths) < 2: + print("Error: Need at least 2 files to compare.") + return + try: - base_data = _load_benchmark_file(baseline_path) - new_data = _load_benchmark_file(new_path) + data_list = [_load_benchmark_file(f) for f in file_paths] except Exception as e: print(f"Error loading benchmark files: {e}") return - base_e2e = base_data.get("total_duration_ms", 0) - new_e2e = new_data.get("total_duration_ms", 0) + base_data = data_list[0] + others_data = data_list[1:] - diff_ms, diff_pct = calculate_diff(base_e2e, new_e2e) - - if diff_pct < -2.0: - status = "✅" - elif diff_pct > 2.0: - status = "❌" - else: - status = "" + # Use filenames as labels if multiple comparisons, else just "New" + other_labels = [os.path.basename(p) for p in file_paths[1:]] - # --- Stage Breakdown --- + # --- E2E Summary --- + base_e2e = base_data.get("total_duration_ms", 0) + + # --- Stage Breakdown Pre-calculation --- base_durations, base_order, base_counts = consolidate_steps( base_data.get("steps", []) ) - new_durations, new_order, new_counts = consolidate_steps(new_data.get("steps", [])) - - # Merge orders: Start with New order (execution order), append any missing from Base - combined_order = list(new_order) + + others_processed = [] + for d in others_data: + dur, order, counts = consolidate_steps(d.get("steps", [])) + others_processed.append((dur, order, counts)) + + # Merge orders: Start with last New order (likely most updated), then others, then Base + combined_order = [] + # Collect all unique stages maintaining order from newest to baseline + for _, order, _ in reversed(others_processed): + for name in order: + if name not in combined_order: + combined_order.append(name) for name in base_order: if name not in combined_order: combined_order.append(name) - stage_rows = [] - for stage in combined_order: - b_val = base_durations.get(stage, 0.0) - n_val = new_durations.get(stage, 0.0) - b_count = base_counts.get(stage, 1) - n_count = new_counts.get(stage, 1) - - s_diff, s_pct = calculate_diff(b_val, n_val) - - # Format count string if aggregated - count_str = "" - if stage == "Denoising Loop": - count_str = ( - f" ({n_count} steps)" - if n_count == b_count - else f" ({b_count}->{n_count} steps)" - ) - - # filter noise: show if diff is > 0.5ms OR if it's a major stage (like Denoising Loop) - # always show Denoising Loop or stages with significant duration/diff - stage_rows.append((stage + count_str, b_val, n_val, s_diff, s_pct)) - if output_format == "markdown": print("### Performance Comparison Report\n") - - # Summary Table - print("#### 1. High-level Summary") - print("| Metric | Baseline | New | Diff | Status |") - print("| :--- | :--- | :--- | :--- | :--- |") - print( - f"| **E2E Latency** | {base_e2e:.2f} ms | {new_e2e:.2f} ms | **{diff_ms:+.2f} ms ({diff_pct:+.1f}%)** | {status} |" - ) - print( - f"| **Throughput** | {1000 / base_e2e if base_e2e else 0:.2f} req/s | {1000 / new_e2e if new_e2e else 0:.2f} req/s | - | - |" - ) - print("\n") - - # Detailed Breakdown - print("#### 2. Stage Breakdown") - print( - "| Stage Name | Baseline (ms) | New (ms) | Diff (ms) | Diff (%) | Status |" - ) - print("| :--- | :--- | :--- | :--- | :--- | :--- |") - for name, b, n, d, p in stage_rows: - name_str = name - status_emoji = get_perf_status_emoji(b, n) + + # --- MODE 1: Single Comparison (Detailed) --- + if len(others_data) == 1: + new_data = others_data[0] + new_e2e = new_data.get("total_duration_ms", 0) + diff_ms, diff_pct = calculate_diff(base_e2e, new_e2e) + + if diff_pct < -2.0: + status = "✅" + elif diff_pct > 2.0: + status = "❌" + else: + status = "" + + print("#### 1. High-level Summary") + print("| Metric | Baseline | New | Diff | Status |") + print("| :--- | :--- | :--- | :--- | :--- |") print( - f"| {name_str} | {b:.2f} | {n:.2f} | {d:+.2f} | {p:+.1f}% | {status_emoji} |" + f"| **E2E Latency** | {base_e2e:.2f} ms | {new_e2e:.2f} ms | **{diff_ms:+.2f} ms ({diff_pct:+.1f}%)** | {status} |" ) - print("\n") + print( + f"| **Throughput** | {1000 / base_e2e if base_e2e else 0:.2f} req/s | {1000 / new_e2e if new_e2e else 0:.2f} req/s | - | - |" + ) + print("\n") + + print("#### 2. Stage Breakdown") + print("| Stage Name | Baseline (ms) | New (ms) | Diff (ms) | Diff (%) | Status |") + print("| :--- | :--- | :--- | :--- | :--- | :--- |") + + new_durations, _, new_counts = others_processed[0] + + for stage in combined_order: + b_val = base_durations.get(stage, 0.0) + n_val = new_durations.get(stage, 0.0) + b_count = base_counts.get(stage, 1) + n_count = new_counts.get(stage, 1) + + s_diff, s_pct = calculate_diff(b_val, n_val) + + count_str = "" + if stage == "Denoising Loop": + count_str = ( + f" ({n_count} steps)" + if n_count == b_count + else f" ({b_count}->{n_count} steps)" + ) + + status_emoji = get_perf_status_emoji(b_val, n_val) + print( + f"| {stage}{count_str} | {b_val:.2f} | {n_val:.2f} | {s_diff:+.2f} | {s_pct:+.1f}% | {status_emoji} |" + ) + + # --- MODE 2: Multi Comparison (Compact) --- + else: + print("#### 1. High-level Summary") + # Header + header = "| Metric | Baseline | " + " | ".join(other_labels) + " |" + sep = "| :--- | :--- | " + " | ".join([":---"] * len(other_labels)) + " |" + print(header) + print(sep) + + # E2E Row + row_e2e = f"| **E2E Latency** | {base_e2e:.2f} ms |" + for i, d in enumerate(others_data): + val = d.get("total_duration_ms", 0) + diff_ms, diff_pct = calculate_diff(base_e2e, val) + + if diff_pct < -2.0: + status = "✅" + elif diff_pct > 2.0: + status = "❌" + else: + status = "" # or ⚪️ + + row_e2e += f" {val:.2f} ms ({diff_pct:+.1f}%) {status} |" + print(row_e2e) + print("\n") + + print("#### 2. Stage Breakdown") + # Header: Stage | Baseline | Label1 | Label2 ... + header = "| Stage Name | Baseline | " + " | ".join(other_labels) + " |" + sep = "| :--- | :--- | " + " | ".join([":---"] * len(other_labels)) + " |" + print(header) + print(sep) + + for stage in combined_order: + b_val = base_durations.get(stage, 0.0) + row_str = f"| {stage} | {b_val:.2f} |" + + for i, (n_durations, _, n_counts) in enumerate(others_processed): + n_val = n_durations.get(stage, 0.0) + _, s_pct = calculate_diff(b_val, n_val) + status_emoji = get_perf_status_emoji(b_val, n_val) + + row_str += f" {n_val:.2f} ({s_pct:+.1f}%) {status_emoji} |" + print(row_str) + print("\n") # Metadata print("
") print("Metadata\n") print(f"- Baseline Commit: `{base_data.get('commit_hash', 'N/A')}`") - print(f"- New Commit: `{new_data.get('commit_hash', 'N/A')}`") + for i, d in enumerate(others_data): + label = "New" if len(others_data) == 1 else other_labels[i] + print(f"- {label} Commit: `{d.get('commit_hash', 'N/A')}`") print(f"- Timestamp: {datetime.now().isoformat()}") print("
") if __name__ == "__main__": parser = argparse.ArgumentParser( - description="Compare two sglang-diffusion performance JSON files." + description="Compare sglang-diffusion performance JSON files." + ) + parser.add_argument( + "files", + nargs="+", + help="List of JSON files. First is baseline, others are compared against it." ) - parser.add_argument("baseline", help="Path to the baseline JSON file") - parser.add_argument("new", help="Path to the new JSON file") args = parser.parse_args() - compare_benchmarks(args.baseline, args.new) + compare_benchmarks(args.files) diff --git a/python/sglang/multimodal_gen/docs/contributing.md b/python/sglang/multimodal_gen/docs/contributing.md index fb8b4456b421..33a4699b8fa2 100644 --- a/python/sglang/multimodal_gen/docs/contributing.md +++ b/python/sglang/multimodal_gen/docs/contributing.md @@ -39,7 +39,7 @@ For PRs that impact **latency**, **throughput**, or **memory usage**, you **shou 3. **Compare**: run the compare script, which will print a Markdown table to the console ```bash - $ python python/sglang/multimodal_gen/benchmarks/compare_perf.py baseline.json new.json + $ python python/sglang/multimodal_gen/benchmarks/compare_perf.py baseline.json new.json [new2.json ...] ### Performance Comparison Report ... ``` From 5672c510b37b39d49a13c44f5b6ecb085699cb02 Mon Sep 17 00:00:00 2001 From: Mick Date: Tue, 9 Dec 2025 22:44:35 +0800 Subject: [PATCH 2/3] upd --- .../multimodal_gen/benchmarks/compare_perf.py | 213 +++++++++--------- 1 file changed, 109 insertions(+), 104 deletions(-) diff --git a/python/sglang/multimodal_gen/benchmarks/compare_perf.py b/python/sglang/multimodal_gen/benchmarks/compare_perf.py index 84681e653d11..e11e0dd80749 100644 --- a/python/sglang/multimodal_gen/benchmarks/compare_perf.py +++ b/python/sglang/multimodal_gen/benchmarks/compare_perf.py @@ -109,12 +109,109 @@ def _load_benchmark_file(file_path: str) -> Dict[str, Any]: return json.load(f) +def _get_status_emoji_from_diff_percent(diff_pct): + if diff_pct < -2.0: + return "✅" + elif diff_pct > 2.0: + return "❌" + else: + return "⚪️" + + +def _print_single_comparison_report(others_data, base_e2e, combined_order, base_durations, others_processed, + base_counts): + # mode 1: single comparison + new_data = others_data[0] + new_e2e = new_data.get("total_duration_ms", 0) + diff_ms, diff_pct = calculate_diff(base_e2e, new_e2e) + + status = _get_status_emoji_from_diff_percent(diff_pct) + print("#### 1. High-level Summary") + print("| Metric | Baseline | New | Diff | Status |") + print("| :--- | :--- | :--- | :--- | :--- |") + print( + f"| **E2E Latency** | {base_e2e:.2f} ms | {new_e2e:.2f} ms | **{diff_ms:+.2f} ms ({diff_pct:+.1f}%)** | {status} |" + ) + print( + f"| **Throughput** | {1000 / base_e2e if base_e2e else 0:.2f} req/s | {1000 / new_e2e if new_e2e else 0:.2f} req/s | - | - |" + ) + print("\n") + + print("#### 2. Stage Breakdown") + print("| Stage Name | Baseline (ms) | New (ms) | Diff (ms) | Diff (%) | Status |") + print("| :--- | :--- | :--- | :--- | :--- | :--- |") + + new_durations, _, new_counts = others_processed[0] + + for stage in combined_order: + b_val = base_durations.get(stage, 0.0) + n_val = new_durations.get(stage, 0.0) + b_count = base_counts.get(stage, 1) + n_count = new_counts.get(stage, 1) + + s_diff, s_pct = calculate_diff(b_val, n_val) + + count_str = "" + if stage == "Denoising Loop": + count_str = ( + f" ({n_count} steps)" + if n_count == b_count + else f" ({b_count}->{n_count} steps)" + ) + + status_emoji = get_perf_status_emoji(b_val, n_val) + print( + f"| {stage}{count_str} | {b_val:.2f} | {n_val:.2f} | {s_diff:+.2f} | {s_pct:+.1f}% | {status_emoji} |" + ) + + +def _print_multi_comparison_report(base_e2e, others_data, other_labels, combined_order, base_durations, + others_processed): + print("#### 1. High-level Summary") + # Header + header = "| Metric | Baseline | " + " | ".join(other_labels) + " |" + sep = "| :--- | :--- | " + " | ".join([":---"] * len(other_labels)) + " |" + print(header) + print(sep) + + # E2E Row + row_e2e = f"| **E2E Latency** | {base_e2e:.2f} ms |" + for i, d in enumerate(others_data): + val = d.get("total_duration_ms", 0) + diff_ms, diff_pct = calculate_diff(base_e2e, val) + + status = _get_status_emoji_from_diff_percent(diff_pct) + + row_e2e += f" {val:.2f} ms ({diff_pct:+.1f}%) {status} |" + print(row_e2e) + print("\n") + + print("#### 2. Stage Breakdown") + # Header: Stage | Baseline | Label1 | Label2 ... + header = "| Stage Name | Baseline | " + " | ".join(other_labels) + " |" + sep = "| :--- | :--- | " + " | ".join([":---"] * len(other_labels)) + " |" + print(header) + print(sep) + + for stage in combined_order: + b_val = base_durations.get(stage, 0.0) + row_str = f"| {stage} | {b_val:.2f} |" + + for i, (n_durations, _, n_counts) in enumerate(others_processed): + n_val = n_durations.get(stage, 0.0) + _, s_pct = calculate_diff(b_val, n_val) + status_emoji = get_perf_status_emoji(b_val, n_val) + + row_str += f" {n_val:.2f} ({s_pct:+.1f}%) {status_emoji} |" + print(row_str) + + def compare_benchmarks( file_paths: List[str], output_format: str = "markdown" ): """ Compares benchmark JSON files and prints a report. - First file is baseline, others are compared against it. + First file is baseline, others will be compared against it. """ if len(file_paths) < 2: print("Error: Need at least 2 files to compare.") @@ -132,20 +229,17 @@ def compare_benchmarks( # Use filenames as labels if multiple comparisons, else just "New" other_labels = [os.path.basename(p) for p in file_paths[1:]] - # --- E2E Summary --- base_e2e = base_data.get("total_duration_ms", 0) - - # --- Stage Breakdown Pre-calculation --- + base_durations, base_order, base_counts = consolidate_steps( base_data.get("steps", []) ) - + others_processed = [] for d in others_data: dur, order, counts = consolidate_steps(d.get("steps", [])) others_processed.append((dur, order, counts)) - # Merge orders: Start with last New order (likely most updated), then others, then Base combined_order = [] # Collect all unique stages maintaining order from newest to baseline for _, order, _ in reversed(others_processed): @@ -158,102 +252,13 @@ def compare_benchmarks( if output_format == "markdown": print("### Performance Comparison Report\n") - - # --- MODE 1: Single Comparison (Detailed) --- + if len(others_data) == 1: - new_data = others_data[0] - new_e2e = new_data.get("total_duration_ms", 0) - diff_ms, diff_pct = calculate_diff(base_e2e, new_e2e) - - if diff_pct < -2.0: - status = "✅" - elif diff_pct > 2.0: - status = "❌" - else: - status = "" - - print("#### 1. High-level Summary") - print("| Metric | Baseline | New | Diff | Status |") - print("| :--- | :--- | :--- | :--- | :--- |") - print( - f"| **E2E Latency** | {base_e2e:.2f} ms | {new_e2e:.2f} ms | **{diff_ms:+.2f} ms ({diff_pct:+.1f}%)** | {status} |" - ) - print( - f"| **Throughput** | {1000 / base_e2e if base_e2e else 0:.2f} req/s | {1000 / new_e2e if new_e2e else 0:.2f} req/s | - | - |" - ) - print("\n") - - print("#### 2. Stage Breakdown") - print("| Stage Name | Baseline (ms) | New (ms) | Diff (ms) | Diff (%) | Status |") - print("| :--- | :--- | :--- | :--- | :--- | :--- |") - - new_durations, _, new_counts = others_processed[0] - - for stage in combined_order: - b_val = base_durations.get(stage, 0.0) - n_val = new_durations.get(stage, 0.0) - b_count = base_counts.get(stage, 1) - n_count = new_counts.get(stage, 1) - - s_diff, s_pct = calculate_diff(b_val, n_val) - - count_str = "" - if stage == "Denoising Loop": - count_str = ( - f" ({n_count} steps)" - if n_count == b_count - else f" ({b_count}->{n_count} steps)" - ) - - status_emoji = get_perf_status_emoji(b_val, n_val) - print( - f"| {stage}{count_str} | {b_val:.2f} | {n_val:.2f} | {s_diff:+.2f} | {s_pct:+.1f}% | {status_emoji} |" - ) - - # --- MODE 2: Multi Comparison (Compact) --- + _print_single_comparison_report(others_data, base_e2e, combined_order, base_durations, others_processed, + base_counts) else: - print("#### 1. High-level Summary") - # Header - header = "| Metric | Baseline | " + " | ".join(other_labels) + " |" - sep = "| :--- | :--- | " + " | ".join([":---"] * len(other_labels)) + " |" - print(header) - print(sep) - - # E2E Row - row_e2e = f"| **E2E Latency** | {base_e2e:.2f} ms |" - for i, d in enumerate(others_data): - val = d.get("total_duration_ms", 0) - diff_ms, diff_pct = calculate_diff(base_e2e, val) - - if diff_pct < -2.0: - status = "✅" - elif diff_pct > 2.0: - status = "❌" - else: - status = "" # or ⚪️ - - row_e2e += f" {val:.2f} ms ({diff_pct:+.1f}%) {status} |" - print(row_e2e) - print("\n") - - print("#### 2. Stage Breakdown") - # Header: Stage | Baseline | Label1 | Label2 ... - header = "| Stage Name | Baseline | " + " | ".join(other_labels) + " |" - sep = "| :--- | :--- | " + " | ".join([":---"] * len(other_labels)) + " |" - print(header) - print(sep) - - for stage in combined_order: - b_val = base_durations.get(stage, 0.0) - row_str = f"| {stage} | {b_val:.2f} |" - - for i, (n_durations, _, n_counts) in enumerate(others_processed): - n_val = n_durations.get(stage, 0.0) - _, s_pct = calculate_diff(b_val, n_val) - status_emoji = get_perf_status_emoji(b_val, n_val) - - row_str += f" {n_val:.2f} ({s_pct:+.1f}%) {status_emoji} |" - print(row_str) + _print_multi_comparison_report(base_e2e, others_data, other_labels, combined_order, base_durations, + others_processed) print("\n") # Metadata @@ -261,8 +266,8 @@ def compare_benchmarks( print("Metadata\n") print(f"- Baseline Commit: `{base_data.get('commit_hash', 'N/A')}`") for i, d in enumerate(others_data): - label = "New" if len(others_data) == 1 else other_labels[i] - print(f"- {label} Commit: `{d.get('commit_hash', 'N/A')}`") + label = "New" if len(others_data) == 1 else other_labels[i] + print(f"- {label} Commit: `{d.get('commit_hash', 'N/A')}`") print(f"- Timestamp: {datetime.now().isoformat()}") print("") @@ -272,8 +277,8 @@ def compare_benchmarks( description="Compare sglang-diffusion performance JSON files." ) parser.add_argument( - "files", - nargs="+", + "files", + nargs="+", help="List of JSON files. First is baseline, others are compared against it." ) args = parser.parse_args() From 4a64ac7c1c60748be7104ab97ffc491fbfc83576 Mon Sep 17 00:00:00 2001 From: Mick Date: Tue, 9 Dec 2025 22:47:12 +0800 Subject: [PATCH 3/3] upd --- .../multimodal_gen/benchmarks/compare_perf.py | 45 ++++++++++++------- 1 file changed, 30 insertions(+), 15 deletions(-) diff --git a/python/sglang/multimodal_gen/benchmarks/compare_perf.py b/python/sglang/multimodal_gen/benchmarks/compare_perf.py index e11e0dd80749..d600d06d2b10 100644 --- a/python/sglang/multimodal_gen/benchmarks/compare_perf.py +++ b/python/sglang/multimodal_gen/benchmarks/compare_perf.py @@ -118,14 +118,14 @@ def _get_status_emoji_from_diff_percent(diff_pct): return "⚪️" -def _print_single_comparison_report(others_data, base_e2e, combined_order, base_durations, others_processed, - base_counts): - # mode 1: single comparison +def _print_single_comparison_report( + others_data, base_e2e, combined_order, base_durations, others_processed, base_counts +): new_data = others_data[0] new_e2e = new_data.get("total_duration_ms", 0) diff_ms, diff_pct = calculate_diff(base_e2e, new_e2e) - status = _get_status_emoji_from_diff_percent(diff_pct) + print("#### 1. High-level Summary") print("| Metric | Baseline | New | Diff | Status |") print("| :--- | :--- | :--- | :--- | :--- |") @@ -165,10 +165,15 @@ def _print_single_comparison_report(others_data, base_e2e, combined_order, base_ ) -def _print_multi_comparison_report(base_e2e, others_data, other_labels, combined_order, base_durations, - others_processed): +def _print_multi_comparison_report( + base_e2e, + others_data, + other_labels, + combined_order, + base_durations, + others_processed, +): print("#### 1. High-level Summary") - # Header header = "| Metric | Baseline | " + " | ".join(other_labels) + " |" sep = "| :--- | :--- | " + " | ".join([":---"] * len(other_labels)) + " |" print(header) @@ -206,9 +211,7 @@ def _print_multi_comparison_report(base_e2e, others_data, other_labels, combined print(row_str) -def compare_benchmarks( - file_paths: List[str], output_format: str = "markdown" -): +def compare_benchmarks(file_paths: List[str], output_format: str = "markdown"): """ Compares benchmark JSON files and prints a report. First file is baseline, others will be compared against it. @@ -254,11 +257,23 @@ def compare_benchmarks( print("### Performance Comparison Report\n") if len(others_data) == 1: - _print_single_comparison_report(others_data, base_e2e, combined_order, base_durations, others_processed, - base_counts) + _print_single_comparison_report( + others_data, + base_e2e, + combined_order, + base_durations, + others_processed, + base_counts, + ) else: - _print_multi_comparison_report(base_e2e, others_data, other_labels, combined_order, base_durations, - others_processed) + _print_multi_comparison_report( + base_e2e, + others_data, + other_labels, + combined_order, + base_durations, + others_processed, + ) print("\n") # Metadata @@ -279,7 +294,7 @@ def compare_benchmarks( parser.add_argument( "files", nargs="+", - help="List of JSON files. First is baseline, others are compared against it." + help="List of JSON files. First is baseline, others are compared against it.", ) args = parser.parse_args()