From bae69f48af924341dcd0e3890832fae4b341613f Mon Sep 17 00:00:00 2001
From: shuxiguo <shuxiguo@meituan.com>
Date: Tue, 9 Dec 2025 21:18:20 +0800
Subject: [PATCH 1/3] support batch compare

---
 .../multimodal_gen/benchmarks/compare_perf.py | 205 ++++++++++++------
 .../multimodal_gen/docs/contributing.md       |   2 +-
 2 files changed, 136 insertions(+), 71 deletions(-)

diff --git a/python/sglang/multimodal_gen/benchmarks/compare_perf.py b/python/sglang/multimodal_gen/benchmarks/compare_perf.py
index 2dfb087c79d2..84681e653d11 100644
--- a/python/sglang/multimodal_gen/benchmarks/compare_perf.py
+++ b/python/sglang/multimodal_gen/benchmarks/compare_perf.py
@@ -1,5 +1,6 @@
 import argparse
 import json
+import os
 import re
 from datetime import datetime
 from typing import Any, Dict, List, Tuple
@@ -109,108 +110,172 @@ def _load_benchmark_file(file_path: str) -> Dict[str, Any]:
 
 
 def compare_benchmarks(
-    baseline_path: str, new_path: str, output_format: str = "markdown"
+    file_paths: List[str], output_format: str = "markdown"
 ):
     """
-    Compares two benchmark JSON files and prints a report.
+    Compares benchmark JSON files and prints a report.
+    First file is baseline, others are compared against it.
     """
+    if len(file_paths) < 2:
+        print("Error: Need at least 2 files to compare.")
+        return
+
     try:
-        base_data = _load_benchmark_file(baseline_path)
-        new_data = _load_benchmark_file(new_path)
+        data_list = [_load_benchmark_file(f) for f in file_paths]
     except Exception as e:
         print(f"Error loading benchmark files: {e}")
         return
 
-    base_e2e = base_data.get("total_duration_ms", 0)
-    new_e2e = new_data.get("total_duration_ms", 0)
+    base_data = data_list[0]
+    others_data = data_list[1:]
 
-    diff_ms, diff_pct = calculate_diff(base_e2e, new_e2e)
-
-    if diff_pct < -2.0:
-        status = "✅"
-    elif diff_pct > 2.0:
-        status = "❌"
-    else:
-        status = ""
+    # Use filenames as labels if multiple comparisons, else just "New"
+    other_labels = [os.path.basename(p) for p in file_paths[1:]]
 
-    # --- Stage Breakdown ---
+    # --- E2E Summary ---
+    base_e2e = base_data.get("total_duration_ms", 0)
+    
+    # --- Stage Breakdown Pre-calculation ---
     base_durations, base_order, base_counts = consolidate_steps(
         base_data.get("steps", [])
     )
-    new_durations, new_order, new_counts = consolidate_steps(new_data.get("steps", []))
-
-    # Merge orders: Start with New order (execution order), append any missing from Base
-    combined_order = list(new_order)
+    
+    others_processed = []
+    for d in others_data:
+        dur, order, counts = consolidate_steps(d.get("steps", []))
+        others_processed.append((dur, order, counts))
+
+    # Merge orders: Start with last New order (likely most updated), then others, then Base
+    combined_order = []
+    # Collect all unique stages maintaining order from newest to baseline
+    for _, order, _ in reversed(others_processed):
+        for name in order:
+            if name not in combined_order:
+                combined_order.append(name)
     for name in base_order:
         if name not in combined_order:
             combined_order.append(name)
 
-    stage_rows = []
-    for stage in combined_order:
-        b_val = base_durations.get(stage, 0.0)
-        n_val = new_durations.get(stage, 0.0)
-        b_count = base_counts.get(stage, 1)
-        n_count = new_counts.get(stage, 1)
-
-        s_diff, s_pct = calculate_diff(b_val, n_val)
-
-        # Format count string if aggregated
-        count_str = ""
-        if stage == "Denoising Loop":
-            count_str = (
-                f" ({n_count} steps)"
-                if n_count == b_count
-                else f" ({b_count}->{n_count} steps)"
-            )
-
-        # filter noise: show if diff is > 0.5ms OR if it's a major stage (like Denoising Loop)
-        # always show Denoising Loop or stages with significant duration/diff
-        stage_rows.append((stage + count_str, b_val, n_val, s_diff, s_pct))
-
     if output_format == "markdown":
         print("### Performance Comparison Report\n")
-
-        # Summary Table
-        print("#### 1. High-level Summary")
-        print("| Metric | Baseline | New | Diff | Status |")
-        print("| :--- | :--- | :--- | :--- | :--- |")
-        print(
-            f"| **E2E Latency** | {base_e2e:.2f} ms | {new_e2e:.2f} ms | **{diff_ms:+.2f} ms ({diff_pct:+.1f}%)** | {status} |"
-        )
-        print(
-            f"| **Throughput** | {1000 / base_e2e if base_e2e else 0:.2f} req/s | {1000 / new_e2e if new_e2e else 0:.2f} req/s | - | - |"
-        )
-        print("\n")
-
-        # Detailed Breakdown
-        print("#### 2. Stage Breakdown")
-        print(
-            "| Stage Name | Baseline (ms) | New (ms) | Diff (ms) | Diff (%) | Status |"
-        )
-        print("| :--- | :--- | :--- | :--- | :--- | :--- |")
-        for name, b, n, d, p in stage_rows:
-            name_str = name
-            status_emoji = get_perf_status_emoji(b, n)
+        
+        # --- MODE 1: Single Comparison (Detailed) ---
+        if len(others_data) == 1:
+            new_data = others_data[0]
+            new_e2e = new_data.get("total_duration_ms", 0)
+            diff_ms, diff_pct = calculate_diff(base_e2e, new_e2e)
+            
+            if diff_pct < -2.0:
+                status = "✅"
+            elif diff_pct > 2.0:
+                status = "❌"
+            else:
+                status = ""
+
+            print("#### 1. High-level Summary")
+            print("| Metric | Baseline | New | Diff | Status |")
+            print("| :--- | :--- | :--- | :--- | :--- |")
             print(
-                f"| {name_str} | {b:.2f} | {n:.2f} | {d:+.2f} | {p:+.1f}% | {status_emoji} |"
+                f"| **E2E Latency** | {base_e2e:.2f} ms | {new_e2e:.2f} ms | **{diff_ms:+.2f} ms ({diff_pct:+.1f}%)** | {status} |"
             )
-        print("\n")
+            print(
+                f"| **Throughput** | {1000 / base_e2e if base_e2e else 0:.2f} req/s | {1000 / new_e2e if new_e2e else 0:.2f} req/s | - | - |"
+            )
+            print("\n")
+
+            print("#### 2. Stage Breakdown")
+            print("| Stage Name | Baseline (ms) | New (ms) | Diff (ms) | Diff (%) | Status |")
+            print("| :--- | :--- | :--- | :--- | :--- | :--- |")
+            
+            new_durations, _, new_counts = others_processed[0]
+            
+            for stage in combined_order:
+                b_val = base_durations.get(stage, 0.0)
+                n_val = new_durations.get(stage, 0.0)
+                b_count = base_counts.get(stage, 1)
+                n_count = new_counts.get(stage, 1)
+
+                s_diff, s_pct = calculate_diff(b_val, n_val)
+                
+                count_str = ""
+                if stage == "Denoising Loop":
+                    count_str = (
+                        f" ({n_count} steps)"
+                        if n_count == b_count
+                        else f" ({b_count}->{n_count} steps)"
+                    )
+                
+                status_emoji = get_perf_status_emoji(b_val, n_val)
+                print(
+                    f"| {stage}{count_str} | {b_val:.2f} | {n_val:.2f} | {s_diff:+.2f} | {s_pct:+.1f}% | {status_emoji} |"
+                )
+
+        # --- MODE 2: Multi Comparison (Compact) ---
+        else:
+            print("#### 1. High-level Summary")
+            # Header
+            header = "| Metric | Baseline | " + " | ".join(other_labels) + " |"
+            sep = "| :--- | :--- | " + " | ".join([":---"] * len(other_labels)) + " |"
+            print(header)
+            print(sep)
+            
+            # E2E Row
+            row_e2e = f"| **E2E Latency** | {base_e2e:.2f} ms |"
+            for i, d in enumerate(others_data):
+                val = d.get("total_duration_ms", 0)
+                diff_ms, diff_pct = calculate_diff(base_e2e, val)
+                
+                if diff_pct < -2.0:
+                    status = "✅"
+                elif diff_pct > 2.0:
+                    status = "❌"
+                else:
+                    status = "" # or ⚪️
+                
+                row_e2e += f" {val:.2f} ms ({diff_pct:+.1f}%) {status} |"
+            print(row_e2e)
+            print("\n")
+
+            print("#### 2. Stage Breakdown")
+            # Header: Stage | Baseline | Label1 | Label2 ...
+            header = "| Stage Name | Baseline | " + " | ".join(other_labels) + " |"
+            sep = "| :--- | :--- | " + " | ".join([":---"] * len(other_labels)) + " |"
+            print(header)
+            print(sep)
+
+            for stage in combined_order:
+                b_val = base_durations.get(stage, 0.0)
+                row_str = f"| {stage} | {b_val:.2f} |"
+                
+                for i, (n_durations, _, n_counts) in enumerate(others_processed):
+                    n_val = n_durations.get(stage, 0.0)
+                    _, s_pct = calculate_diff(b_val, n_val)
+                    status_emoji = get_perf_status_emoji(b_val, n_val)
+                    
+                    row_str += f" {n_val:.2f} ({s_pct:+.1f}%) {status_emoji} |"
+                print(row_str)
 
+        print("\n")
         # Metadata
         print("<details>")
         print("<summary>Metadata</summary>\n")
         print(f"- Baseline Commit: `{base_data.get('commit_hash', 'N/A')}`")
-        print(f"- New Commit: `{new_data.get('commit_hash', 'N/A')}`")
+        for i, d in enumerate(others_data):
+             label = "New" if len(others_data) == 1 else other_labels[i]
+             print(f"- {label} Commit: `{d.get('commit_hash', 'N/A')}`")
         print(f"- Timestamp: {datetime.now().isoformat()}")
         print("</details>")
 
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
-        description="Compare two sglang-diffusion performance JSON files."
+        description="Compare sglang-diffusion performance JSON files."
+    )
+    parser.add_argument(
+        "files", 
+        nargs="+", 
+        help="List of JSON files. First is baseline, others are compared against it."
     )
-    parser.add_argument("baseline", help="Path to the baseline JSON file")
-    parser.add_argument("new", help="Path to the new JSON file")
     args = parser.parse_args()
 
-    compare_benchmarks(args.baseline, args.new)
+    compare_benchmarks(args.files)
diff --git a/python/sglang/multimodal_gen/docs/contributing.md b/python/sglang/multimodal_gen/docs/contributing.md
index fb8b4456b421..33a4699b8fa2 100644
--- a/python/sglang/multimodal_gen/docs/contributing.md
+++ b/python/sglang/multimodal_gen/docs/contributing.md
@@ -39,7 +39,7 @@ For PRs that impact **latency**, **throughput**, or **memory usage**, you **shou
 
 3.  **Compare**: run the compare script, which will print a Markdown table to the console
     ```bash
-    $ python python/sglang/multimodal_gen/benchmarks/compare_perf.py baseline.json new.json
+    $ python python/sglang/multimodal_gen/benchmarks/compare_perf.py baseline.json new.json [new2.json ...]
     ### Performance Comparison Report
     ...
     ```

From 5672c510b37b39d49a13c44f5b6ecb085699cb02 Mon Sep 17 00:00:00 2001
From: Mick <mickjagger19@icloud.com>
Date: Tue, 9 Dec 2025 22:44:35 +0800
Subject: [PATCH 2/3] extract report-printing helpers in compare_perf

---
 .../multimodal_gen/benchmarks/compare_perf.py | 213 +++++++++---------
 1 file changed, 109 insertions(+), 104 deletions(-)

diff --git a/python/sglang/multimodal_gen/benchmarks/compare_perf.py b/python/sglang/multimodal_gen/benchmarks/compare_perf.py
index 84681e653d11..e11e0dd80749 100644
--- a/python/sglang/multimodal_gen/benchmarks/compare_perf.py
+++ b/python/sglang/multimodal_gen/benchmarks/compare_perf.py
@@ -109,12 +109,109 @@ def _load_benchmark_file(file_path: str) -> Dict[str, Any]:
         return json.load(f)
 
 
+def _get_status_emoji_from_diff_percent(diff_pct):
+    if diff_pct < -2.0:
+        return "✅"
+    elif diff_pct > 2.0:
+        return "❌"
+    else:
+        return "⚪️"
+
+
+def _print_single_comparison_report(others_data, base_e2e, combined_order, base_durations, others_processed,
+                                    base_counts):
+    # mode 1: single comparison
+    new_data = others_data[0]
+    new_e2e = new_data.get("total_duration_ms", 0)
+    diff_ms, diff_pct = calculate_diff(base_e2e, new_e2e)
+
+    status = _get_status_emoji_from_diff_percent(diff_pct)
+    print("#### 1. High-level Summary")
+    print("| Metric | Baseline | New | Diff | Status |")
+    print("| :--- | :--- | :--- | :--- | :--- |")
+    print(
+        f"| **E2E Latency** | {base_e2e:.2f} ms | {new_e2e:.2f} ms | **{diff_ms:+.2f} ms ({diff_pct:+.1f}%)** | {status} |"
+    )
+    print(
+        f"| **Throughput** | {1000 / base_e2e if base_e2e else 0:.2f} req/s | {1000 / new_e2e if new_e2e else 0:.2f} req/s | - | - |"
+    )
+    print("\n")
+
+    print("#### 2. Stage Breakdown")
+    print("| Stage Name | Baseline (ms) | New (ms) | Diff (ms) | Diff (%) | Status |")
+    print("| :--- | :--- | :--- | :--- | :--- | :--- |")
+
+    new_durations, _, new_counts = others_processed[0]
+
+    for stage in combined_order:
+        b_val = base_durations.get(stage, 0.0)
+        n_val = new_durations.get(stage, 0.0)
+        b_count = base_counts.get(stage, 1)
+        n_count = new_counts.get(stage, 1)
+
+        s_diff, s_pct = calculate_diff(b_val, n_val)
+
+        count_str = ""
+        if stage == "Denoising Loop":
+            count_str = (
+                f" ({n_count} steps)"
+                if n_count == b_count
+                else f" ({b_count}->{n_count} steps)"
+            )
+
+        status_emoji = get_perf_status_emoji(b_val, n_val)
+        print(
+            f"| {stage}{count_str} | {b_val:.2f} | {n_val:.2f} | {s_diff:+.2f} | {s_pct:+.1f}% | {status_emoji} |"
+        )
+
+
+def _print_multi_comparison_report(base_e2e, others_data, other_labels, combined_order, base_durations,
+                                   others_processed):
+    print("#### 1. High-level Summary")
+    # Header
+    header = "| Metric | Baseline | " + " | ".join(other_labels) + " |"
+    sep = "| :--- | :--- | " + " | ".join([":---"] * len(other_labels)) + " |"
+    print(header)
+    print(sep)
+
+    # E2E Row
+    row_e2e = f"| **E2E Latency** | {base_e2e:.2f} ms |"
+    for i, d in enumerate(others_data):
+        val = d.get("total_duration_ms", 0)
+        diff_ms, diff_pct = calculate_diff(base_e2e, val)
+
+        status = _get_status_emoji_from_diff_percent(diff_pct)
+
+        row_e2e += f" {val:.2f} ms ({diff_pct:+.1f}%) {status} |"
+    print(row_e2e)
+    print("\n")
+
+    print("#### 2. Stage Breakdown")
+    # Header: Stage | Baseline | Label1 | Label2 ...
+    header = "| Stage Name | Baseline | " + " | ".join(other_labels) + " |"
+    sep = "| :--- | :--- | " + " | ".join([":---"] * len(other_labels)) + " |"
+    print(header)
+    print(sep)
+
+    for stage in combined_order:
+        b_val = base_durations.get(stage, 0.0)
+        row_str = f"| {stage} | {b_val:.2f} |"
+
+        for i, (n_durations, _, n_counts) in enumerate(others_processed):
+            n_val = n_durations.get(stage, 0.0)
+            _, s_pct = calculate_diff(b_val, n_val)
+            status_emoji = get_perf_status_emoji(b_val, n_val)
+
+            row_str += f" {n_val:.2f} ({s_pct:+.1f}%) {status_emoji} |"
+        print(row_str)
+
+
 def compare_benchmarks(
     file_paths: List[str], output_format: str = "markdown"
 ):
     """
     Compares benchmark JSON files and prints a report.
-    First file is baseline, others are compared against it.
+    First file is baseline, others will be compared against it.
     """
     if len(file_paths) < 2:
         print("Error: Need at least 2 files to compare.")
@@ -132,20 +229,17 @@ def compare_benchmarks(
     # Use filenames as labels if multiple comparisons, else just "New"
     other_labels = [os.path.basename(p) for p in file_paths[1:]]
 
-    # --- E2E Summary ---
     base_e2e = base_data.get("total_duration_ms", 0)
-    
-    # --- Stage Breakdown Pre-calculation ---
+
     base_durations, base_order, base_counts = consolidate_steps(
         base_data.get("steps", [])
     )
-    
+
     others_processed = []
     for d in others_data:
         dur, order, counts = consolidate_steps(d.get("steps", []))
         others_processed.append((dur, order, counts))
 
-    # Merge orders: Start with last New order (likely most updated), then others, then Base
     combined_order = []
     # Collect all unique stages maintaining order from newest to baseline
     for _, order, _ in reversed(others_processed):
@@ -158,102 +252,13 @@ def compare_benchmarks(
 
     if output_format == "markdown":
         print("### Performance Comparison Report\n")
-        
-        # --- MODE 1: Single Comparison (Detailed) ---
+
         if len(others_data) == 1:
-            new_data = others_data[0]
-            new_e2e = new_data.get("total_duration_ms", 0)
-            diff_ms, diff_pct = calculate_diff(base_e2e, new_e2e)
-            
-            if diff_pct < -2.0:
-                status = "✅"
-            elif diff_pct > 2.0:
-                status = "❌"
-            else:
-                status = ""
-
-            print("#### 1. High-level Summary")
-            print("| Metric | Baseline | New | Diff | Status |")
-            print("| :--- | :--- | :--- | :--- | :--- |")
-            print(
-                f"| **E2E Latency** | {base_e2e:.2f} ms | {new_e2e:.2f} ms | **{diff_ms:+.2f} ms ({diff_pct:+.1f}%)** | {status} |"
-            )
-            print(
-                f"| **Throughput** | {1000 / base_e2e if base_e2e else 0:.2f} req/s | {1000 / new_e2e if new_e2e else 0:.2f} req/s | - | - |"
-            )
-            print("\n")
-
-            print("#### 2. Stage Breakdown")
-            print("| Stage Name | Baseline (ms) | New (ms) | Diff (ms) | Diff (%) | Status |")
-            print("| :--- | :--- | :--- | :--- | :--- | :--- |")
-            
-            new_durations, _, new_counts = others_processed[0]
-            
-            for stage in combined_order:
-                b_val = base_durations.get(stage, 0.0)
-                n_val = new_durations.get(stage, 0.0)
-                b_count = base_counts.get(stage, 1)
-                n_count = new_counts.get(stage, 1)
-
-                s_diff, s_pct = calculate_diff(b_val, n_val)
-                
-                count_str = ""
-                if stage == "Denoising Loop":
-                    count_str = (
-                        f" ({n_count} steps)"
-                        if n_count == b_count
-                        else f" ({b_count}->{n_count} steps)"
-                    )
-                
-                status_emoji = get_perf_status_emoji(b_val, n_val)
-                print(
-                    f"| {stage}{count_str} | {b_val:.2f} | {n_val:.2f} | {s_diff:+.2f} | {s_pct:+.1f}% | {status_emoji} |"
-                )
-
-        # --- MODE 2: Multi Comparison (Compact) ---
+            _print_single_comparison_report(others_data, base_e2e, combined_order, base_durations, others_processed,
+                                            base_counts)
         else:
-            print("#### 1. High-level Summary")
-            # Header
-            header = "| Metric | Baseline | " + " | ".join(other_labels) + " |"
-            sep = "| :--- | :--- | " + " | ".join([":---"] * len(other_labels)) + " |"
-            print(header)
-            print(sep)
-            
-            # E2E Row
-            row_e2e = f"| **E2E Latency** | {base_e2e:.2f} ms |"
-            for i, d in enumerate(others_data):
-                val = d.get("total_duration_ms", 0)
-                diff_ms, diff_pct = calculate_diff(base_e2e, val)
-                
-                if diff_pct < -2.0:
-                    status = "✅"
-                elif diff_pct > 2.0:
-                    status = "❌"
-                else:
-                    status = "" # or ⚪️
-                
-                row_e2e += f" {val:.2f} ms ({diff_pct:+.1f}%) {status} |"
-            print(row_e2e)
-            print("\n")
-
-            print("#### 2. Stage Breakdown")
-            # Header: Stage | Baseline | Label1 | Label2 ...
-            header = "| Stage Name | Baseline | " + " | ".join(other_labels) + " |"
-            sep = "| :--- | :--- | " + " | ".join([":---"] * len(other_labels)) + " |"
-            print(header)
-            print(sep)
-
-            for stage in combined_order:
-                b_val = base_durations.get(stage, 0.0)
-                row_str = f"| {stage} | {b_val:.2f} |"
-                
-                for i, (n_durations, _, n_counts) in enumerate(others_processed):
-                    n_val = n_durations.get(stage, 0.0)
-                    _, s_pct = calculate_diff(b_val, n_val)
-                    status_emoji = get_perf_status_emoji(b_val, n_val)
-                    
-                    row_str += f" {n_val:.2f} ({s_pct:+.1f}%) {status_emoji} |"
-                print(row_str)
+            _print_multi_comparison_report(base_e2e, others_data, other_labels, combined_order, base_durations,
+                                           others_processed)
 
         print("\n")
         # Metadata
@@ -261,8 +266,8 @@ def compare_benchmarks(
         print("<summary>Metadata</summary>\n")
         print(f"- Baseline Commit: `{base_data.get('commit_hash', 'N/A')}`")
         for i, d in enumerate(others_data):
-             label = "New" if len(others_data) == 1 else other_labels[i]
-             print(f"- {label} Commit: `{d.get('commit_hash', 'N/A')}`")
+            label = "New" if len(others_data) == 1 else other_labels[i]
+            print(f"- {label} Commit: `{d.get('commit_hash', 'N/A')}`")
         print(f"- Timestamp: {datetime.now().isoformat()}")
         print("</details>")
 
@@ -272,8 +277,8 @@ def compare_benchmarks(
         description="Compare sglang-diffusion performance JSON files."
     )
     parser.add_argument(
-        "files", 
-        nargs="+", 
+        "files",
+        nargs="+",
         help="List of JSON files. First is baseline, others are compared against it."
     )
     args = parser.parse_args()

From 4a64ac7c1c60748be7104ab97ffc491fbfc83576 Mon Sep 17 00:00:00 2001
From: Mick <mickjagger19@icloud.com>
Date: Tue, 9 Dec 2025 22:47:12 +0800
Subject: [PATCH 3/3] apply code formatting to compare_perf.py

---
 .../multimodal_gen/benchmarks/compare_perf.py | 45 ++++++++++++-------
 1 file changed, 30 insertions(+), 15 deletions(-)

diff --git a/python/sglang/multimodal_gen/benchmarks/compare_perf.py b/python/sglang/multimodal_gen/benchmarks/compare_perf.py
index e11e0dd80749..d600d06d2b10 100644
--- a/python/sglang/multimodal_gen/benchmarks/compare_perf.py
+++ b/python/sglang/multimodal_gen/benchmarks/compare_perf.py
@@ -118,14 +118,14 @@ def _get_status_emoji_from_diff_percent(diff_pct):
         return "⚪️"
 
 
-def _print_single_comparison_report(others_data, base_e2e, combined_order, base_durations, others_processed,
-                                    base_counts):
-    # mode 1: single comparison
+def _print_single_comparison_report(
+    others_data, base_e2e, combined_order, base_durations, others_processed, base_counts
+):
     new_data = others_data[0]
     new_e2e = new_data.get("total_duration_ms", 0)
     diff_ms, diff_pct = calculate_diff(base_e2e, new_e2e)
-
     status = _get_status_emoji_from_diff_percent(diff_pct)
+
     print("#### 1. High-level Summary")
     print("| Metric | Baseline | New | Diff | Status |")
     print("| :--- | :--- | :--- | :--- | :--- |")
@@ -165,10 +165,15 @@ def _print_single_comparison_report(others_data, base_e2e, combined_order, base_
         )
 
 
-def _print_multi_comparison_report(base_e2e, others_data, other_labels, combined_order, base_durations,
-                                   others_processed):
+def _print_multi_comparison_report(
+    base_e2e,
+    others_data,
+    other_labels,
+    combined_order,
+    base_durations,
+    others_processed,
+):
     print("#### 1. High-level Summary")
-    # Header
     header = "| Metric | Baseline | " + " | ".join(other_labels) + " |"
     sep = "| :--- | :--- | " + " | ".join([":---"] * len(other_labels)) + " |"
     print(header)
@@ -206,9 +211,7 @@ def _print_multi_comparison_report(base_e2e, others_data, other_labels, combined
         print(row_str)
 
 
-def compare_benchmarks(
-    file_paths: List[str], output_format: str = "markdown"
-):
+def compare_benchmarks(file_paths: List[str], output_format: str = "markdown"):
     """
     Compares benchmark JSON files and prints a report.
     First file is baseline, others will be compared against it.
@@ -254,11 +257,23 @@ def compare_benchmarks(
         print("### Performance Comparison Report\n")
 
         if len(others_data) == 1:
-            _print_single_comparison_report(others_data, base_e2e, combined_order, base_durations, others_processed,
-                                            base_counts)
+            _print_single_comparison_report(
+                others_data,
+                base_e2e,
+                combined_order,
+                base_durations,
+                others_processed,
+                base_counts,
+            )
         else:
-            _print_multi_comparison_report(base_e2e, others_data, other_labels, combined_order, base_durations,
-                                           others_processed)
+            _print_multi_comparison_report(
+                base_e2e,
+                others_data,
+                other_labels,
+                combined_order,
+                base_durations,
+                others_processed,
+            )
 
         print("\n")
         # Metadata
@@ -279,7 +294,7 @@ def compare_benchmarks(
     parser.add_argument(
         "files",
         nargs="+",
-        help="List of JSON files. First is baseline, others are compared against it."
+        help="List of JSON files. First is baseline, others are compared against it.",
     )
     args = parser.parse_args()