diff --git a/analysis/scripts/upload_rollup.py b/analysis/scripts/upload_rollup.py new file mode 100644 index 00000000..cb2e3209 --- /dev/null +++ b/analysis/scripts/upload_rollup.py @@ -0,0 +1,278 @@ +#!/usr/bin/env python3 +"""Upload rollup.json files to SRT endpoint as Parquet files grouped by backend/frontend/benchmarker. + +Process: +1. Find all rollup.json files in the directory +2. Create flattened JSON for each concurrency level +3. Combine all into a single Parquet file +4. Group by (backend_type, frontend_type, benchmark_type) +5. Upload separate Parquet file for each group + +Usage: + python upload_rollup.py [--study-id STUDY] [--endpoint URL] + +Example: + python upload_rollup.py ./outputs user@example.com + python upload_rollup.py ./outputs user@example.com --study-id my-study + python upload_rollup.py ./outputs user@example.com --endpoint that accepts the upload +""" + +import argparse +from collections import defaultdict +import gzip +import json +import sys +from pathlib import Path + +import requests + + +DEFAULT_WORKDIR = Path("/tmp/srt") + + +def upload_json( + json_path: Path, + user_login: str, + session_id: str, + endpoint: str, + backend: str, + benchmarker: str, + frontend: str, + mode: str, +) -> tuple[bool, str]: + """Upload a gzipped JSON file to the endpoint. 
+ + Args: + json_path: Path to the JSON file + user_login: User login/email + session_id: Session ID for the upload + endpoint: API endpoint URL + backend: Backend type + benchmarker: Benchmark type + frontend: Frontend type + mode: Mode (disaggregated or aggregated) + + Returns: + Tuple of (success, message) + """ + json_content = json_path.read_bytes() + compressed_content = gzip.compress(json_content) + + # Use .json.gz extension to indicate gzipped JSON + filename = json_path.name + ".gz" + + try: + response = requests.post( + endpoint, + files={"file": (filename, compressed_content, "application/gzip")}, + data={ + "user_login": user_login, + "session_id": session_id, + "backend": backend, + "benchmarker": benchmarker, + "frontend": frontend, + "mode": mode, + }, + timeout=60, + ) + + if response.ok: + return True, f"HTTP {response.status_code}" + else: + return False, f"HTTP {response.status_code}: {response.text}" + + except requests.RequestException as e: + return False, f"Request failed: {e}" + + +def find_rollup_files(directory: Path) -> list[Path]: + """Recursively find all rollup.json files in a directory.""" + return list(directory.rglob("rollup.json")) + + +def read_sbatch_script(rollup_path: Path) -> str | None: + """Read the sbatch_script.sh associated with a rollup.json. + + The sbatch script is expected to be at /sbatch_script.sh + where rollup.json is at /logs/rollup.json. 
+ + Args: + rollup_path: Path to the rollup.json file + + Returns: + Content of sbatch_script.sh or None if not found + """ + # rollup.json is at /logs/rollup.json + # sbatch_script.sh is at /sbatch_script.sh + job_dir = rollup_path.parent.parent + sbatch_path = job_dir / "sbatch_script.sh" + + if sbatch_path.exists(): + try: + return sbatch_path.read_text() + except Exception: + return None + return None + + +def main(): + parser = argparse.ArgumentParser( + description="Upload rollup.json files to SRT endpoint as Parquet (grouped by backend/frontend/benchmarker)" + ) + parser.add_argument("directory", type=Path, help="Directory to search for rollup.json files") + parser.add_argument("user_login", help="User login/email for the upload") + parser.add_argument( + "--study-id", + help="Study ID (default: extracted from first job_name per group)", + ) + parser.add_argument( + "--endpoint", + default="http://localhost:8000", + help="API endpoint (default: http://localhost:8000)", + ) + parser.add_argument( + "--workdir", + type=Path, + help=f"Working directory for output files (default: {DEFAULT_WORKDIR})", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Generate Parquet files but don't upload", + ) + parser.add_argument( + "--keep-files", + action="store_true", + help="Keep generated Parquet files after upload (default: delete)", + ) + + args = parser.parse_args() + + if not args.directory.exists(): + print(f"Error: Directory does not exist: {args.directory}", file=sys.stderr) + sys.exit(1) + + # Setup workdir + workdir = args.workdir if args.workdir else DEFAULT_WORKDIR + workdir.mkdir(parents=True, exist_ok=True) + print(f"Working directory: {workdir}") + + rollup_files = find_rollup_files(args.directory) + + if not rollup_files: + print(f"No rollup.json files found in {args.directory}") + sys.exit(0) + + print(f"Found {len(rollup_files)} rollup.json files") + print(f"User: {args.user_login}") + print(f"Endpoint: {args.endpoint}") + 
print(f"Study ID: {args.study_id}") + print("---") + + failed_count = 0 + + groups: defaultdict[tuple[str, str, str], list[dict]] = defaultdict(list) + + for rollup_path in sorted(rollup_files): + print(f"Processing: {rollup_path}") + + try: + with open(rollup_path) as f: + data = json.load(f) + except Exception as e: + print(f" ✗ Failed to read: {e}") + failed_count += 1 + continue + + # Skip if no nodes_summary (job likely failed/cancelled) + if not data.get("nodes_summary"): + print(" ⚠ Skipping: no nodes_summary (job may have failed)") + continue + + mode = "aggregated" if data.get("is_aggregated") else "disaggregated" + group = (data['benchmark_type'], data['frontend_type'], data['backend_type'], mode) + # Read sbatch script for this job + sbatch_script = read_sbatch_script(rollup_path) + + # Add sbatch script to each row + if sbatch_script: + data["sbatch_script"] = sbatch_script + groups[group].append(data) + + print("---") + print(f"Total rollups processed: {len(rollup_files)}") + print(f"Total groups: {len(groups)}") + for group, rows in groups.items(): + print(f" {group}: {len(rows)}") + print(f"Failed to read: {failed_count}") + + if not groups: + print("No data to write") + return + + success_count = 0 + upload_failed_count = 0 + + for group, rows in groups.items(): + print(f"\n--- Group: {group} ---") + print(f" Rows: {len(rows)}") + + group_str = "_".join(group) + + group_filename = f"rollup_{group_str}.json" + group_path = workdir / group_filename + with open(group_path, "w") as f: + json.dump(rows, f, indent=1) + print(f" ✓ Created: {group_path}") + + if args.dry_run: + print(" Dry run - skipping upload") + continue + + # Determine study_id - use provided or first job_name in group + if args.study_id: + study_id = args.study_id + else: + study_id = rows[0]["job_name"] + + print(f" Uploading with study_id: {study_id}") + + success, message = upload_json( + group_path, + args.user_login, + study_id, + args.endpoint, + benchmarker=group[0], + 
frontend=group[1], + backend=group[2], + mode=group[3], + ) + + if success: + print(f" ✓ Uploaded ({message})") + success_count += 1 + else: + print(f" ✗ Upload failed: {message}") + upload_failed_count += 1 + + # Cleanup unless --keep-files + if not args.keep_files and not args.dry_run: + for fp in workdir.glob("*.json"): + try: + fp.unlink() + except Exception: + print(f" ✗ Failed to delete: {fp}") + print(f"\nCleaned up {len(list(workdir.glob('*.json')))} generated files") + + print("\n" + "=" * 50) + print(f"Total groups: {len(groups)}") + if not args.dry_run: + print(f"Successful uploads: {success_count}") + print(f"Failed uploads: {upload_failed_count}") + + if upload_failed_count > 0: + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/analysis/srtlog/parsers/__init__.py b/analysis/srtlog/parsers/__init__.py new file mode 100644 index 00000000..e61c1438 --- /dev/null +++ b/analysis/srtlog/parsers/__init__.py @@ -0,0 +1,247 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +""" +Parser protocols and registries for benchmark and node log parsing. + +This module provides extensible parsing infrastructure: +- BenchmarkParser: Parses benchmark.out files based on benchmark type +- NodeParser: Parses prefill/decode/agg logs based on backend type + +Usage: + from analysis.srtlog.parsers import get_benchmark_parser, get_node_parser + + # Get parser by type + bench_parser = get_benchmark_parser("sa-bench") + results = bench_parser.parse(benchmark_out_path) + + node_parser = get_node_parser("sglang") + nodes = node_parser.parse_logs(log_dir) +""" + +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any, Protocol + +from analysis.srtlog.models import NodeMetrics + + +@dataclass +class BenchmarkLaunchCommand: + """Parsed benchmark launch command information. + + Source: logs/benchmark.out + + Only contains essential fields. 
@dataclass
class BenchmarkLaunchCommand:
    """Benchmark launch command recovered from a log.

    Source: logs/benchmark.out

    Holds only the essentials as named fields; every argument parsed out of
    the command is stored in ``extra_args``.
    """

    benchmark_type: str
    raw_command: str

    # Parsed arguments, keyed by argument name
    extra_args: dict[str, Any] = field(default_factory=dict)


@dataclass
class NodeLaunchCommand:
    """Node worker launch command recovered from a log.

    Source: logs/{node}_{worker_type}_{worker_id}.out or .err

    Holds only the essentials as named fields; every argument parsed out of
    the command is stored in ``extra_args``.
    """

    backend_type: str
    worker_type: str  # prefill, decode, agg
    raw_command: str

    # Parsed arguments, keyed by argument name
    extra_args: dict[str, Any] = field(default_factory=dict)


class BenchmarkParserProtocol(Protocol):
    """Structural interface every benchmark output parser provides.

    One implementation is expected per benchmark type (sa-bench,
    mooncake-router, etc.).
    """

    @property
    def benchmark_type(self) -> str:
        """Return the benchmark type this parser handles."""

    def parse(self, benchmark_out_path: Path) -> dict[str, Any]:
        """Parse a benchmark.out file and return its results.

        Args:
            benchmark_out_path: Path to the benchmark.out file

        Returns:
            Dict of benchmark results; the exact keys are defined by the
            implementing parser.
        """

    def parse_launch_command(self, log_content: str) -> BenchmarkLaunchCommand | None:
        """Extract the benchmark launch command from log text.

        Args:
            log_content: Content of the benchmark log file

        Returns:
            BenchmarkLaunchCommand with parsed parameters, or None if not found
        """

    def parse_result_json(self, json_path: Path) -> dict[str, Any]:
        """Parse one benchmark result JSON file.

        Args:
            json_path: Path to a result JSON file

        Returns:
            Dict with parsed benchmark metrics
        """


class NodeParserProtocol(Protocol):
    """Structural interface every node log parser provides.

    One implementation is expected per backend type (sglang, trtllm, etc.),
    covering prefill/decode/agg logs.
    """

    @property
    def backend_type(self) -> str:
        """Return the backend type this parser handles."""

    def parse_logs(self, log_dir: Path) -> list[NodeMetrics]:
        """Parse all node logs in a directory.

        Args:
            log_dir: Directory containing prefill/decode/agg .out/.err files

        Returns:
            List of NodeMetrics objects, one per worker
        """

    def parse_single_log(self, log_path: Path) -> NodeMetrics | None:
        """Parse a single node log file.

        Args:
            log_path: Path to a prefill/decode/agg log file

        Returns:
            NodeMetrics object or None if parsing failed
        """

    def parse_launch_command(self, log_content: str, worker_type: str = "unknown") -> NodeLaunchCommand | None:
        """Extract the worker launch command from log text.

        Args:
            log_content: Content of the worker log file
            worker_type: Type of worker (prefill, decode, agg)

        Returns:
            NodeLaunchCommand with parsed parameters, or None if not found
        """
# Benchmark type -> parser class
_benchmark_parsers: dict[str, type] = {}

# Backend type -> parser class
_node_parsers: dict[str, type] = {}


def register_benchmark_parser(benchmark_type: str):
    """Class decorator that files a benchmark parser under ``benchmark_type``.

    Usage:
        @register_benchmark_parser("sa-bench")
        class SABenchParser:
            ...
    """

    def _register(parser_cls):
        _benchmark_parsers[benchmark_type] = parser_cls
        return parser_cls

    return _register


def register_node_parser(backend_type: str):
    """Class decorator that files a node parser under ``backend_type``.

    Usage:
        @register_node_parser("sglang")
        class SGLangNodeParser:
            ...
    """

    def _register(parser_cls):
        _node_parsers[backend_type] = parser_cls
        return parser_cls

    return _register


def get_benchmark_parser(benchmark_type: str) -> BenchmarkParserProtocol:
    """Instantiate the parser registered for a benchmark type.

    Args:
        benchmark_type: Type of benchmark (e.g., "sa-bench", "mooncake-router")

    Returns:
        A new instance of the registered parser class

    Raises:
        ValueError: If no parser registered for the benchmark type
    """
    if benchmark_type in _benchmark_parsers:
        return _benchmark_parsers[benchmark_type]()

    known = ", ".join(_benchmark_parsers.keys()) or "none"
    raise ValueError(f"No benchmark parser registered for '{benchmark_type}'. Available: {known}")


def get_node_parser(backend_type: str) -> NodeParserProtocol:
    """Instantiate the parser registered for a backend type.

    Args:
        backend_type: Type of backend (e.g., "sglang", "trtllm")

    Returns:
        A new instance of the registered parser class

    Raises:
        ValueError: If no parser registered for the backend type
    """
    if backend_type in _node_parsers:
        return _node_parsers[backend_type]()

    known = ", ".join(_node_parsers.keys()) or "none"
    raise ValueError(f"No node parser registered for '{backend_type}'. Available: {known}")


def list_benchmark_parsers() -> list[str]:
    """Return the registered benchmark parser types, in registration order."""
    return [*_benchmark_parsers]


def list_node_parsers() -> list[str]:
    """Return the registered node parser types, in registration order."""
    return [*_node_parsers]
@register_benchmark_parser("mooncake-router")
class MooncakeRouterParser:
    """Parser for Mooncake Router benchmark output.

    Parses benchmark.out files and AIPerf result JSON files from mooncake-router runs.
    """

    @property
    def benchmark_type(self) -> str:
        """Return the benchmark type this parser handles."""
        return "mooncake-router"

    def parse(self, benchmark_out_path: Path) -> dict[str, Any]:
        """Parse benchmark.out file for mooncake-router results.

        Every line is scanned for each metric; when a metric appears on
        several lines the last occurrence wins.

        Args:
            benchmark_out_path: Path to benchmark.out file

        Returns:
            Dict with aggregated benchmark results. Metrics that were not
            found stay None; a missing or unparsable file yields the defaults
            (a warning is logged).
        """
        results: dict[str, Any] = {
            "benchmark_type": self.benchmark_type,
            "output_tps": None,
            "request_throughput": None,
            "mean_ttft_ms": None,
            "mean_itl_ms": None,
            "total_requests": None,
        }

        if not benchmark_out_path.exists():
            logger.warning("benchmark.out not found: %s", benchmark_out_path)
            return results

        # Result key -> pattern capturing its value, compiled once per call
        # instead of once per line.
        # Example: "Request throughput: 3.37 req/s"
        # Example: "Output token throughput: 1150.92 tok/s"
        line_patterns = {
            "request_throughput": re.compile(r"[Rr]equest\s+throughput[:\s]+([\d.]+)"),
            "output_tps": re.compile(r"[Oo]utput\s+(?:token\s+)?throughput[:\s]+([\d.]+)"),
            "mean_ttft_ms": re.compile(r"[Tt]ime\s+to\s+first\s+token[:\s]+([\d.]+)"),
            "mean_itl_ms": re.compile(r"[Ii]nter.?token\s+latency[:\s]+([\d.]+)"),
        }

        try:
            content = benchmark_out_path.read_text()

            for line in content.split("\n"):
                for key, pattern in line_patterns.items():
                    if match := pattern.search(line):
                        results[key] = float(match.group(1))

        except Exception as e:
            # Best effort: keep whatever was parsed before the failure.
            logger.warning("Failed to parse benchmark.out: %s", e)

        return results

    def parse_result_json(self, json_path: Path) -> dict[str, Any]:
        """Parse an AIPerf result JSON file.

        Args:
            json_path: Path to profile_export_aiperf.json

        Returns:
            Dict with benchmark metrics, or an empty dict if the file could
            not be read or parsed (a warning is logged).
        """
        result: dict[str, Any] = {}

        try:
            with open(json_path, encoding="utf-8") as f:
                data = json.load(f)

            # AIPerf format has nested structure with unit and values
            result = {
                "concurrency": 0,  # Mooncake uses open-loop, no fixed concurrency
                # Throughput metrics
                "output_tps": self._get_metric(data, "output_token_throughput", "avg"),
                "request_throughput": self._get_metric(data, "request_throughput", "avg"),
                # Mean latencies (values are passed through unconverted;
                # TPOT is filled from inter_token_latency as well)
                "mean_ttft_ms": self._get_metric(data, "time_to_first_token", "avg"),
                "mean_tpot_ms": self._get_metric(data, "inter_token_latency", "avg"),
                "mean_itl_ms": self._get_metric(data, "inter_token_latency", "avg"),
                "mean_e2el_ms": self._get_metric(data, "request_latency", "avg"),
                # Median latencies
                "median_ttft_ms": self._get_metric(data, "time_to_first_token", "p50"),
                "median_tpot_ms": self._get_metric(data, "inter_token_latency", "p50"),
                "median_itl_ms": self._get_metric(data, "inter_token_latency", "p50"),
                "median_e2el_ms": self._get_metric(data, "request_latency", "p50"),
                # P99 latencies
                "p99_ttft_ms": self._get_metric(data, "time_to_first_token", "p99"),
                "p99_tpot_ms": self._get_metric(data, "inter_token_latency", "p99"),
                "p99_itl_ms": self._get_metric(data, "inter_token_latency", "p99"),
                "p99_e2el_ms": self._get_metric(data, "request_latency", "p99"),
                # Std dev latencies
                "std_ttft_ms": self._get_metric(data, "time_to_first_token", "std"),
                "std_itl_ms": self._get_metric(data, "inter_token_latency", "std"),
                "std_e2el_ms": self._get_metric(data, "request_latency", "std"),
                # Request count (both keys come from the same AIPerf metric)
                "completed": self._get_metric(data, "request_count", "avg"),
                "num_prompts": self._get_metric(data, "request_count", "avg"),
            }

            # Also extract per-user throughput if available; compare against
            # None so that a reported value of 0.0 is not dropped.
            tps_per_user = self._get_metric(data, "output_token_throughput_per_user", "avg")
            if tps_per_user is not None:
                result["output_tps_per_user"] = tps_per_user

        except Exception as e:
            logger.warning("Failed to parse %s: %s", json_path, e)

        return result

    def _get_metric(self, data: dict, metric_name: str, stat: str) -> float | None:
        """Extract a metric value from AIPerf data structure.

        Args:
            data: AIPerf JSON data
            metric_name: Name of the metric (e.g., "time_to_first_token")
            stat: Statistic to extract (e.g., "avg", "p50", "p99")

        Returns:
            Metric value as float, or None if the data is not a dict, the
            metric or statistic is absent, or the value is not numeric
        """
        if not isinstance(data, dict):
            return None

        metric_data = data.get(metric_name, {})
        if not isinstance(metric_data, dict):
            return None

        value = metric_data.get(stat)
        if value is None:
            return None
        try:
            return float(value)
        except (TypeError, ValueError):
            return None

    def parse_result_directory(self, result_dir: Path) -> list[dict[str, Any]]:
        """Parse AIPerf result files in a directory.

        Args:
            result_dir: Directory containing profile_export_aiperf.json

        Returns:
            List of result dicts (usually just one for mooncake-router);
            files without an output throughput are left out
        """
        results = []

        for json_file in self.find_aiperf_results(result_dir):
            result = self.parse_result_json(json_file)
            if result.get("output_tps") is not None:
                results.append(result)

        return results

    def find_aiperf_results(self, log_dir: Path) -> list[Path]:
        """Find all AIPerf result files in a log directory.

        Args:
            log_dir: Root log directory (searched recursively)

        Returns:
            List of paths to profile_export_aiperf.json files
        """
        return list(log_dir.rglob("profile_export_aiperf.json"))

    def parse_launch_command(self, log_content: str) -> BenchmarkLaunchCommand | None:
        """Parse the mooncake-router launch command from log content.

        Looks for command lines like:
            [CMD] aiperf profile --model ... --url ...
            genai-perf profile --model ... --endpoint ...

        Also parses header format:
            Endpoint: http://localhost:8000
            Model: Qwen/Qwen3-32B
            Workload: conversation

        Args:
            log_content: Content of the benchmark log file

        Returns:
            BenchmarkLaunchCommand with parsed parameters, or None if not found
        """
        # Imported here because the parsers package imports this module.
        from analysis.srtlog.parsers import BenchmarkLaunchCommand

        raw_command = None

        # First, try to find [CMD] tagged command (preferred - from our scripts)
        cmd_match = re.search(r"\[CMD\]\s*(.+)$", log_content, re.MULTILINE)
        if cmd_match:
            raw_command = cmd_match.group(1).strip()

        # Fallback: pattern to match genai-perf, aiperf or mooncake-router commands
        # aiperf format: aiperf profile -m "Model" --url "http://..." --concurrency 10
        if not raw_command:
            command_patterns = [
                r"(aiperf\s+profile\s+[^\n]+)",
                r"(genai-perf\s+profile\s+[^\n]+)",
                r"(python[3]?\s+.*genai_perf[^\n]+)",
                r"(python[3]?\s+.*aiperf[^\n]+)",
                r"(mooncake-router\s+[^\n]+)",
            ]

            for pattern in command_patterns:
                match = re.search(pattern, log_content, re.IGNORECASE)
                if match:
                    raw_command = match.group(1).strip()
                    break

        # If no command found, try to build from header format
        if not raw_command:
            if "Mooncake Router Benchmark" in log_content:
                raw_command = "mooncake-router-benchmark (from header)"

        if not raw_command:
            return None

        extra_args: dict[str, Any] = {}

        # In "python -m aiperf ..." the -m selects the module to run, not the
        # model. Drop that prefix so it cannot be mistaken for the -m option.
        cli_text = re.sub(r"\bpython[\d.]*\s+-m\s+\S+", " ", raw_command, count=1)

        # Parse aiperf/genai-perf arguments from command line
        # Supports both --model and -m formats, quoted and unquoted values.
        # The lookbehind keeps "-m" from matching the tail of a longer option.
        arg_patterns = {
            "model": r"(?:--model|(?<![\w-])-m)[=\s]+[\"']?([^\"'\s]+)[\"']?",
            "base_url": r"--url[=\s]+[\"']?([^\"'\s]+)[\"']?",
            "num_prompts": r"--(?:num-prompts|request-count|request)[=\s]+(\d+)",
            "request_rate": r"--request-rate[=\s]+([^\s]+)",
            "max_concurrency": r"--concurrency[=\s]+(\d+)",
            "input_len": r"--(?:synthetic-input-tokens-mean|input-sequence-length|isl)[=\s]+(\d+)",
            "output_len": r"--(?:output-tokens-mean|output-sequence-length|osl)[=\s]+(\d+)",
        }

        for name, pattern in arg_patterns.items():
            match = re.search(pattern, cli_text)
            if not match:
                continue
            value: Any = match.group(1)
            if name in ("num_prompts", "max_concurrency", "input_len", "output_len"):
                value = int(value)
            elif name == "request_rate" and value != "inf":
                # "inf" is kept as a string; other non-numeric values too.
                try:
                    value = float(value)
                except ValueError:
                    pass
            extra_args[name] = value

        # Also parse from header format (srtctl-style); command-line values win
        header_patterns = {
            "model": r"^Model:\s*(.+)$",
            "base_url": r"^Endpoint:\s*(.+)$",
            "dataset": r"^Workload:\s*(.+)$",
        }

        for name, pattern in header_patterns.items():
            if name not in extra_args:
                match = re.search(pattern, log_content, re.MULTILINE)
                if match:
                    extra_args[name] = match.group(1).strip()

        return BenchmarkLaunchCommand(
            benchmark_type=self.benchmark_type,
            raw_command=raw_command,
            extra_args=extra_args,
        )
@register_benchmark_parser("sa-bench")
class SABenchParser:
    """Parser for SA-Bench benchmark output.

    Handles benchmark.out files and the per-concurrency result JSON files
    written by SA-Bench runs.
    """

    @property
    def benchmark_type(self) -> str:
        """Return the benchmark type this parser handles."""
        return "sa-bench"

    def parse(self, benchmark_out_path: Path) -> dict[str, Any]:
        """Collect summary-line metrics from a benchmark.out file.

        Args:
            benchmark_out_path: Path to benchmark.out file

        Returns:
            Dict with one list per metric. Only lines carrying both a
            concurrency and a throughput contribute; TTFT and ITL are added
            for such a line only when present, so those lists can be shorter
            than the concurrency list.
        """
        results = {
            "benchmark_type": self.benchmark_type,
            "concurrencies": [],
            "output_tps": [],
            "mean_ttft_ms": [],
            "mean_itl_ms": [],
            "mean_tpot_ms": [],
            "p99_ttft_ms": [],
            "p99_itl_ms": [],
            "request_throughput": [],
            "completed_requests": [],
        }

        if not benchmark_out_path.exists():
            logger.warning("benchmark.out not found: %s", benchmark_out_path)
            return results

        # Example: "Concurrency: 100, Throughput: 5000 tok/s, TTFT: 150ms, ITL: 20ms"
        concurrency_re = re.compile(r"Concurrency[:\s]+(\d+)", re.IGNORECASE)
        throughput_re = re.compile(r"(?:Output\s+)?[Tt]hroughput[:\s]+([\d.]+)", re.IGNORECASE)
        ttft_re = re.compile(r"(?:Mean\s+)?TTFT[:\s]+([\d.]+)", re.IGNORECASE)
        itl_re = re.compile(r"(?:Mean\s+)?ITL[:\s]+([\d.]+)", re.IGNORECASE)

        try:
            for line in benchmark_out_path.read_text().split("\n"):
                lowered = line.lower()
                if "concurrency" not in lowered and "throughput" not in lowered:
                    continue

                concurrency = concurrency_re.search(line)
                throughput = throughput_re.search(line)
                if not (concurrency and throughput):
                    continue

                results["concurrencies"].append(int(concurrency.group(1)))
                results["output_tps"].append(float(throughput.group(1)))

                ttft = ttft_re.search(line)
                if ttft:
                    results["mean_ttft_ms"].append(float(ttft.group(1)))
                itl = itl_re.search(line)
                if itl:
                    results["mean_itl_ms"].append(float(itl.group(1)))

        except Exception as e:
            logger.warning("Failed to parse benchmark.out: %s", e)

        return results

    def parse_result_json(self, json_path: Path) -> dict[str, Any]:
        """Read the known metric fields from one SA-Bench result JSON file.

        Args:
            json_path: Path to result JSON (e.g., result_c100.json)

        Returns:
            Dict holding every known field under its original JSON name (None
            where the file lacks it), or an empty dict if the file could not
            be read or parsed (a warning is logged).
        """
        # Field names are kept exactly as in the result JSON.
        field_names = (
            "max_concurrency",
            # Throughput metrics
            "output_throughput",
            "total_token_throughput",
            "request_throughput",
            "request_goodput",
            "request_rate",
            # Mean latencies
            "mean_ttft_ms",
            "mean_tpot_ms",
            "mean_itl_ms",
            "mean_e2el_ms",
            # Median latencies
            "median_ttft_ms",
            "median_tpot_ms",
            "median_itl_ms",
            "median_e2el_ms",
            # P99 latencies
            "p99_ttft_ms",
            "p99_tpot_ms",
            "p99_itl_ms",
            "p99_e2el_ms",
            # Std dev latencies
            "std_ttft_ms",
            "std_tpot_ms",
            "std_itl_ms",
            "std_e2el_ms",
            # Token counts
            "total_input_tokens",
            "total_output_tokens",
            # Metadata
            "duration",
            "completed",
            "num_prompts",
        )

        result = {}

        try:
            with open(json_path) as f:
                data = json.load(f)
            result = {name: data.get(name) for name in field_names}
        except Exception as e:
            logger.warning("Failed to parse %s: %s", json_path, e)

        return result

    def parse_result_directory(self, result_dir: Path) -> list[dict[str, Any]]:
        """Parse every ``*.json`` file directly inside a result directory.

        Args:
            result_dir: Directory containing result_*.json files

        Returns:
            Result dicts that carry a max_concurrency, sorted by it
        """
        parsed = (self.parse_result_json(json_file) for json_file in result_dir.glob("*.json"))
        usable = [result for result in parsed if result.get("max_concurrency") is not None]
        return sorted(usable, key=lambda x: x.get("max_concurrency", 0) or 0)

    @staticmethod
    def _coerce(name: str, text: str) -> Any:
        """Convert a captured argument string to int/float where the field calls for it."""
        if name in ("num_prompts", "max_concurrency", "input_len", "output_len"):
            return int(text)
        if name == "request_rate" and text != "inf":
            # "inf" and other non-numeric rates stay strings.
            try:
                return float(text)
            except ValueError:
                return text
        return text

    def parse_launch_command(self, log_content: str) -> BenchmarkLaunchCommand | None:
        """Recover the SA-Bench launch command from log text.

        Looks for command lines like:
            [CMD] python -m sglang.bench_serving --model ... --base-url ...
            python -m sglang.bench_serving --model ... --base-url ...

        Also parses SA-Bench Config header format:
            SA-Bench Config: endpoint=http://localhost:8000; isl=8192; osl=1024; ...

        Args:
            log_content: Content of the benchmark log file

        Returns:
            BenchmarkLaunchCommand with parsed parameters, or None if not found
        """
        # Imported here because the parsers package imports this module.
        from analysis.srtlog.parsers import BenchmarkLaunchCommand

        raw_command = None

        # Preferred source: a [CMD] tagged line
        tagged = re.search(r"\[CMD\]\s*(.+)$", log_content, re.MULTILINE)
        if tagged:
            raw_command = tagged.group(1).strip()

        # Fallback: an sa-bench / sglang.bench_serving invocation
        if not raw_command:
            for pattern in (
                r"(python[3]?\s+-m\s+sglang\.bench_serving\s+[^\n]+)",
                r"(sa-bench\s+[^\n]+)",
                r"(python[3]?\s+.*bench_serving\.py\s+[^\n]+)",
            ):
                found = re.search(pattern, log_content, re.IGNORECASE)
                if found:
                    raw_command = found.group(1).strip()
                    break

        # Last resort: the SA-Bench Config header line
        if not raw_command:
            header = re.search(r"(SA-Bench Config:[^\n]+)", log_content)
            if header:
                raw_command = header.group(1).strip()

        if not raw_command:
            return None

        # Command-line style arguments
        cli_patterns = {
            "model": r"--model[=\s]+([^\s]+)",
            "base_url": r"--base-url[=\s]+([^\s]+)",
            "num_prompts": r"--num-prompts?[=\s]+(\d+)",
            "request_rate": r"--request-rate[=\s]+([^\s]+)",
            "max_concurrency": r"--max-concurrency[=\s]+(\d+)",
            "input_len": r"--(?:input-len|random-input-len)[=\s]+(\d+)",
            "output_len": r"--(?:output-len|random-output-len)[=\s]+(\d+)",
            "dataset": r"--dataset[=\s]+([^\s]+)",
            "dataset_path": r"--dataset-path[=\s]+([^\s]+)",
        }

        # SA-Bench Config header style
        # Format: SA-Bench Config: endpoint=http://localhost:8000; isl=8192; osl=1024; concurrencies=28; req_rate=inf; model=dsr1-fp8
        header_patterns = {
            "base_url": r"endpoint=([^;\s]+)",
            "model": r"model=([^;\s]+)",
            "input_len": r"isl=(\d+)",
            "output_len": r"osl=(\d+)",
            "max_concurrency": r"concurrencies=(\d+)",
            "request_rate": r"req_rate=([^;\s]+)",
        }

        extra_args: dict[str, Any] = {}

        # Tables are applied in order, so a command-line value is never
        # overwritten by a header value for the same field.
        for table in (cli_patterns, header_patterns):
            for name, pattern in table.items():
                if name in extra_args:
                    continue
                found = re.search(pattern, raw_command)
                if found:
                    extra_args[name] = self._coerce(name, found.group(1))

        return BenchmarkLaunchCommand(
            benchmark_type=self.benchmark_type,
            raw_command=raw_command,
            extra_args=extra_args,
        )
+++ b/analysis/srtlog/parsers/nodes/__init__.py @@ -0,0 +1,10 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Node log parsers for different backends.""" + +from analysis.srtlog.parsers.nodes.sglang import SGLangNodeParser +from analysis.srtlog.parsers.nodes.trtllm import TRTLLMNodeParser + +__all__ = ["SGLangNodeParser", "TRTLLMNodeParser"] + diff --git a/analysis/srtlog/parsers/nodes/sglang.py b/analysis/srtlog/parsers/nodes/sglang.py new file mode 100644 index 00000000..9fbc6e6a --- /dev/null +++ b/analysis/srtlog/parsers/nodes/sglang.py @@ -0,0 +1,431 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""SGLang node log parser. + +Parses logs with format: + [2m2025-12-30T15:52:38.206058Z[0m [32m INFO[0m ... Decode batch, #running-req: 5, ... + +This parser handles SGLang structured logging format with ISO 8601 timestamps. +""" + +from __future__ import annotations + +import logging +import os +import re +from pathlib import Path +from typing import TYPE_CHECKING, Any + +from analysis.srtlog.models import BatchMetrics, MemoryMetrics, NodeMetrics +from analysis.srtlog.parsers import register_node_parser + +if TYPE_CHECKING: + from analysis.srtlog.parsers import NodeLaunchCommand + +logger = logging.getLogger(__name__) + + +# ANSI escape code pattern for stripping colors +ANSI_ESCAPE = re.compile(r"\x1b\[[0-9;]*m") + + +@register_node_parser("sglang") +class SGLangNodeParser: + """Parser for SGLang node logs. + + Handles SGLang structured logging with ISO 8601 timestamps. + May contain ANSI color codes which are stripped during parsing. + """ + + @property + def backend_type(self) -> str: + return "sglang" + + def parse_logs(self, log_dir: Path) -> list[NodeMetrics]: + """Parse all prefill/decode/agg log files in a directory. 
+ + Args: + log_dir: Directory containing *_prefill_*.out, *_decode_*.out, *_agg_*.out files + + Returns: + List of NodeMetrics objects + """ + log_dir = Path(log_dir) + nodes = [] + + if not log_dir.exists(): + logger.error("Log directory does not exist: %s", log_dir) + return nodes + + # Find all worker log files + for file in os.listdir(log_dir): + if not (file.endswith(".err") or file.endswith(".out")): + continue + if not any(wt in file for wt in ("prefill", "decode", "agg")): + continue + + filepath = log_dir / file + node = self.parse_single_log(filepath) + if node: + nodes.append(node) + + logger.info("Parsed %d node log files from %s", len(nodes), log_dir) + return nodes + + def parse_single_log(self, log_path: Path) -> NodeMetrics | None: + """Parse a single node log file. + + Args: + log_path: Path to a prefill/decode/agg log file + + Returns: + NodeMetrics object or None if parsing failed + """ + node_info = self._extract_node_info_from_filename(str(log_path)) + if not node_info: + logger.warning( + "Could not extract node info from filename: %s. 
" + "Expected format: __.err or .out", + log_path, + ) + return None + + batches = [] + memory_snapshots = [] + config = {} + + try: + with open(log_path) as f: + for line in f: + # Strip ANSI escape codes + clean_line = ANSI_ESCAPE.sub("", line) + + # Parse prefill batch metrics + batch_metrics = self._parse_prefill_batch_line(clean_line) + if batch_metrics: + batches.append( + BatchMetrics( + timestamp=batch_metrics["timestamp"], + dp=0, # Default since not in log + tp=0, + ep=0, + batch_type=batch_metrics["type"], + new_seq=batch_metrics.get("new_seq"), + new_token=batch_metrics.get("new_token"), + cached_token=batch_metrics.get("cached_token"), + token_usage=batch_metrics.get("token_usage"), + running_req=batch_metrics.get("running_req"), + queue_req=batch_metrics.get("queue_req"), + prealloc_req=batch_metrics.get("prealloc_req"), + inflight_req=batch_metrics.get("inflight_req"), + input_throughput=batch_metrics.get("input_throughput"), + ) + ) + + # Parse decode batch metrics + decode_metrics = self._parse_decode_batch_line(clean_line) + if decode_metrics: + batches.append( + BatchMetrics( + timestamp=decode_metrics["timestamp"], + dp=0, + tp=0, + ep=0, + batch_type=decode_metrics["type"], + running_req=decode_metrics.get("running_req"), + queue_req=decode_metrics.get("queue_req"), + prealloc_req=decode_metrics.get("prealloc_req"), + transfer_req=decode_metrics.get("transfer_req"), + token_usage=decode_metrics.get("token_usage"), + preallocated_usage=decode_metrics.get("preallocated_usage"), + num_tokens=decode_metrics.get("num_tokens"), + gen_throughput=decode_metrics.get("gen_throughput"), + ) + ) + + # Parse memory metrics + mem_metrics = self._parse_memory_line(clean_line) + if mem_metrics: + memory_snapshots.append( + MemoryMetrics( + timestamp=mem_metrics["timestamp"], + dp=0, + tp=0, + ep=0, + metric_type=mem_metrics["type"], + avail_mem_gb=mem_metrics.get("avail_mem_gb"), + mem_usage_gb=mem_metrics.get("mem_usage_gb"), + 
kv_cache_gb=mem_metrics.get("kv_cache_gb"), + kv_tokens=mem_metrics.get("kv_tokens"), + ) + ) + + # Extract TP/DP/EP configuration from server_args + if "tp_size=" in clean_line: + tp_match = re.search(r"tp_size=(\d+)", clean_line) + dp_match = re.search(r"dp_size=(\d+)", clean_line) + ep_match = re.search(r"ep_size=(\d+)", clean_line) + + if tp_match: + config["tp_size"] = int(tp_match.group(1)) + if dp_match: + config["dp_size"] = int(dp_match.group(1)) + if ep_match: + config["ep_size"] = int(ep_match.group(1)) + + except Exception as e: + logger.error("Error parsing %s: %s", log_path, e) + return None + + total_metrics = len(batches) + len(memory_snapshots) + if total_metrics == 0: + logger.debug("Parsed %s but found no batch/memory metrics", log_path) + + logger.debug("Parsed %s: %d batches, %d memory snapshots", log_path, len(batches), len(memory_snapshots)) + + return NodeMetrics( + node_info=node_info, + batches=batches, + memory_snapshots=memory_snapshots, + config=config, + ) + + def _parse_timestamp(self, line: str) -> str | None: + """Extract ISO 8601 timestamp from log line. 
+ + Example: 2025-12-30T15:52:38.206058Z + """ + match = re.search(r"(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+Z?)", line) + if match: + return match.group(1) + return None + + def _parse_prefill_batch_line(self, line: str) -> dict | None: + """Parse prefill batch log line for metrics.""" + if "Prefill batch" not in line: + return None + + timestamp = self._parse_timestamp(line) + if not timestamp: + return None + + metrics = {"timestamp": timestamp, "type": "prefill"} + + patterns = { + "new_seq": r"#new-seq:\s*(\d+)", + "new_token": r"#new-token:\s*(\d+)", + "cached_token": r"#cached-token:\s*(\d+)", + "token_usage": r"token usage:\s*([\d.]+)", + "running_req": r"#running-req:\s*(\d+)", + "queue_req": r"#queue-req:\s*(\d+)", + "prealloc_req": r"#prealloc-req:\s*(\d+)", + "inflight_req": r"#inflight-req:\s*(\d+)", + "input_throughput": r"input throughput \(token/s\):\s*([\d.]+)", + } + + for key, pattern in patterns.items(): + match = re.search(pattern, line) + if match: + value = match.group(1) + metrics[key] = float(value) if "." in value else int(value) + + return metrics + + def _parse_decode_batch_line(self, line: str) -> dict | None: + """Parse decode batch log line for metrics.""" + if "Decode batch" not in line: + return None + + timestamp = self._parse_timestamp(line) + if not timestamp: + return None + + metrics = {"timestamp": timestamp, "type": "decode"} + + patterns = { + "running_req": r"#running-req:\s*(\d+)", + "num_tokens": r"#token:\s*(\d+)", + "token_usage": r"token usage:\s*([\d.]+)", + "preallocated_usage": r"pre-allocated usage:\s*([\d.]+)", + "prealloc_req": r"#prealloc-req:\s*(\d+)", + "transfer_req": r"#transfer-req:\s*(\d+)", + "queue_req": r"#queue-req:\s*(\d+)", + "gen_throughput": r"gen throughput \(token/s\):\s*([\d.]+)", + } + + for key, pattern in patterns.items(): + match = re.search(pattern, line) + if match: + value = match.group(1) + metrics[key] = float(value) if "." 
in value else int(value) + + return metrics + + def _parse_memory_line(self, line: str) -> dict | None: + """Parse memory-related log lines.""" + timestamp = self._parse_timestamp(line) + if not timestamp: + return None + + metrics = {"timestamp": timestamp} + + # Parse available memory from "avail mem=75.11 GB" + avail_match = re.search(r"avail mem=([\d.]+)\s*GB", line) + if avail_match: + metrics["avail_mem_gb"] = float(avail_match.group(1)) + metrics["type"] = "memory" + + # Parse memory usage from "mem usage=107.07 GB" + usage_match = re.search(r"mem usage=([\d.]+)\s*GB", line) + if usage_match: + metrics["mem_usage_gb"] = float(usage_match.group(1)) + metrics["type"] = "memory" + + # Parse KV cache size from "KV size: 17.16 GB" + kv_match = re.search(r"KV size:\s*([\d.]+)\s*GB", line) + if kv_match: + metrics["kv_cache_gb"] = float(kv_match.group(1)) + metrics["type"] = "kv_cache" + + # Parse token count from "#tokens: 524288" + token_match = re.search(r"#tokens:\s*(\d+)", line) + if token_match: + metrics["kv_tokens"] = int(token_match.group(1)) + + # Parse from "Capturing batches" progress lines + # Example: "Capturing batches (bs=256 avail_mem=6.32 GB)" + capture_match = re.search(r"avail_mem=([\d.]+)\s*GB", line) + if capture_match and "type" not in metrics: + metrics["avail_mem_gb"] = float(capture_match.group(1)) + metrics["type"] = "memory" + + return metrics if "type" in metrics else None + + def _extract_node_info_from_filename(self, filename: str) -> dict | None: + """Extract node name and worker info from filename. 
+ + Example: eos0219_prefill_w0.out + Returns: {'node': 'eos0219', 'worker_type': 'prefill', 'worker_id': 'w0'} + """ + match = re.match( + r"(.+)_(prefill|decode|agg|frontend)_([^.]+)\.(err|out)", + os.path.basename(filename), + ) + if match: + return { + "node": match.group(1), + "worker_type": match.group(2), + "worker_id": match.group(3), + } + return None + + def parse_launch_command(self, log_content: str, worker_type: str = "unknown") -> NodeLaunchCommand | None: + """Parse the SGLang worker launch command from log content. + + Looks for command lines or ServerArgs in the log. + + Args: + log_content: Content of the worker log file + worker_type: Type of worker (prefill, decode, agg) + + Returns: + NodeLaunchCommand with parsed parameters, or None if not found + """ + from analysis.srtlog.parsers import NodeLaunchCommand + + # Strip ANSI codes for cleaner parsing + clean_content = ANSI_ESCAPE.sub("", log_content) + + raw_command = None + + # First, try to find [CMD] tagged command (preferred - from our scripts) + cmd_match = re.search(r"\[CMD\]\s*(.+)$", clean_content, re.MULTILINE) + if cmd_match: + raw_command = cmd_match.group(1).strip() + + # Fallback: pattern to match sglang launch commands + if not raw_command: + patterns = [ + r"(python[3]?\s+-m\s+sglang\.launch_server\s+[^\n]+)", + r"(python[3]?\s+.*launch_server\.py\s+[^\n]+)", + r"(sglang\.launch_server\s+[^\n]+)", + ] + + for pattern in patterns: + match = re.search(pattern, clean_content, re.IGNORECASE) + if match: + raw_command = match.group(1).strip() + break + + # Also try to parse from ServerArgs() log line + if not raw_command: + server_args_match = re.search(r"server_args=ServerArgs\((.*?)\)", clean_content, re.DOTALL) + if server_args_match: + raw_command = f"ServerArgs({server_args_match.group(1)[:200]}...)" + + if not raw_command: + return None + + extra_args: dict[str, Any] = {} + + # Parse SGLang server arguments (from command line) + arg_patterns = { + "model_path": 
r"--model(?:-path)?[=\s]+([^\s]+)", + "served_model_name": r"--served-model-name[=\s]+([^\s]+)", + "tp_size": r"--tp-size[=\s]+(\d+)", + "pp_size": r"--pp-size[=\s]+(\d+)", + "dp_size": r"--dp-size[=\s]+(\d+)", + "ep_size": r"--ep-size[=\s]+(\d+)", + "host": r"--host[=\s]+([^\s]+)", + "port": r"--port[=\s]+(\d+)", + "max_num_seqs": r"--max-(?:num-seqs|running-requests)[=\s]+(\d+)", + "max_model_len": r"--(?:max-model-len|context-length)[=\s]+(\d+)", + "kv_cache_dtype": r"--kv-cache-dtype[=\s]+([^\s]+)", + "gpu_memory_utilization": r"--(?:mem-fraction-static|gpu-memory-utilization)[=\s]+([\d.]+)", + "disaggregation_mode": r"--disaggregation-mode[=\s]+([^\s]+)", + "nccl_init_addr": r"--(?:dist-init-addr|nccl-init-addr)[=\s]+([^\s]+)", + } + + # Also parse from ServerArgs format + server_args_patterns = { + "model_path": r"model_path=['\"]?([^'\"]+)['\"]?", + "served_model_name": r"served_model_name=['\"]?([^'\"]+)['\"]?", + "tp_size": r"tp_size=(\d+)", + "pp_size": r"pp_size=(\d+)", + "dp_size": r"dp_size=(\d+)", + "ep_size": r"ep_size=(\d+)", + "host": r"host=['\"]?([^'\"]+)['\"]?", + "port": r"port=(\d+)", + "max_num_seqs": r"max_running_requests=(\d+)", + "max_model_len": r"context_length=(\d+)", + "disaggregation_mode": r"disaggregation_mode=['\"]?([^'\"]+)['\"]?", + } + + for field, pattern in arg_patterns.items(): + match = re.search(pattern, raw_command) + if match: + value: Any = match.group(1) + if field in ("tp_size", "pp_size", "dp_size", "ep_size", "port", "max_num_seqs", "max_model_len"): + value = int(value) + elif field == "gpu_memory_utilization": + value = float(value) + extra_args[field] = value + + # Try ServerArgs patterns for any missing fields + for field, pattern in server_args_patterns.items(): + if field not in extra_args: + match = re.search(pattern, clean_content) + if match: + value = match.group(1) + if field in ("tp_size", "pp_size", "dp_size", "ep_size", "port", "max_num_seqs", "max_model_len"): + value = int(value) + extra_args[field] 
= value + + return NodeLaunchCommand( + backend_type="sglang", + worker_type=worker_type, + raw_command=raw_command, + extra_args=extra_args, + ) + diff --git a/analysis/srtlog/parsers/nodes/trtllm.py b/analysis/srtlog/parsers/nodes/trtllm.py new file mode 100644 index 00000000..7f5aa0e8 --- /dev/null +++ b/analysis/srtlog/parsers/nodes/trtllm.py @@ -0,0 +1,469 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""TRTLLM node log parser. + +Parses logs from TensorRT-LLM workers launched via dynamo.trtllm. +Example log format: + [33mRank0 run python3 -m dynamo.trtllm --model-path /model --served-model-name dsr1-fp8 ... + Initializing the worker with config: Config(namespace=dynamo, component=prefill, ...) +""" + +from __future__ import annotations + +import logging +import os +import re +from pathlib import Path +from typing import TYPE_CHECKING, Any + +from analysis.srtlog.models import BatchMetrics, MemoryMetrics, NodeMetrics +from analysis.srtlog.parsers import register_node_parser + +if TYPE_CHECKING: + from analysis.srtlog.parsers import NodeLaunchCommand + +logger = logging.getLogger(__name__) + +# ANSI escape code pattern for stripping colors +ANSI_ESCAPE = re.compile(r"\x1b\[[0-9;]*m") + + +@register_node_parser("trtllm") +class TRTLLMNodeParser: + """Parser for TensorRT-LLM node logs. + + Parses logs from TRTLLM workers, including: + - Launch command from dynamo.trtllm + - Worker configuration from Config() dump + - MPI rank and world size information + """ + + @property + def backend_type(self) -> str: + return "trtllm" + + def parse_logs(self, log_dir: Path) -> list[NodeMetrics]: + """Parse all TRTLLM node logs in a directory. 
+ + Args: + log_dir: Directory containing *_prefill_*.out, *_decode_*.out files + + Returns: + List of NodeMetrics objects + """ + log_dir = Path(log_dir) + nodes = [] + + if not log_dir.exists(): + logger.error("Log directory does not exist: %s", log_dir) + return nodes + + # Find all worker log files + for file in os.listdir(log_dir): + if not (file.endswith(".err") or file.endswith(".out")): + continue + if not any(wt in file for wt in ("prefill", "decode", "agg")): + continue + + filepath = log_dir / file + node = self.parse_single_log(filepath) + if node: + nodes.append(node) + + logger.info("Parsed %d TRTLLM node log files from %s", len(nodes), log_dir) + return nodes + + def parse_single_log(self, log_path: Path) -> NodeMetrics | None: + """Parse a single TRTLLM log file. + + Args: + log_path: Path to a node log file + + Returns: + NodeMetrics object or None if parsing failed + """ + node_info = self._extract_node_info_from_filename(str(log_path)) + if not node_info: + logger.warning("Could not extract node info from filename: %s", log_path) + return None + + batches = [] + memory_snapshots = [] + config = {} + + try: + # Handle encoding issues gracefully + content = log_path.read_text(errors="replace") + clean_content = ANSI_ESCAPE.sub("", content) + + # Extract MPI configuration + mpi_size_match = re.search(r"tllm_mpi_size:\s*(\d+)", clean_content) + if mpi_size_match: + config["mpi_world_size"] = int(mpi_size_match.group(1)) + + # Extract TP/PP from Config() dump + config_match = re.search(r"Config\((.*?)\)", clean_content) + if config_match: + config_str = config_match.group(1) + + tp_match = re.search(r"tensor_parallel_size=(\d+)", config_str) + if tp_match: + config["tp_size"] = int(tp_match.group(1)) + + pp_match = re.search(r"pipeline_parallel_size=(\d+)", config_str) + if pp_match: + config["pp_size"] = int(pp_match.group(1)) + + ep_match = re.search(r"expert_parallel_size=(\d+)", config_str) + if ep_match: + config["ep_size"] = 
int(ep_match.group(1)) + + max_batch_match = re.search(r"max_batch_size=(\d+)", config_str) + if max_batch_match: + config["max_batch_size"] = int(max_batch_match.group(1)) + + max_tokens_match = re.search(r"max_num_tokens=(\d+)", config_str) + if max_tokens_match: + config["max_num_tokens"] = int(max_tokens_match.group(1)) + + max_seq_match = re.search(r"max_seq_len=(\d+)", config_str) + if max_seq_match: + config["max_seq_len"] = int(max_seq_match.group(1)) + + # Extract from separate trtllm_config YAML references + yaml_match = re.search(r"extra_engine_args=([^\s,]+\.yaml)", clean_content) + if yaml_match: + config["extra_engine_args"] = yaml_match.group(1) + + # Also extract from TensorRT-LLM engine args line which has actual parallelism + engine_args_match = re.search(r"TensorRT-LLM engine args:\s*\{([^}]+)", clean_content) + if engine_args_match: + engine_str = engine_args_match.group(1) + + engine_tp_match = re.search(r"'tensor_parallel_size':\s*(\d+)", engine_str) + if engine_tp_match: + config["tp_size"] = int(engine_tp_match.group(1)) + + engine_pp_match = re.search(r"'pipeline_parallel_size':\s*(\d+)", engine_str) + if engine_pp_match: + config["pp_size"] = int(engine_pp_match.group(1)) + + engine_ep_match = re.search(r"'moe_expert_parallel_size':\s*(\d+)", engine_str) + if engine_ep_match: + config["ep_size"] = int(engine_ep_match.group(1)) + + engine_batch_match = re.search(r"'max_batch_size':\s*(\d+)", engine_str) + if engine_batch_match: + config["max_batch_size"] = int(engine_batch_match.group(1)) + + engine_tokens_match = re.search(r"'max_num_tokens':\s*(\d+)", engine_str) + if engine_tokens_match: + config["max_num_tokens"] = int(engine_tokens_match.group(1)) + + engine_seq_match = re.search(r"'max_seq_len':\s*(\d+)", engine_str) + if engine_seq_match: + config["max_seq_len"] = int(engine_seq_match.group(1)) + + # Parse iteration logs for batch metrics + # Format: iter = X, ... 
num_scheduled_requests: X, states = {'num_ctx_requests': X, 'num_ctx_tokens': X, 'num_generation_tokens': X} + batches = self._parse_iteration_logs(clean_content, node_info.get("worker_type", "unknown")) + + # Parse memory info + memory_snapshots = self._parse_memory_info(clean_content) + + except Exception as e: + logger.error("Error parsing %s: %s", log_path, e) + return None + + logger.debug("Parsed %s: %d batches, %d memory snapshots, config=%s", log_path, len(batches), len(memory_snapshots), config) + + return NodeMetrics( + node_info=node_info, + batches=batches, + memory_snapshots=memory_snapshots, + config=config, + ) + + def _parse_iteration_logs(self, content: str, worker_type: str) -> list[BatchMetrics]: + """Parse TRTLLM iteration logs for batch metrics. + + Format: + [01/16/2026-06:20:17] [TRT-LLM] [RANK 0] [I] iter = 5559, ..., num_scheduled_requests: 1, + states = {'num_ctx_requests': 0, 'num_ctx_tokens': 0, 'num_generation_tokens': 3} + + Args: + content: Log file content (ANSI stripped) + worker_type: Worker type (prefill, decode) + + Returns: + List of BatchMetrics objects + """ + batches = [] + + # Pattern to match TRTLLM iteration logs + iter_pattern = re.compile( + r"\[(\d{2}/\d{2}/\d{4}-\d{2}:\d{2}:\d{2})\].*" + r"iter\s*=\s*(\d+).*" + r"num_scheduled_requests:\s*(\d+).*" + r"states\s*=\s*\{([^}]+)\}" + ) + + for match in iter_pattern.finditer(content): + timestamp = match.group(1) + iteration = int(match.group(2)) + num_scheduled = int(match.group(3)) + states_str = match.group(4) + + # Parse states dict + ctx_requests = 0 + ctx_tokens = 0 + gen_tokens = 0 + + ctx_req_match = re.search(r"'num_ctx_requests':\s*(\d+)", states_str) + if ctx_req_match: + ctx_requests = int(ctx_req_match.group(1)) + + ctx_tok_match = re.search(r"'num_ctx_tokens':\s*(\d+)", states_str) + if ctx_tok_match: + ctx_tokens = int(ctx_tok_match.group(1)) + + gen_tok_match = re.search(r"'num_generation_tokens':\s*(\d+)", states_str) + if gen_tok_match: + gen_tokens = 
int(gen_tok_match.group(1)) + + # Determine batch type based on content + if ctx_tokens > 0: + batch_type = "prefill" + elif gen_tokens > 0: + batch_type = "decode" + else: + batch_type = worker_type + + # Parse step time if available + step_time = None + step_match = re.search(r"host_step_time\s*=\s*([\d.]+)ms", match.group(0)) + if step_match: + step_time = float(step_match.group(1)) + + # Compute throughput (tokens/s) + input_throughput = None + gen_throughput = None + if step_time and step_time > 0: + if batch_type == "prefill" and ctx_tokens > 0: + # Prefill throughput: context tokens / step time + input_throughput = (ctx_tokens * 1000.0) / step_time + elif batch_type == "decode" and gen_tokens > 0: + # Decode throughput: generation tokens / step time + gen_throughput = (gen_tokens * 1000.0) / step_time + + batches.append( + BatchMetrics( + timestamp=timestamp, + dp=0, + tp=0, + ep=0, + batch_type=batch_type, + running_req=num_scheduled, + new_token=ctx_tokens if batch_type == "prefill" else None, + num_tokens=gen_tokens if batch_type == "decode" else None, + input_throughput=input_throughput, + gen_throughput=gen_throughput, + ) + ) + + return batches + + def _parse_memory_info(self, content: str) -> list[MemoryMetrics]: + """Parse TRTLLM memory information. + + Format: + Peak memory during memory usage profiling (torch + non-torch): 91.46 GiB, + available KV cache memory when calculating max tokens: 41.11 GiB, + fraction is set 0.85, kv size is 35136. device total memory 139.81 GiB + + Args: + content: Log file content (ANSI stripped) + + Returns: + List of MemoryMetrics objects + """ + memory_snapshots = [] + + # Pattern to match memory info + mem_pattern = re.compile( + r"\[(\d{2}/\d{2}/\d{4}-\d{2}:\d{2}:\d{2})\].*" + r"Peak memory.*?:\s*([\d.]+)\s*GiB.*?" + r"available KV cache memory.*?:\s*([\d.]+)\s*GiB.*?" 
+ r"device total memory\s*([\d.]+)\s*GiB" + ) + + for match in mem_pattern.finditer(content): + timestamp = match.group(1) + peak_mem = float(match.group(2)) + avail_kv = float(match.group(3)) + total_mem = float(match.group(4)) + + memory_snapshots.append( + MemoryMetrics( + timestamp=timestamp, + dp=0, + tp=0, + ep=0, + metric_type="memory", + mem_usage_gb=peak_mem, + avail_mem_gb=total_mem - peak_mem, + kv_cache_gb=avail_kv, + ) + ) + + # Also parse KV cache allocation info + kv_alloc_pattern = re.compile( + r"\[MemUsageChange\] Allocated\s*([\d.]+)\s*GiB for max tokens.*?\((\d+)\)" + ) + + for match in kv_alloc_pattern.finditer(content): + kv_gb = float(match.group(1)) + max_tokens = int(match.group(2)) + + memory_snapshots.append( + MemoryMetrics( + timestamp="", + dp=0, + tp=0, + ep=0, + metric_type="kv_cache", + kv_cache_gb=kv_gb, + kv_tokens=max_tokens, + ) + ) + + return memory_snapshots + + def _extract_node_info_from_filename(self, filename: str) -> dict | None: + """Extract node name and worker info from filename. + + Example: worker-0_prefill_w0.out + Returns: {'node': 'worker-0', 'worker_type': 'prefill', 'worker_id': 'w0'} + """ + match = re.match( + r"(.+)_(prefill|decode|agg|frontend)_([^.]+)\.(err|out)", + os.path.basename(filename), + ) + if match: + return { + "node": match.group(1), + "worker_type": match.group(2), + "worker_id": match.group(3), + } + return None + + def parse_launch_command(self, log_content: str, worker_type: str = "unknown") -> NodeLaunchCommand | None: + """Parse the TRTLLM worker launch command from log content. 
+ + Looks for command lines like: + [CMD] python3 -m dynamo.trtllm --model-path /model --served-model-name dsr1-fp8 --disaggregation-mode prefill + python3 -m dynamo.trtllm --model-path /model --served-model-name dsr1-fp8 --disaggregation-mode prefill + + Args: + log_content: Content of the worker log file + worker_type: Type of worker (prefill, decode, agg) + + Returns: + NodeLaunchCommand with parsed parameters, or None if not found + """ + from analysis.srtlog.parsers import NodeLaunchCommand + + # Strip ANSI codes for cleaner parsing + clean_content = ANSI_ESCAPE.sub("", log_content) + + raw_command = None + + # First, try to find [CMD] tagged command (preferred - from our scripts) + cmd_match = re.search(r"\[CMD\]\s*(.+)$", clean_content, re.MULTILINE) + if cmd_match: + raw_command = cmd_match.group(1).strip() + + # Fallback: pattern to match TRTLLM launch commands (dynamo.trtllm or tensorrt_llm.serve) + if not raw_command: + patterns = [ + r"(?:Rank\d+\s+run\s+)?(python[3]?\s+-m\s+dynamo\.trtllm\s+[^\n]+)", + r"(?:Rank\d+\s+run\s+)?(python[3]?\s+-m\s+tensorrt_llm\.serve\s+[^\n]+)", + r"(trtllm-serve\s+[^\n]+)", + r"(mpirun\s+.*trtllm[^\n]+)", + ] + + for pattern in patterns: + match = re.search(pattern, clean_content) + if match: + raw_command = match.group(1).strip() + # Remove trailing "in background" if present + raw_command = re.sub(r"\s+in\s+background$", "", raw_command) + break + + if not raw_command: + return None + + extra_args: dict[str, Any] = {} + + # Parse dynamo.trtllm / tensorrt_llm server arguments from command line + arg_patterns = { + "model_path": r"--model-path[=\s]+([^\s]+)", + "served_model_name": r"--served-model-name[=\s]+([^\s]+)", + "disaggregation_mode": r"--disaggregation-mode[=\s]+([^\s]+)", + "host": r"--host[=\s]+([^\s]+)", + "port": r"--port[=\s]+(\d+)", + } + + for field, pattern in arg_patterns.items(): + match = re.search(pattern, raw_command) + if match: + value = match.group(1) + if field == "port": + value = int(value) + 
extra_args[field] = value + + # Also extract from TensorRT-LLM engine args if available (has actual parallelism values) + engine_args_match = re.search(r"TensorRT-LLM engine args:\s*\{([^}]+)", clean_content) + if engine_args_match: + engine_str = engine_args_match.group(1) + + engine_patterns = { + "tp_size": r"'tensor_parallel_size':\s*(\d+)", + "pp_size": r"'pipeline_parallel_size':\s*(\d+)", + "max_num_seqs": r"'max_batch_size':\s*(\d+)", + "max_model_len": r"'max_seq_len':\s*(\d+)", + } + + for field, pattern in engine_patterns.items(): + if field not in extra_args: + match = re.search(pattern, engine_str) + if match: + extra_args[field] = int(match.group(1)) + + # Fallback to Config() dump + if "tp_size" not in extra_args: + config_match = re.search(r"Config\((.*?)\)", clean_content) + if config_match: + config_str = config_match.group(1) + + config_patterns = { + "tp_size": r"tensor_parallel_size=(\d+)", + "pp_size": r"pipeline_parallel_size=(\d+)", + "max_num_seqs": r"max_batch_size=(\d+)", + "max_model_len": r"max_seq_len=(\d+)", + } + + for field, pattern in config_patterns.items(): + if field not in extra_args: + match = re.search(pattern, config_str) + if match: + extra_args[field] = int(match.group(1)) + + return NodeLaunchCommand( + backend_type=self.backend_type, + worker_type=worker_type, + raw_command=raw_command, + extra_args=extra_args, + ) + diff --git a/analysis/srtlog/rollup_harness.py b/analysis/srtlog/rollup_harness.py new file mode 100644 index 00000000..24b6e880 --- /dev/null +++ b/analysis/srtlog/rollup_harness.py @@ -0,0 +1,474 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +""" +Rollup harness for batch processing experiment logs. + +Recursively searches for sbatch_script.sh files and runs rollup on each job directory. 
def find_job_directories(log_dir: Path) -> list[Path]:
    """Find all job directories by searching for sbatch_script.sh files.

    Directories whose name is a plain decimal number (job IDs) come first,
    ordered numerically; all other directories follow, ordered by name.

    Args:
        log_dir: Root directory to search recursively.

    Returns:
        List of job directory paths (parent dirs of sbatch_script.sh).
    """

    def _sort_key(job_dir: Path) -> tuple[int, int, str]:
        name = job_dir.name
        # isascii() guards int(): str.isdigit() is also true for characters
        # such as superscript digits, which int() rejects.
        if name.isascii() and name.isdigit():
            return (0, int(name), "")
        # Uniform tuple keys keep numeric and named directories comparable; a
        # bare int-or-str key raises TypeError as soon as both kinds exist.
        return (1, 0, name)

    return sorted(
        (sbatch_script.parent for sbatch_script in log_dir.rglob("sbatch_script.sh")),
        key=_sort_key,
    )
def _add_launch_command_to_node(node_rollup: Any, node_parser: Any, logs_dir: Path) -> None:
    """Parse and add launch command to a node rollup.

    Candidate log files are tried in this order, stopping at the first one
    that yields a launch command:

    1. ``{node_name}_{worker_type}_{worker_id}.out``, then ``.err``
    2. ``worker-*_{worker_type}_{worker_id}.out``, then ``.err`` (any node),
       in sorted filename order

    Args:
        node_rollup: NodeRollup object to update; its ``node_name``,
            ``worker_type`` and ``worker_id`` attributes select the files.
        node_parser: Node parser with parse_launch_command method.
        logs_dir: Directory containing log files.
    """
    worker_type = node_rollup.worker_type
    worker_suffix = f"{worker_type}_{node_rollup.worker_id}"
    extensions = (".out", ".err")

    # Exact file for this node, e.g. worker-4_decode_w0.out
    for ext in extensions:
        log_file = logs_dir / f"{node_rollup.node_name}_{worker_suffix}{ext}"
        if log_file.exists() and _try_parse_launch_command(node_rollup, node_parser, log_file, worker_type):
            return

    # Wildcard node: fall back to any worker-* file with the same worker type
    # and id. Sorted so the chosen file does not depend on directory order.
    for ext in extensions:
        for log_file in sorted(logs_dir.glob(f"worker-*_{worker_suffix}{ext}")):
            if _try_parse_launch_command(node_rollup, node_parser, log_file, worker_type):
                return
+ + Args: + node_rollup: NodeRollup object to update + node_parser: Node parser with parse_launch_command method + log_file: Log file to parse + worker_type: Worker type (prefill, decode, agg) + + Returns: + True if command was found and added + """ + from srtctl.cli.mixins.rollup import LaunchCommandRollup + + try: + content = log_file.read_text(errors="replace") + cmd = node_parser.parse_launch_command(content, worker_type=worker_type) + if cmd: + args = cmd.extra_args + node_rollup.launch_command = LaunchCommandRollup( + raw_command=cmd.raw_command, + command_type="worker", + model_path=args.get("model_path"), + served_model_name=args.get("served_model_name"), + worker_type=worker_type, + backend_type=cmd.backend_type, + disaggregation_mode=args.get("disaggregation_mode"), + tp_size=args.get("tp_size"), + pp_size=args.get("pp_size"), + dp_size=args.get("dp_size"), + ep_size=args.get("ep_size"), + port=args.get("port"), + max_num_seqs=args.get("max_num_seqs"), + max_model_len=args.get("max_model_len"), + ) + logger.debug("Parsed launch command for %s from %s", node_rollup.node_name, log_file.name) + return True + except Exception as e: + logger.debug("Failed to parse launch command from %s: %s", log_file, e) + + return False + + +def run_rollup_on_job(job_dir: Path, output_dir: Path | None = None) -> dict[str, Any] | None: + """Run rollup on a single job directory. 
def run_rollup_on_job(job_dir: Path, output_dir: Path | None = None) -> dict[str, Any] | None:
    """Run rollup on a single job directory and write the resulting rollup JSON.

    Benchmark results, node metrics, launch commands and environment config are
    each collected on a best-effort basis: a failure in one of them is logged
    and the rollup is still written with whatever was gathered.

    Args:
        job_dir: Job directory containing config.yaml and logs/
        output_dir: Optional output directory; when given the file is written as
            ``<output_dir>/<job_id>_rollup.json``, otherwise as ``rollup.json``
            inside the job's logs directory

    Returns:
        Rollup summary dict, or None if the job has no logs directory or no
        loadable config
    """
    from analysis.srtlog.parsers import get_benchmark_parser, get_node_parser, list_benchmark_parsers, list_node_parsers
    from srtctl.cli.mixins.rollup import (
        EnvironmentConfig,
        LaunchCommandRollup,
        NodesSummary,
        RollupResult,
        RollupSummary,
    )

    job_id = job_dir.name
    logs_dir = get_log_dir(job_dir)

    if not logs_dir:
        logger.warning("No logs directory found in %s", job_dir)
        return None

    config = load_job_config(job_dir)
    if not config:
        logger.warning("No config.yaml found in %s", job_dir)
        return None

    # Extract config values
    backend_type = config.get("backend", {}).get("type", "unknown")
    benchmark_type = config.get("benchmark", {}).get("type", "sa-bench")
    model_name = config.get("model", {}).get("served_model_name", "unknown")

    resources = config.get("resources", {})
    # A job counts as disaggregated as soon as prefill_nodes is configured
    is_disaggregated = resources.get("prefill_nodes") is not None

    logger.info("Processing job %s: backend=%s, benchmark=%s", job_id, backend_type, benchmark_type)

    # Parse benchmark results
    results = []
    bench_parser = None
    try:
        bench_parser = get_benchmark_parser(benchmark_type)

        # Find result directories
        for entry in logs_dir.iterdir():
            if entry.is_dir() and "_isl_" in entry.name and "_osl_" in entry.name:
                results.extend(bench_parser.parse_result_directory(entry))

        # Also check for AIPerf results
        if hasattr(bench_parser, "find_aiperf_results"):
            for aiperf_path in bench_parser.find_aiperf_results(logs_dir):
                result = bench_parser.parse_result_json(aiperf_path)
                if result.get("output_tps") is not None or result.get("output_throughput") is not None:
                    results.append(result)

    except Exception as e:
        # Only a ValueError raised before a parser was obtained means "unknown
        # benchmark type". A ValueError raised later (json.JSONDecodeError is
        # one) is a parse failure and must not be reported as a missing parser.
        if bench_parser is None and isinstance(e, ValueError):
            logger.warning("No benchmark parser for %s, available: %s", benchmark_type, list_benchmark_parsers())
        else:
            logger.warning("Failed to parse benchmark results: %s", e)

    if not results:
        logger.warning("No benchmark results found in %s", logs_dir)

    # Parse node metrics
    nodes_summary = None
    node_parser = None
    try:
        node_parser = get_node_parser(backend_type)
        nodes = node_parser.parse_logs(logs_dir)
        if nodes:
            nodes_summary = NodesSummary.from_node_metrics_list(nodes)
            logger.info(
                " Found %d nodes (%d prefill, %d decode)",
                len(nodes_summary.nodes),
                nodes_summary.total_prefill_nodes,
                nodes_summary.total_decode_nodes,
            )

            # Parse launch commands for each node
            for node_rollup in nodes_summary.nodes:
                _add_launch_command_to_node(node_rollup, node_parser, logs_dir)

    except Exception as e:
        # Same distinction as above: lookup failure vs. failure while parsing
        if node_parser is None and isinstance(e, ValueError):
            logger.debug("No node parser for %s, available: %s", backend_type, list_node_parsers())
        else:
            logger.warning("Failed to parse node metrics: %s", e)

    # Parse benchmark launch command
    benchmark_command = None
    benchmark_out = logs_dir / "benchmark.out"
    if benchmark_out.exists():
        try:
            parser = get_benchmark_parser(benchmark_type)
            cmd = parser.parse_launch_command(benchmark_out.read_text(errors="replace"))
            if cmd:
                args = cmd.extra_args
                benchmark_command = LaunchCommandRollup(
                    raw_command=cmd.raw_command,
                    command_type="benchmark",
                    model_path=args.get("model"),
                    benchmark_type=cmd.benchmark_type,
                    base_url=args.get("base_url"),
                    max_concurrency=args.get("max_concurrency"),
                    input_len=args.get("input_len"),
                    output_len=args.get("output_len"),
                )
        except Exception as e:
            logger.debug("Failed to parse benchmark command: %s", e)

    # Parse environment config
    environment_config = None
    try:
        import yaml

        env_config = EnvironmentConfig()

        backend_section = config.get("backend", {})
        if "prefill_environment" in backend_section:
            env_config.prefill_environment = backend_section["prefill_environment"]
        if "decode_environment" in backend_section:
            env_config.decode_environment = backend_section["decode_environment"]
        if "aggregated_environment" in backend_section:
            env_config.aggregated_environment = backend_section["aggregated_environment"]

        # Load TRTLLM config files
        prefill_yaml = logs_dir / "trtllm_config_prefill.yaml"
        decode_yaml = logs_dir / "trtllm_config_decode.yaml"

        if prefill_yaml.exists():
            with open(prefill_yaml) as f:
                env_config.prefill_engine_config = yaml.safe_load(f)
        if decode_yaml.exists():
            with open(decode_yaml) as f:
                env_config.decode_engine_config = yaml.safe_load(f)

        # Keep the config when ANY section was populated. The aggregated
        # environment is included so aggregated-only jobs are not dropped.
        if any(
            [
                env_config.prefill_environment,
                env_config.decode_environment,
                env_config.aggregated_environment,
                env_config.prefill_engine_config,
                env_config.decode_engine_config,
            ]
        ):
            environment_config = env_config

    except Exception as e:
        logger.debug("Failed to parse environment config: %s", e)

    # Build rollup summary
    benchmark_config = config.get("benchmark", {})
    gpus_per_node = resources.get("gpus_per_node", 8)

    # Compute total nodes / GPUs
    if is_disaggregated:
        prefill_node_count = resources.get("prefill_nodes")  # not None here, see is_disaggregated
        # `decode_nodes:` left empty in YAML loads as None; count it as 0 instead of raising
        decode_node_count = resources.get("decode_nodes") or 0
        prefill_gpus = prefill_node_count * gpus_per_node
        decode_gpus = decode_node_count * gpus_per_node
        total_nodes = prefill_node_count + decode_node_count
        total_gpus = prefill_gpus + decode_gpus
    else:
        total_nodes = resources.get("agg_nodes", 1)
        total_gpus = total_nodes * gpus_per_node
        prefill_gpus = 0
        decode_gpus = 0

    summary = RollupSummary(
        job_id=job_id,
        job_name=config.get("name", "unknown"),
        generated_at=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        model_path=config.get("model", {}).get("path", ""),
        model_name=model_name,
        precision=config.get("model", {}).get("precision", "unknown"),
        gpu_type=resources.get("gpu_type", "unknown"),
        gpus_per_node=gpus_per_node,
        backend_type=backend_type,
        frontend_type=config.get("frontend", {}).get("type", "unknown"),
        is_disaggregated=is_disaggregated,
        total_nodes=total_nodes,
        total_gpus=total_gpus,
        prefill_nodes=resources.get("prefill_nodes") if is_disaggregated else None,
        decode_nodes=resources.get("decode_nodes") if is_disaggregated else None,
        prefill_workers=resources.get("prefill_workers") if is_disaggregated else None,
        decode_workers=resources.get("decode_workers") if is_disaggregated else None,
        prefill_gpus=prefill_gpus if is_disaggregated else None,
        decode_gpus=decode_gpus if is_disaggregated else None,
        agg_nodes=resources.get("agg_nodes") if not is_disaggregated else None,
        agg_workers=resources.get("agg_workers") if not is_disaggregated else None,
        benchmark_type=benchmark_type,
        isl=benchmark_config.get("isl"),
        osl=benchmark_config.get("osl"),
        concurrencies=benchmark_config.get("concurrencies", []),
        nodes_summary=nodes_summary,
        environment_config=environment_config,
        benchmark_command=benchmark_command,
        tags=config.get("tags", []),
    )

    # Convert results to RollupResult objects
    for data in results:
        summary.results.append(
            RollupResult(
                concurrency=data.get("max_concurrency", 0),
                # Result files use either key for output throughput
                output_tps=data.get("output_throughput", 0) or data.get("output_tps", 0),
                total_tps=data.get("total_token_throughput"),
                request_throughput=data.get("request_throughput"),
                mean_ttft_ms=data.get("mean_ttft_ms"),
                mean_tpot_ms=data.get("mean_tpot_ms"),
                mean_itl_ms=data.get("mean_itl_ms"),
                mean_e2el_ms=data.get("mean_e2el_ms"),
                median_ttft_ms=data.get("median_ttft_ms"),
                median_tpot_ms=data.get("median_tpot_ms"),
                median_itl_ms=data.get("median_itl_ms"),
                median_e2el_ms=data.get("median_e2el_ms"),
                p99_ttft_ms=data.get("p99_ttft_ms"),
                p99_tpot_ms=data.get("p99_tpot_ms"),
                p99_itl_ms=data.get("p99_itl_ms"),
                p99_e2el_ms=data.get("p99_e2el_ms"),
                total_input_tokens=data.get("total_input_tokens"),
                total_output_tokens=data.get("total_output_tokens"),
                duration=data.get("duration"),
                completed=data.get("completed"),
                num_prompts=data.get("num_prompts"),
            )
        )

    # Compute summary stats
    summary.compute_summary_stats()

    # Write rollup
    if output_dir:
        rollup_path = output_dir / f"{job_id}_rollup.json"
    else:
        rollup_path = logs_dir / "rollup.json"

    # Convert once; the same dict is both written to disk and returned
    summary_dict = asdict(summary)

    rollup_path.parent.mkdir(parents=True, exist_ok=True)
    with open(rollup_path, "w") as f:
        json.dump(summary_dict, f, indent=2, default=str)

    logger.info(" Wrote rollup to %s", rollup_path)
    logger.info(" Results: %d, Max TPS: %.1f", len(summary.results), summary.max_output_tps or 0)

    return summary_dict
def main():
    """Command-line entry point: find job directories and roll each one up.

    Exits with status 1 when ``--log-dir`` does not exist. With ``--dry-run``
    the matching jobs are only listed. Otherwise every job is processed
    independently; a failure in one job is logged and counted, and processing
    continues with the next job.
    """
    parser = argparse.ArgumentParser(
        description="Rollup harness for batch processing experiment logs",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Process all jobs in outputs directory
  python -m analysis.srtlog.rollup_harness --log-dir /path/to/outputs

  # Dry run - just list jobs that would be processed
  python -m analysis.srtlog.rollup_harness --log-dir /path/to/outputs --dry-run

  # Write rollups to a separate directory
  python -m analysis.srtlog.rollup_harness --log-dir /path/to/outputs --output-dir /path/to/rollups

  # Process only specific jobs
  python -m analysis.srtlog.rollup_harness --log-dir /path/to/outputs --jobs 585 586 587
        """,
    )
    parser.add_argument("--log-dir", required=True, type=Path, help="Root directory to search for jobs")
    parser.add_argument("--output-dir", type=Path, help="Output directory for rollup files (default: in-place)")
    parser.add_argument("--dry-run", action="store_true", help="List jobs without processing")
    parser.add_argument("--jobs", nargs="+", help="Only process specific job IDs")
    parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose logging")

    args = parser.parse_args()

    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    if not args.log_dir.exists():
        logger.error("Log directory does not exist: %s", args.log_dir)
        sys.exit(1)

    # Find all job directories
    job_dirs = find_job_directories(args.log_dir)
    logger.info("Found %d job directories in %s", len(job_dirs), args.log_dir)

    # Filter by job IDs if specified
    if args.jobs:
        requested = set(args.jobs)
        job_dirs = [d for d in job_dirs if d.name in requested]
        logger.info("Filtered to %d jobs: %s", len(job_dirs), [d.name for d in job_dirs])

        # Surface typos / missing jobs instead of silently processing fewer jobs
        missing = sorted(requested - {d.name for d in job_dirs})
        if missing:
            logger.warning("Requested job IDs not found: %s", missing)

    if args.dry_run:
        print("\nJob directories found:")
        for job_dir in job_dirs:
            config = load_job_config(job_dir)
            if config:
                backend = config.get("backend", {}).get("type", "?")
                benchmark = config.get("benchmark", {}).get("type", "?")
                print(f" {job_dir.name}: backend={backend}, benchmark={benchmark}")
            else:
                print(f" {job_dir.name}: (no config.yaml)")
        return

    # Process each job
    successful = 0
    failed = 0

    for job_dir in job_dirs:
        try:
            result = run_rollup_on_job(job_dir, args.output_dir)
        except Exception as e:
            # Keep going with the remaining jobs, but record the traceback so
            # the failure can actually be diagnosed from the log.
            logger.exception("Failed to process %s: %s", job_dir, e)
            failed += 1
        else:
            # run_rollup_on_job returns None when the job could not be rolled up
            if result:
                successful += 1
            else:
                failed += 1

    logger.info("Complete: %d successful, %d failed", successful, failed)


if __name__ == "__main__":
    main()
job_dirs = [d for d in job_dirs if d.name in args.jobs] + logger.info("Filtered to %d jobs: %s", len(job_dirs), [d.name for d in job_dirs]) + + if args.dry_run: + print("\nJob directories found:") + for job_dir in job_dirs: + config = load_job_config(job_dir) + if config: + backend = config.get("backend", {}).get("type", "?") + benchmark = config.get("benchmark", {}).get("type", "?") + print(f" {job_dir.name}: backend={backend}, benchmark={benchmark}") + else: + print(f" {job_dir.name}: (no config.yaml)") + return + + # Process each job + successful = 0 + failed = 0 + + for job_dir in job_dirs: + try: + result = run_rollup_on_job(job_dir, args.output_dir) + if result: + successful += 1 + else: + failed += 1 + except Exception as e: + logger.error("Failed to process %s: %s", job_dir, e) + failed += 1 + + logger.info("Complete: %d successful, %d failed", successful, failed) + + +if __name__ == "__main__": + main() + diff --git a/docs/architecture.md b/docs/architecture.md index afefa85d..e7b31d9e 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -65,7 +65,7 @@ srtctl abstracts this complexity into a simple YAML interface while providing ex v +------------------------------------------------------------------+ | ORCHESTRATION LAYER | -| SweepOrchestrator + Stage Mixins (Worker/Frontend/Bench) | +| SweepOrchestrator + Stage Mixins (Worker/Frontend/Bench/Rollup) | +------------------------------------------------------------------+ | | | v v v @@ -208,7 +208,7 @@ The main orchestration class that runs inside the SLURM job: ```python @dataclass -class SweepOrchestrator(WorkerStageMixin, FrontendStageMixin, BenchmarkStageMixin): +class SweepOrchestrator(WorkerStageMixin, FrontendStageMixin, BenchmarkStageMixin, RollupStageMixin): config: SrtConfig runtime: RuntimeContext @@ -396,6 +396,7 @@ src/srtctl/core/ | +-- WorkerStageMixin (start_worker, start_all_workers) | | +-- FrontendStageMixin (start_nginx, start_frontend) | | +-- BenchmarkStageMixin 
(run_benchmark) | +| +-- RollupStageMixin (run_rollup -> rollup.json) | | | | ProcessRegistry | ManagedProcess | Signal Handlers | | - add_process() | - name, popen | - SIGTERM/SIGINT | @@ -811,6 +812,281 @@ class ManagedProcess: --- +## Rollup Stage + +After benchmark completion, the **RollupStageMixin** consolidates all experiment data +into a single `rollup.json` file for easy analysis and comparison. + +### Architecture + +``` +┌─────────────────────────────────────────────────────────────────────────────────────┐ +│ ROLLUP STAGE ARCHITECTURE │ +└─────────────────────────────────────────────────────────────────────────────────────┘ + + ┌─────────────────┐ + │ SweepOrchestrator│ + │ run_rollup() │ + └────────┬────────┘ + │ + ┌────────────────────────┼────────────────────────┐ + │ │ │ + ▼ ▼ ▼ + ┌──────────────────────┐ ┌──────────────────────┐ ┌──────────────────────┐ + │_collect_benchmark_ │ │ _collect_node_ │ │_collect_environment_ │ + │ results() │ │ metrics() │ │ config() │ + └──────────┬───────────┘ └──────────┬───────────┘ └──────────┬───────────┘ + │ │ │ + ▼ ▼ ▼ +┌───────────────────────────┐ ┌───────────────────────────┐ ┌───────────────────────┐ +│ BENCHMARK PARSERS │ │ NODE PARSERS │ │ CONFIG FILES │ +│ analysis.srtlog.parsers │ │ analysis.srtlog.parsers │ │ │ +│ .benchmark │ │ .nodes │ │ • config.yaml │ +├───────────────────────────┤ ├───────────────────────────┤ │ • trtllm_config_*.yaml│ +│ • SABenchParser │ │ • SGLangNodeParser │ └───────────────────────┘ +│ • MooncakeRouterParser │ │ • TRTLLMNodeParser │ +└───────────────────────────┘ └───────────────────────────┘ + │ + ▼ + ┌──────────────────────────────────────────┐ + │ RollupSummary │ + ├──────────────────────────────────────────┤ + │ • job_id, model_path, backend_type │ + │ • results: RollupResult[] (TPS, latency) │ + │ • nodes_summary: NodesSummary │ + │ • environment_config: EnvironmentConfig │ + │ • max_output_tps, min_mean_ttft_ms │ + └──────────────────────┬───────────────────┘ + │ + ▼ + 
┌──────────────────┐ + │ rollup.json │ + └──────────────────┘ +``` + +### Key Dataclasses + +| Dataclass | Source File(s) | Purpose | +|-----------|----------------|---------| +| `RollupResult` | `logs/*_isl_*_osl_*/result.json` | Single benchmark result at one concurrency level (TPS, latencies) | +| `NodeRollup` | `logs/{node}_{type}_{id}.out/err` | Per-node metrics (batches, throughput, memory, KV cache) | +| `NodesSummary` | Aggregated from worker logs | Aggregated node statistics across all workers | +| `EnvironmentConfig` | `config.yaml`, `trtllm_config_*.yaml` | Environment variables and engine config for prefill/decode/agg | +| `LaunchCommandRollup` | Worker/benchmark log files | Parsed launch command parameters | +| `RollupSummary` | **Output:** `logs/rollup.json` | Complete experiment summary combining all above | + +### Parser Dataclasses + +Parsers return lightweight dataclasses with essential fields and an `extra_args` dict for parsed values: + +| Dataclass | Source | Fields | +|-----------|--------|--------| +| `BenchmarkLaunchCommand` | `logs/benchmark.out` | `benchmark_type`, `raw_command`, `extra_args` | +| `NodeLaunchCommand` | `logs/{node}_{type}_{id}.out/err` | `backend_type`, `worker_type`, `raw_command`, `extra_args` | + +The `extra_args` dict contains parsed parameters like `model`, `tp_size`, `max_concurrency`, etc. 
+ +### Entity Relationship Diagram + +The following diagram shows how data flows from log files through parsers to the final rollup output: + +``` + ┌─────────────────────────────────────────────────────────────┐ + │ LOG FILES │ + └─────────────────────────────────────────────────────────────┘ + │ + ┌─────────────────────────────────────────┼─────────────────────────────────────────┐ + │ │ │ + ▼ ▼ ▼ + ┌───────────────────┐ ┌───────────────────┐ ┌───────────────────┐ + │ worker_*.out/err │ │ benchmark.out │ │ config.yaml │ + └───────────────────┘ └───────────────────┘ └───────────────────┘ + │ │ │ + ▼ ▼ ▼ + ┌───────────────────────────┐ ┌───────────────────────────┐ ┌───────────────────────────┐ + │ NodeParserProtocol │ │ BenchmarkParserProtocol │ │ YAML Loader │ + │ (sglang.py / trtllm.py) │ │ (sa_bench.py / etc.) │ │ │ + └───────────────────────────┘ └───────────────────────────┘ └───────────────────────────┘ + │ │ │ + ┌────────┴────────┐ ┌────────┴────────┐ │ + ▼ ▼ ▼ ▼ ▼ +┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────────┐ +│ NodeMetrics │ │NodeLaunchCommand│ │ dict (result) │ │BenchmarkLaunch │ │ EnvironmentConfig │ +│ (models.py) │ │ (__init__.py) │ │ │ │ Command │ │ (rollup/models.py) │ +└─────────────────┘ └─────────────────┘ └─────────────────┘ └─────────────────┘ └─────────────────────┘ + │ │ │ │ │ + └────────────────────┼─────────────────────┼────────────────────┼──────────────────────────┘ + │ │ │ + ▼ ▼ ▼ + ┌────────────────────────────────────────────────────────────┐ + │ RollupStageMixin │ + │ (rollup_stage.py / rollup_harness.py) │ + └────────────────────────────────────────────────────────────┘ + │ + │ transforms + ▼ + ┌───────────────┐ + │ RollupSummary │ + └───────┬───────┘ + │ + ▼ + ┌───────────────┐ + │ rollup.json │ + └───────────────┘ +``` + +### Detailed Entity Relationships + +``` +┌─────────────────────────────────────────────────────────────────────────────────────────┐ +│ PARSER LAYER 
(analysis/srtlog/parsers/) │ +├─────────────────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌────────────────────┐ ┌────────────────────┐ ┌────────────────────┐ │ +│ │ NodeLaunchCommand │ │ BenchmarkLaunch │ │ NodeMetrics │ │ +│ ├────────────────────┤ │ Command │ ├────────────────────┤ │ +│ │ backend_type: str │ ├────────────────────┤ │ node_info: dict │ │ +│ │ worker_type: str │ │ benchmark_type: str│ │ config: dict │ │ +│ │ raw_command: str │ │ raw_command: str │ │ batches: list │ │ +│ │ extra_args: dict │ │ extra_args: dict │ │ memory_snapshots │ │ +│ └────────────────────┘ └────────────────────┘ └────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────────────────────┐ +│ ROLLUP LAYER (srtctl/cli/mixins/rollup/) │ +├─────────────────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │ +│ │ LaunchCommandRollup │ │ +│ ├─────────────────────────────────────────────────────────────────────────────────┤ │ +│ │ raw_command: str │ command_type: str ("worker" | "benchmark") │ │ +│ │ model_path: str | None │ served_model_name: str | None │ │ +│ │ worker_type: str | None │ backend_type: str | None │ │ +│ │ tp_size: int | None │ pp_size: int | None │ │ +│ │ dp_size: int | None │ ep_size: int | None │ │ +│ │ max_num_seqs: int | None │ max_model_len: int | None │ │ +│ └─────────────────────────────────────────────────────────────────────────────────┘ │ +│ │ 1:1 │ +│ ▼ │ +│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │ +│ │ NodeRollup │ │ +│ ├─────────────────────────────────────────────────────────────────────────────────┤ │ +│ │ node_name: str │ worker_type: str │ │ +│ │ worker_id: str │ tp_size, pp_size, dp_size, ep_size: int | None │ │ 
+│ │ launch_command: LaunchCommandRollup | None │ │ +│ │ avail_mem_gb: float | None │ kv_cache_gb: float | None │ │ +│ │ total_batches: int │ avg_gen_throughput: float | None │ │ +│ └─────────────────────────────────────────────────────────────────────────────────┘ │ +│ │ 1:N │ +│ ▼ │ +│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │ +│ │ NodesSummary │ │ +│ ├─────────────────────────────────────────────────────────────────────────────────┤ │ +│ │ nodes: list[NodeRollup] │ │ +│ │ total_prefill_nodes: int │ total_decode_nodes: int │ │ +│ │ total_agg_nodes: int │ total_kv_cache_gb: float | None │ │ +│ └─────────────────────────────────────────────────────────────────────────────────┘ │ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │ +│ │ RollupResult │ │ +│ ├─────────────────────────────────────────────────────────────────────────────────┤ │ +│ │ max_concurrency: int │ input_len: int │ output_len: int │ │ +│ │ output_tps: float │ request_tps: float │ total_tokens: int │ │ +│ │ avg_ttft_ms: float │ avg_tpot_ms: float │ p50/p90/p99 metrics │ │ +│ └─────────────────────────────────────────────────────────────────────────────────┘ │ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │ +│ │ EnvironmentConfig │ │ +│ ├─────────────────────────────────────────────────────────────────────────────────┤ │ +│ │ prefill_environment: dict │ decode_environment: dict │ │ +│ │ aggregated_environment: dict│ prefill_engine_config: dict │ │ +│ │ decode_engine_config: dict │ aggregated_engine_config: dict │ │ +│ └─────────────────────────────────────────────────────────────────────────────────┘ │ +│ │ +│ ┌──────────────────────┬──────────────────────┐ │ +│ │ │ │ │ +│ ▼ ▼ ▼ │ +│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │ +│ │ RollupSummary │ │ +│ 
├─────────────────────────────────────────────────────────────────────────────────┤ │ +│ │ job_id: str │ model_name: str │ backend_type: str │ │ +│ │ benchmark_type: str │ is_disaggregated: bool │ total_nodes: int │ │ +│ │ max_output_tps: float │ max_request_tps: float │ │ +│ │ results: list[RollupResult] │ │ +│ │ nodes_summary: NodesSummary | None │ │ +│ │ benchmark_command: LaunchCommandRollup | None │ │ +│ │ environment_config: EnvironmentConfig | None │ │ +│ │ tags: dict │ timestamp: str │ │ +│ └─────────────────────────────────────────────────────────────────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────────────────────────┘ +``` + +### Cardinality Summary + +| Parent | Child | Relationship | +|--------|-------|--------------| +| `RollupSummary` | `RollupResult` | 1:N (one per concurrency level) | +| `RollupSummary` | `NodesSummary` | 1:1 | +| `RollupSummary` | `LaunchCommandRollup` | 1:1 (benchmark command) | +| `RollupSummary` | `EnvironmentConfig` | 1:1 | +| `NodesSummary` | `NodeRollup` | 1:N (one per worker) | +| `NodeRollup` | `LaunchCommandRollup` | 1:1 (worker command) | + +### Modular Parser System + +Parsers are registered via decorators and accessed through a registry: + +```python +# Register a new parser +@register_benchmark_parser("my-bench") +class MyBenchParser: + def parse_result_json(self, path: Path) -> dict: ... + def parse_launch_command(self, log: str) -> BenchmarkLaunchCommand: ... 
+ +# Use the parser +from analysis.srtlog.parsers import get_benchmark_parser, get_node_parser + +bench_parser = get_benchmark_parser("sa-bench") +node_parser = get_node_parser("trtllm") +``` + +### Output Format + +The `rollup.json` file contains: + +```json +{ + "job_id": "12345", + "model_path": "/model/llama-70b", + "backend_type": "trtllm", + + "results": [ + {"concurrency": 16, "output_tps": 2500.0, "mean_ttft_ms": 45.2}, + {"concurrency": 32, "output_tps": 4000.0, "mean_ttft_ms": 52.1} + ], + + "nodes_summary": { + "total_prefill_nodes": 1, + "total_decode_nodes": 7, + "avg_decode_gen_throughput": 533.1, + "nodes": [{"node_name": "worker-0", "total_batches": 5531, ...}] + }, + + "environment_config": { + "prefill_environment": {"UCX_TLS": "...", "TRTLLM_ENABLE_PDL": "1"}, + "prefill_engine_config": {"tensor_parallel_size": 8, "max_batch_size": 2} + }, + + "max_output_tps": 4000.0, + "min_mean_ttft_ms": 45.2 +} +``` + +--- + ## Extension Points ### How to Add a New Backend @@ -1052,6 +1328,7 @@ src/srtctl/ | |-- worker_stage.py # Backend worker startup | |-- frontend_stage.py # Frontend/nginx startup | |-- benchmark_stage.py # Benchmark execution +| |-- rollup_stage.py # Experiment data consolidation | |-- benchmarks/ # Benchmark runners | |-- __init__.py # Registry and exports @@ -1072,6 +1349,21 @@ src/srtctl/ |-- templates/ # Jinja2 templates |-- job_script_minimal.j2 # sbatch script template |-- nginx.conf.j2 # nginx load balancer config + +analysis/srtlog/ # Log analysis and parsing +|-- __init__.py +|-- models.py # NodeMetrics, BatchMetrics, MemoryMetrics +|-- log_parser.py # Legacy NodeAnalyzer +|-- parsers/ # Modular parser system + |-- __init__.py # Parser registry and protocols + |-- benchmark/ # Benchmark result parsers + | |-- __init__.py + | |-- sa_bench.py # SA-Bench result parser + | |-- mooncake_router.py# Mooncake router parser + |-- nodes/ # Worker log parsers + |-- __init__.py + |-- sglang.py # SGLang log parser (DP/TP tags) + |-- trtllm.py 
# TRTLLM log parser (iteration logs) ``` --- @@ -1082,9 +1374,11 @@ srtctl is a well-architected orchestration framework with: - **Clean separation of concerns**: Config, runtime, backend, frontend, benchmark layers - **Strong typing**: Frozen dataclasses with marshmallow validation -- **Extensibility**: Protocol-based backends/frontends, decorator-based benchmark registration +- **Extensibility**: Protocol-based backends/frontends, decorator-based benchmark/parser registration - **Robust process management**: Registry, monitoring, graceful cleanup - **SLURM integration**: Proper container mounts, srun launching, nodelist parsing +- **Experiment consolidation**: Rollup stage aggregates results, node metrics, and configs into rollup.json +- **Modular parsing**: Pluggable parsers for different benchmark types and backend log formats - **Modern Python**: 3.10+ syntax, comprehensive type hints, clear module structure The codebase follows Python best practices and provides a solid foundation for orchestrating complex LLM inference workloads on SLURM clusters. 
diff --git a/pyproject.toml b/pyproject.toml index f2f6a6e4..21dda7ec 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,7 @@ dependencies = [ "requests>=2.31.0", "rich>=13.0.0", "questionary>=2.0.0", + "pandas>=2.3.3", ] [project.scripts] diff --git a/src/srtctl/README.md b/src/srtctl/README.md index 8b0f7988..373b69bc 100644 --- a/src/srtctl/README.md +++ b/src/srtctl/README.md @@ -11,7 +11,12 @@ srtctl/ ├── cli/ │ ├── submit.py # srtctl apply - job submission │ ├── do_sweep.py # srtctl-sweep - main orchestrator -│ └── setup_head.py # Head node infrastructure (NATS/etcd) +│ ├── setup_head.py # Head node infrastructure (NATS/etcd) +│ └── mixins/ +│ ├── worker_stage.py # Worker startup mixin +│ ├── frontend_stage.py# Frontend/NGINX startup mixin +│ ├── benchmark_stage.py# Benchmark execution mixin +│ └── rollup_stage.py # Experiment data consolidation ├── core/ │ ├── config.py # Config loading and srtslurm.yaml resolution │ ├── runtime.py # RuntimeContext - single source of truth @@ -26,7 +31,8 @@ srtctl/ │ └── get_node_ip.sh # IP detection bash functions ├── backends/ │ ├── base.py # BackendProtocol interface -│ └── sglang.py # SGLang implementation +│ ├── sglang.py # SGLang implementation +│ └── trtllm.py # TensorRT-LLM implementation ├── benchmarks/ │ ├── base.py # BenchmarkRunner ABC │ ├── sa_bench.py # Serving benchmark @@ -130,17 +136,119 @@ resources: `CUDA_VISIBLE_DEVICES` is automatically set per worker (e.g., `0,1,2,3` and `4,5,6,7`). 
+### Rollup Stage + +After benchmark completion, the rollup stage consolidates all experiment data into +a single `rollup.json` file for easy analysis: + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ ROLLUP STAGE PIPELINE │ +├─────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ INPUT FILES OUTPUT │ +│ ─────────── ────── │ +│ • benchmark results (*.json) ──┐ │ +│ • worker logs (*.out/*.err) ──┼──► rollup.json │ +│ • config.yaml ──┤ • benchmark results │ +│ • engine configs (*.yaml) ──┘ • node metrics │ +│ • environment config │ +│ • summary statistics │ +└─────────────────────────────────────────────────────────────────────────────┘ +``` + +**Data collected:** + +| Component | Source | Output | +|-----------|--------|--------| +| Benchmark Results | `sa-bench_*/results_*.json` | `RollupResult[]` with TPS, latencies | +| Node Metrics | `worker-*.out` logs | `NodesSummary` with batches, memory, throughput | +| Environment Config | `config.yaml`, `trtllm_config_*.yaml` | `EnvironmentConfig` with env vars, engine settings | +| Launch Commands | `benchmark.out`, worker logs | Parsed command parameters | + +**Modular Parser System:** + +The rollup uses pluggable parsers from `analysis.srtlog.parsers`: + +```python +# Benchmark parsers (parse result JSON files) +from analysis.srtlog.parsers import get_benchmark_parser +parser = get_benchmark_parser("sa-bench") # or "mooncake-router" +results = parser.parse_result_directory(log_dir) + +# Node parsers (parse worker log files) +from analysis.srtlog.parsers import get_node_parser +parser = get_node_parser("trtllm") # or "sglang" +nodes = parser.parse_logs(log_dir) +``` + +**Example rollup.json structure:** + +```json +{ + "job_id": "12345", + "job_name": "disagg-benchmark", + "model_path": "/model/llama-70b", + "backend_type": "trtllm", + + "results": [ + {"concurrency": 16, "output_tps": 2500.0, "mean_ttft_ms": 45.2, ...}, + {"concurrency": 32, 
"output_tps": 4000.0, "mean_ttft_ms": 52.1, ...} + ], + + "nodes_summary": { + "total_prefill_nodes": 1, + "total_decode_nodes": 7, + "avg_decode_gen_throughput": 533.1, + "total_kv_cache_gb": 325.0, + "nodes": [ + { + "node_name": "worker-0", + "worker_type": "prefill", + "total_batches": 1523, + "avg_input_throughput": 21565.5, + "mem_usage_gb": 91.46, + "kv_cache_gb": 41.19 + } + ] + }, + + "environment_config": { + "prefill_environment": {"UCX_TLS": "rc,dc,ud,...", "TRTLLM_ENABLE_PDL": "1"}, + "decode_environment": {"UCX_TLS": "rc,dc,ud,..."}, + "prefill_engine_config": {"tensor_parallel_size": 8, "max_batch_size": 2}, + "decode_engine_config": {"tensor_parallel_size": 8, "max_batch_size": 32} + }, + + "max_output_tps": 4000.0, + "min_mean_ttft_ms": 45.2 +} +``` + ## Files Overview -| File | Purpose | -| -------------------- | ---------------------------------------- | -| `core/config.py` | YAML loading, srtslurm.yaml resolution | -| `core/runtime.py` | Computed paths/values (RuntimeContext) | -| `core/topology.py` | Worker topology and GPU allocation | -| `core/processes.py` | Process lifecycle management | -| `core/slurm.py` | SLURM srun launching, node IP resolution | -| `core/health.py` | Health checks, worker readiness polling | -| `core/ip_utils/` | Bash-based IP detection utilities | -| `cli/do_sweep.py` | Main orchestrator (runs on head node) | -| `backends/sglang.py` | SGLang backend implementation | -| `benchmarks/base.py` | BenchmarkRunner ABC | +| File | Purpose | +| ---- | ------- | +| `core/config.py` | YAML loading, srtslurm.yaml resolution | +| `core/runtime.py` | Computed paths/values (RuntimeContext) | +| `core/topology.py` | Worker topology and GPU allocation | +| `core/processes.py` | Process lifecycle management | +| `core/slurm.py` | SLURM srun launching, node IP resolution | +| `core/health.py` | Health checks, worker readiness polling | +| `core/ip_utils/` | Bash-based IP detection utilities | +| `cli/do_sweep.py` | Main orchestrator 
(runs on head node) | +| `cli/mixins/rollup_stage.py` | Experiment data consolidation to rollup.json | +| `backends/sglang.py` | SGLang backend implementation | +| `backends/trtllm.py` | TensorRT-LLM backend implementation | +| `benchmarks/base.py` | BenchmarkRunner ABC | + +### Related Analysis Modules + +| File | Purpose | +| ---- | ------- | +| `analysis/srtlog/parsers/__init__.py` | Parser registry and protocols | +| `analysis/srtlog/parsers/benchmark/sa_bench.py` | SA-Bench result parser | +| `analysis/srtlog/parsers/benchmark/mooncake_router.py` | Mooncake router result parser | +| `analysis/srtlog/parsers/nodes/sglang.py` | SGLang worker log parser | +| `analysis/srtlog/parsers/nodes/trtllm.py` | TRTLLM worker log parser | +| `analysis/srtlog/models.py` | Data models (NodeMetrics, BatchMetrics, etc.) | diff --git a/src/srtctl/benchmarks/scripts/gpqa/bench.sh b/src/srtctl/benchmarks/scripts/gpqa/bench.sh index 01670aa2..a0d563d0 100644 --- a/src/srtctl/benchmarks/scripts/gpqa/bench.sh +++ b/src/srtctl/benchmarks/scripts/gpqa/bench.sh @@ -26,14 +26,9 @@ export OPENAI_API_KEY="${OPENAI_API_KEY:-EMPTY}" echo "Running GPQA evaluation..." 
-python3 -m sglang.test.run_eval \ - --base-url "${ENDPOINT}" \ - --model "${MODEL_NAME}" \ - --eval-name gpqa \ - --num-examples "${NUM_EXAMPLES}" \ - --max-tokens "${MAX_TOKENS}" \ - --repeat "${REPEAT}" \ - --num-threads "${NUM_THREADS}" +cmd=(python3 -m sglang.test.run_eval --base-url "${ENDPOINT}" --model "${MODEL_NAME}" --eval-name gpqa --num-examples "${NUM_EXAMPLES}" --max-tokens "${MAX_TOKENS}" --repeat "${REPEAT}" --num-threads "${NUM_THREADS}") # argv array: every value stays exactly one argument, no eval re-parsing +echo "[CMD] ${cmd[*]}" +"${cmd[@]}" # Copy result file result_file=$(ls -t /tmp/gpqa_*.json 2>/dev/null | head -n1) @@ -45,4 +40,3 @@ else fi echo "GPQA evaluation complete" - diff --git a/src/srtctl/benchmarks/scripts/longbenchv2/bench.sh b/src/srtctl/benchmarks/scripts/longbenchv2/bench.sh index 7a1643b8..bad38618 100644 --- a/src/srtctl/benchmarks/scripts/longbenchv2/bench.sh +++ b/src/srtctl/benchmarks/scripts/longbenchv2/bench.sh @@ -28,25 +28,19 @@ export OPENAI_API_KEY="${OPENAI_API_KEY:-EMPTY}" echo "Running LongBench-v2 evaluation..." 
# Build command -cmd="python3 -m sglang.test.run_eval \ - --base-url ${ENDPOINT} \ - --model ${MODEL_NAME} \ - --eval-name longbench_v2 \ - --max-tokens ${MAX_TOKENS} \ - --max-context-length ${MAX_CONTEXT_LENGTH} \ - --num-threads ${NUM_THREADS}" +command="python3 -m sglang.test.run_eval --base-url ${ENDPOINT} --model ${MODEL_NAME} --eval-name longbench_v2 --max-tokens ${MAX_TOKENS} --max-context-length ${MAX_CONTEXT_LENGTH} --num-threads ${NUM_THREADS}" # Add optional arguments if [ -n "$NUM_EXAMPLES" ]; then - cmd="$cmd --num-examples ${NUM_EXAMPLES}" + command="$command --num-examples ${NUM_EXAMPLES}" fi if [ -n "$CATEGORIES" ]; then - cmd="$cmd --categories ${CATEGORIES}" + command="$command --categories ${CATEGORIES}" fi -echo "Executing: $cmd" -eval "$cmd" +echo "[CMD] $command" # logged verbatim; the eval below is quoted so exactly this string is evaluated +eval "$command" # Copy result files result_file=$(ls -t /tmp/longbench_v2_*.json 2>/dev/null | head -n1) @@ -64,4 +58,3 @@ if [ -f "$html_file" ]; then fi echo "LongBench-v2 evaluation complete" - diff --git a/src/srtctl/benchmarks/scripts/mmlu/bench.sh b/src/srtctl/benchmarks/scripts/mmlu/bench.sh index aff149ce..adbecafd 100644 --- a/src/srtctl/benchmarks/scripts/mmlu/bench.sh +++ b/src/srtctl/benchmarks/scripts/mmlu/bench.sh @@ -26,14 +26,9 @@ export OPENAI_API_KEY="${OPENAI_API_KEY:-EMPTY}" echo "Running MMLU evaluation..." 
-python3 -m sglang.test.run_eval \ - --base-url "${ENDPOINT}" \ - --model "${MODEL_NAME}" \ - --eval-name mmlu \ - --num-examples "${NUM_EXAMPLES}" \ - --max-tokens "${MAX_TOKENS}" \ - --repeat "${REPEAT}" \ - --num-threads "${NUM_THREADS}" +cmd=(python3 -m sglang.test.run_eval --base-url "${ENDPOINT}" --model "${MODEL_NAME}" --eval-name mmlu --num-examples "${NUM_EXAMPLES}" --max-tokens "${MAX_TOKENS}" --repeat "${REPEAT}" --num-threads "${NUM_THREADS}") # argv array: every value stays exactly one argument, no eval re-parsing +echo "[CMD] ${cmd[*]}" +"${cmd[@]}" # Copy result file result_file=$(ls -t /tmp/mmlu_*.json 2>/dev/null | head -n1) @@ -45,4 +40,3 @@ else fi echo "MMLU evaluation complete" - diff --git a/src/srtctl/benchmarks/scripts/mooncake-router/bench.sh b/src/srtctl/benchmarks/scripts/mooncake-router/bench.sh index e84d711c..723630a6 100644 --- a/src/srtctl/benchmarks/scripts/mooncake-router/bench.sh +++ b/src/srtctl/benchmarks/scripts/mooncake-router/bench.sh @@ -57,13 +57,9 @@ fi # Run small benchmark for warmup echo "Running small benchmark for warmup..." 
-aiperf profile \ - -m "${MODEL_NAME}" \ - --url "${ENDPOINT}" \ - --streaming \ - --ui simple \ - --concurrency 10 \ - --request-count 20 +cmd=(aiperf profile -m "${MODEL_NAME}" --url "${ENDPOINT}" --streaming --ui simple --concurrency 10 --request-count 20) # argv array: every value stays exactly one argument, no eval re-parsing +echo "[CMD] ${cmd[*]}" +"${cmd[@]}" echo "Small benchmark for warmup complete" # Setup artifact directory with model and timestamp @@ -80,17 +76,9 @@ echo "" echo "$(date '+%Y-%m-%d %H:%M:%S') - Starting benchmark" # Run aiperf profile exactly as dynamo does -aiperf profile \ - -m "${MODEL_NAME}" \ - --input-file "${INPUT_FILE}" \ - --custom-dataset-type mooncake_trace \ - --fixed-schedule \ - --url "${ENDPOINT}" \ - --streaming \ - --random-seed 42 \ - --ui simple \ - --artifact-dir "${RUN_ARTIFACT_DIR}" \ - --goodput "time_to_first_token:${TTFT_THRESHOLD} inter_token_latency:${ITL_THRESHOLD}" +cmd=(aiperf profile -m "${MODEL_NAME}" --input-file "${INPUT_FILE}" --custom-dataset-type mooncake_trace --fixed-schedule --url "${ENDPOINT}" --streaming --random-seed 42 --ui simple --artifact-dir "${RUN_ARTIFACT_DIR}" --goodput "time_to_first_token:${TTFT_THRESHOLD} inter_token_latency:${ITL_THRESHOLD}") # the --goodput spec is passed as one argument +echo "[CMD] ${cmd[*]}" +"${cmd[@]}" BENCH_EXIT_CODE=$? diff --git a/src/srtctl/benchmarks/scripts/profiling/profile.sh b/src/srtctl/benchmarks/scripts/profiling/profile.sh index 6b426c34..b971b883 100644 --- a/src/srtctl/benchmarks/scripts/profiling/profile.sh +++ b/src/srtctl/benchmarks/scripts/profiling/profile.sh @@ -130,27 +130,19 @@ done if [[ "${PROFILING_MODE}" == "prefill" ]]; then echo "" echo "Generating profiling traffic..." 
- python3 -m sglang.bench_serving \ - --backend sglang \ - --model "${model_name}" \ - --host "${head_node}" --port "${head_port}" \ - --dataset-name random \ - --max-concurrency "${PROFILE_CONCURRENCY}" \ - --num-prompts 128 \ - --random-input-len "${PROFILE_ISL}" \ - --random-output-len "${PROFILE_OSL}" \ - --random-range-ratio 1 \ - --warmup-request 0 + + cmd=(python3 -m sglang.bench_serving --backend sglang --model "${model_name}" --host "${head_node}" --port "${head_port}" --dataset-name random --max-concurrency "${PROFILE_CONCURRENCY}" --num-prompts 128 --random-input-len "${PROFILE_ISL}" --random-output-len "${PROFILE_OSL}" --random-range-ratio 1 --warmup-request 0) # argv array: every value stays exactly one argument, no eval re-parsing + echo "[CMD] ${cmd[*]}" + "${cmd[@]}" # Run lm-eval for additional profiling coverage echo "" echo "Running lm-eval..." pip install lm-eval tenacity > /dev/null 2>&1 - python -m lm_eval \ - --model local-completions \ - --tasks gsm8k \ - --model_args "base_url=http://${head_node}:${head_port}/v1/completions,model=${model_name},tokenized_requests=False,tokenizer_backend=None,num_concurrent=${PROFILE_CONCURRENCY},timeout=6000,max_retries=1" \ - --limit 10 + + cmd=(python -m lm_eval --model local-completions --tasks gsm8k --model_args "base_url=http://${head_node}:${head_port}/v1/completions,model=${model_name},tokenized_requests=False,tokenizer_backend=None,num_concurrent=${PROFILE_CONCURRENCY},timeout=6000,max_retries=1" --limit 10) # --model_args is passed as one argument + echo "[CMD] ${cmd[*]}" + "${cmd[@]}" fi exit_code=$? @@ -164,4 +156,3 @@ if [[ -n "${SGLANG_TORCH_PROFILER_DIR}" ]]; then fi exit ${exit_code} - diff --git a/src/srtctl/benchmarks/scripts/router/bench.sh b/src/srtctl/benchmarks/scripts/router/bench.sh index d559b85d..d51ed881 100644 --- a/src/srtctl/benchmarks/scripts/router/bench.sh +++ b/src/srtctl/benchmarks/scripts/router/bench.sh @@ -40,13 +40,8 @@ echo "Running prefix ratio benchmark..." 
echo "Results will be saved to: $result_dir" # shellcheck disable=SC2086 -python prefix_ratio_benchmark.py \ - --prefix-ratios $PREFIX_RATIOS \ - --isl "$ISL" \ - --osl "$OSL" \ - --requests "$REQUESTS" \ - --concurrency "$CONCURRENCY" \ - --output-dir "$result_dir" +cmd=(python prefix_ratio_benchmark.py --prefix-ratios $PREFIX_RATIOS --isl "$ISL" --osl "$OSL" --requests "$REQUESTS" --concurrency "$CONCURRENCY" --output-dir "$result_dir") # PREFIX_RATIOS is left unquoted on purpose so it splits into several values +echo "[CMD] ${cmd[*]}" +"${cmd[@]}" echo "Router benchmark complete. Results in $result_dir" - diff --git a/src/srtctl/benchmarks/scripts/sa-bench/bench.sh b/src/srtctl/benchmarks/scripts/sa-bench/bench.sh index 99dd4022..8462556b 100644 --- a/src/srtctl/benchmarks/scripts/sa-bench/bench.sh +++ b/src/srtctl/benchmarks/scripts/sa-bench/bench.sh @@ -50,20 +50,11 @@ for concurrency in "${CONCURRENCY_LIST[@]}"; do echo "Warming up with concurrency $concurrency" echo "$(date '+%Y-%m-%d %H:%M:%S')" num_prompts=$((concurrency * 2)) - python3 -u "${WORK_DIR}/benchmark_serving.py" \ - --model "${MODEL_NAME}" --tokenizer "${MODEL_PATH}" \ - --host "$HOST" --port "$PORT" \ - --backend "dynamo" --endpoint /v1/completions \ - --disable-tqdm \ - --dataset-name random \ - --num-prompts "$num_prompts" \ - --random-input-len "$ISL" \ - --random-output-len "$OSL" \ - --random-range-ratio 0.8 \ - --ignore-eos \ - --request-rate 250 \ - --percentile-metrics ttft,tpot,itl,e2el \ - --max-concurrency "$concurrency" + + cmd=(python3 -u "${WORK_DIR}/benchmark_serving.py" --model "${MODEL_NAME}" --tokenizer "${MODEL_PATH}" --host "$HOST" --port "$PORT" --backend dynamo --endpoint /v1/completions --disable-tqdm --dataset-name random --num-prompts "$num_prompts" --random-input-len "$ISL" --random-output-len "$OSL" --random-range-ratio 0.8 --ignore-eos --request-rate 250 --percentile-metrics ttft,tpot,itl,e2el --max-concurrency "$concurrency") # argv array: paths and names with spaces stay one argument + + echo "[CMD] ${cmd[*]}" + "${cmd[@]}" done # Benchmark @@ -83,22 +74,10 @@ for concurrency in "${CONCURRENCY_LIST[@]}"; do echo 
"Running benchmark with concurrency: $concurrency" echo "$(date '+%Y-%m-%d %H:%M:%S')" - python3 -u "${WORK_DIR}/benchmark_serving.py" \ - --model "${MODEL_NAME}" --tokenizer "${MODEL_PATH}" \ - --host "$HOST" --port "$PORT" \ - --backend "dynamo" --endpoint /v1/completions \ - --disable-tqdm \ - --dataset-name random \ - --num-prompts "$num_prompts" \ - --random-input-len "$ISL" \ - --random-output-len "$OSL" \ - --random-range-ratio 0.8 \ - --ignore-eos \ - --request-rate "${REQ_RATE}" \ - --percentile-metrics ttft,tpot,itl,e2el \ - --max-concurrency "$concurrency" \ - --use-chat-template \ - --save-result --result-dir "$result_dir" --result-filename "$result_filename" + cmd=(python3 -u "${WORK_DIR}/benchmark_serving.py" --model "${MODEL_NAME}" --tokenizer "${MODEL_PATH}" --host "$HOST" --port "$PORT" --backend dynamo --endpoint /v1/completions --disable-tqdm --dataset-name random --num-prompts "$num_prompts" --random-input-len "$ISL" --random-output-len "$OSL" --random-range-ratio 0.8 --ignore-eos --request-rate "${REQ_RATE}" --percentile-metrics ttft,tpot,itl,e2el --max-concurrency "$concurrency" --use-chat-template --save-result --result-dir "$result_dir" --result-filename "$result_filename") # argv array: paths and names with spaces stay one argument + + echo "[CMD] ${cmd[*]}" + "${cmd[@]}" echo "$(date '+%Y-%m-%d %H:%M:%S')" echo "Completed benchmark with concurrency: $concurrency" @@ -106,4 +85,3 @@ for concurrency in "${CONCURRENCY_LIST[@]}"; do done echo "SA-Bench complete. 
Results in $result_dir" - diff --git a/src/srtctl/cli/do_sweep.py b/src/srtctl/cli/do_sweep.py index 9cb3e577..8dbe6624 100644 --- a/src/srtctl/cli/do_sweep.py +++ b/src/srtctl/cli/do_sweep.py @@ -21,7 +21,7 @@ from dataclasses import dataclass from pathlib import Path -from srtctl.cli.mixins import BenchmarkStageMixin, FrontendStageMixin, WorkerStageMixin +from srtctl.cli.mixins import BenchmarkStageMixin, FrontendStageMixin, RollupStageMixin, WorkerStageMixin from srtctl.core.config import load_config from srtctl.core.health import wait_for_port from srtctl.core.processes import ( @@ -40,7 +40,7 @@ @dataclass -class SweepOrchestrator(WorkerStageMixin, FrontendStageMixin, BenchmarkStageMixin): +class SweepOrchestrator(WorkerStageMixin, FrontendStageMixin, BenchmarkStageMixin, RollupStageMixin): """Main orchestrator for benchmark sweeps. Usage: @@ -208,6 +208,11 @@ def run(self) -> int: exit_code = self.run_benchmark(registry, stop_event) + # Run rollup to consolidate experiment data + if exit_code == 0: + tags = self.config.tags if hasattr(self.config, "tags") else [] + self.run_rollup(tags=tags) + except Exception as e: logger.exception("Error during sweep: %s", e) exit_code = 1 diff --git a/src/srtctl/cli/mixins/__init__.py b/src/srtctl/cli/mixins/__init__.py index c2149014..feb1bae9 100644 --- a/src/srtctl/cli/mixins/__init__.py +++ b/src/srtctl/cli/mixins/__init__.py @@ -8,14 +8,17 @@ - WorkerStageMixin: Backend worker process startup - FrontendStageMixin: Frontend/nginx orchestration - BenchmarkStageMixin: Benchmark execution +- RollupStageMixin: Experiment data consolidation """ from srtctl.cli.mixins.benchmark_stage import BenchmarkStageMixin from srtctl.cli.mixins.frontend_stage import FrontendStageMixin +from srtctl.cli.mixins.rollup_stage import RollupStageMixin from srtctl.cli.mixins.worker_stage import WorkerStageMixin __all__ = [ "WorkerStageMixin", "FrontendStageMixin", "BenchmarkStageMixin", + "RollupStageMixin", ] diff --git 
a/src/srtctl/cli/mixins/rollup/__init__.py b/src/srtctl/cli/mixins/rollup/__init__.py new file mode 100644 index 00000000..9444b5b5 --- /dev/null +++ b/src/srtctl/cli/mixins/rollup/__init__.py @@ -0,0 +1,32 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +""" +Rollup dataclasses for experiment data consolidation. + +This module provides dataclasses for: +- RollupResult: Single benchmark result at one concurrency level +- RollupSummary: Complete experiment summary +- NodeRollup: Single worker node metrics +- NodesSummary: Summary of all worker nodes +- EnvironmentConfig: Environment variables and engine config +- LaunchCommandRollup: Parsed launch command information +""" + +from srtctl.cli.mixins.rollup.models import ( + EnvironmentConfig, + LaunchCommandRollup, + NodeRollup, + NodesSummary, + RollupResult, + RollupSummary, +) + +__all__ = [ + "RollupResult", + "RollupSummary", + "NodeRollup", + "NodesSummary", + "EnvironmentConfig", + "LaunchCommandRollup", +] diff --git a/src/srtctl/cli/mixins/rollup/models.py b/src/srtctl/cli/mixins/rollup/models.py new file mode 100644 index 00000000..c83bc101 --- /dev/null +++ b/src/srtctl/cli/mixins/rollup/models.py @@ -0,0 +1,566 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +""" +Rollup dataclasses for experiment data consolidation. + +These models represent the structure of rollup.json output. +""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import TYPE_CHECKING, Any + +if TYPE_CHECKING: + from analysis.srtlog.models import NodeMetrics + + +@dataclass +class LaunchCommandRollup: + """Parsed launch command information for a worker or benchmark. 
+ + Source (worker): logs/{node}_{worker_type}_{worker_id}.out or .err + Source (benchmark): logs/benchmark.out + """ + + raw_command: str + command_type: str # "worker" or "benchmark" + + # Common fields + model_path: str | None = None + served_model_name: str | None = None + + # Worker-specific fields + worker_type: str | None = None # prefill, decode, agg + backend_type: str | None = None + disaggregation_mode: str | None = None + tp_size: int | None = None + pp_size: int | None = None + dp_size: int | None = None + ep_size: int | None = None + port: int | None = None + max_num_seqs: int | None = None + max_model_len: int | None = None + + # Benchmark-specific fields + benchmark_type: str | None = None + base_url: str | None = None + max_concurrency: int | None = None + num_prompts: int | None = None + input_len: int | None = None + output_len: int | None = None + + +@dataclass +class NodeRollup: + """Summary of metrics for a single worker node. + + Source: logs/{node}_{worker_type}_{worker_id}.out and .err + + Derived from analysis.srtlog.models.NodeMetrics with aggregated statistics. 
+ """ + + node_name: str + worker_type: str # "prefill", "decode", or "agg" + worker_id: str + + # Configuration (from NodeMetrics.config) + tp_size: int | None = None + pp_size: int | None = None + dp_size: int | None = None + ep_size: int | None = None + + # Launch command (parsed from log) + launch_command: LaunchCommandRollup | None = None + + # Memory metrics (from NodeMetrics.memory_snapshots) + avail_mem_gb: float | None = None + mem_usage_gb: float | None = None + kv_cache_gb: float | None = None + kv_tokens: int | None = None + + # Batch statistics (aggregated from NodeMetrics.batches) + total_batches: int = 0 + total_prefill_batches: int = 0 + total_decode_batches: int = 0 + + # Prefill-specific stats (also used by agg workers) + total_new_tokens: int | None = None + total_cached_tokens: int | None = None + cache_hit_rate: float | None = None # Percentage + avg_input_throughput: float | None = None # tokens/s + max_input_throughput: float | None = None # tokens/s + + # Decode-specific stats (also used by agg workers) + avg_running_requests: float | None = None + max_running_requests: int | None = None + avg_gen_throughput: float | None = None # tokens/s + max_gen_throughput: float | None = None # tokens/s + + # Queue stats + max_queue_requests: int | None = None + max_inflight_requests: int | None = None + max_transfer_requests: int | None = None + + @property + def is_agg(self) -> bool: + """Check if this is an aggregated worker.""" + return self.worker_type == "agg" + + @classmethod + def from_node_metrics(cls, node: NodeMetrics) -> NodeRollup: + """Create NodeRollup from analysis.srtlog.models.NodeMetrics. 
+ + Args: + node: NodeMetrics object from NodeAnalyzer + + Returns: + NodeRollup with aggregated statistics + """ + worker_type = node.node_info.get("worker_type", "unknown") + + rollup = cls( + node_name=node.node_info.get("node", "unknown"), + worker_type=worker_type, + worker_id=node.node_info.get("worker_id", ""), + tp_size=node.config.get("tp_size"), + pp_size=node.config.get("pp_size"), + dp_size=node.config.get("dp_size"), + ep_size=node.config.get("ep_size"), + total_batches=len(node.batches), + ) + + # Extract memory metrics - aggregate best values from all snapshots + if node.memory_snapshots: + # Find best values across all snapshots (some may be partial) + for mem in node.memory_snapshots: + if mem.avail_mem_gb is not None and rollup.avail_mem_gb is None: + rollup.avail_mem_gb = mem.avail_mem_gb + if mem.mem_usage_gb is not None and rollup.mem_usage_gb is None: + rollup.mem_usage_gb = mem.mem_usage_gb + if mem.kv_cache_gb is not None: + # Take the max kv_cache seen (or sum for multiple allocations) + if rollup.kv_cache_gb is None: + rollup.kv_cache_gb = mem.kv_cache_gb + else: + rollup.kv_cache_gb = max(rollup.kv_cache_gb, mem.kv_cache_gb) + if mem.kv_tokens is not None: + # Take the max kv_tokens + if rollup.kv_tokens is None: + rollup.kv_tokens = mem.kv_tokens + else: + rollup.kv_tokens = max(rollup.kv_tokens, mem.kv_tokens) + + # Aggregate batch metrics based on worker type + if node.batches: + # Check if we have mixed batch types (e.g., TRTLLM decode workers have both) + batch_types = {b.batch_type for b in node.batches} + has_mixed = "prefill" in batch_types and "decode" in batch_types + + if worker_type == "agg" or has_mixed: + # Agg workers or workers with mixed batches need full aggregation + rollup._aggregate_agg_batches(node.batches) + elif node.is_prefill: + rollup._aggregate_prefill_batches(node.batches) + elif node.is_decode: + rollup._aggregate_decode_batches(node.batches) + + return rollup + + def _aggregate_prefill_batches(self, batches: 
list) -> None: + """Aggregate prefill batch metrics.""" + self.total_prefill_batches = len(batches) + + new_tokens = [] + cached_tokens = [] + input_throughputs = [] + queue_reqs = [] + inflight_reqs = [] + + for batch in batches: + if batch.new_token is not None: + new_tokens.append(batch.new_token) + if batch.cached_token is not None: + cached_tokens.append(batch.cached_token) + if batch.input_throughput is not None: + input_throughputs.append(batch.input_throughput) + if batch.queue_req is not None: + queue_reqs.append(batch.queue_req) + if batch.inflight_req is not None: + inflight_reqs.append(batch.inflight_req) + + if new_tokens: + self.total_new_tokens = sum(new_tokens) + if cached_tokens: + self.total_cached_tokens = sum(cached_tokens) + + # Compute cache hit rate + if self.total_new_tokens is not None and self.total_cached_tokens is not None: + total = self.total_new_tokens + self.total_cached_tokens + if total > 0: + self.cache_hit_rate = (self.total_cached_tokens / total) * 100 + + if input_throughputs: + self.avg_input_throughput = sum(input_throughputs) / len(input_throughputs) + self.max_input_throughput = max(input_throughputs) + + if queue_reqs: + self.max_queue_requests = max(queue_reqs) + if inflight_reqs: + self.max_inflight_requests = max(inflight_reqs) + + def _aggregate_decode_batches(self, batches: list) -> None: + """Aggregate decode batch metrics.""" + self.total_decode_batches = len(batches) + + running_reqs = [] + gen_throughputs = [] + queue_reqs = [] + transfer_reqs = [] + + for batch in batches: + if batch.running_req is not None: + running_reqs.append(batch.running_req) + if batch.gen_throughput is not None: + gen_throughputs.append(batch.gen_throughput) + if batch.queue_req is not None: + queue_reqs.append(batch.queue_req) + if batch.transfer_req is not None: + transfer_reqs.append(batch.transfer_req) + + if running_reqs: + self.avg_running_requests = sum(running_reqs) / len(running_reqs) + self.max_running_requests = 
max(running_reqs) + + if gen_throughputs: + self.avg_gen_throughput = sum(gen_throughputs) / len(gen_throughputs) + self.max_gen_throughput = max(gen_throughputs) + + if queue_reqs: + self.max_queue_requests = max(queue_reqs) + if transfer_reqs: + self.max_transfer_requests = max(transfer_reqs) + + def _aggregate_agg_batches(self, batches: list) -> None: + """Aggregate metrics for agg workers (handles both prefill and decode batches).""" + # Separate prefill and decode batches + prefill_batches = [b for b in batches if b.batch_type == "prefill"] + decode_batches = [b for b in batches if b.batch_type == "decode"] + + self.total_prefill_batches = len(prefill_batches) + self.total_decode_batches = len(decode_batches) + + # Aggregate prefill metrics + if prefill_batches: + new_tokens = [] + cached_tokens = [] + input_throughputs = [] + inflight_reqs = [] + + for batch in prefill_batches: + if batch.new_token is not None: + new_tokens.append(batch.new_token) + if batch.cached_token is not None: + cached_tokens.append(batch.cached_token) + if batch.input_throughput is not None: + input_throughputs.append(batch.input_throughput) + if batch.inflight_req is not None: + inflight_reqs.append(batch.inflight_req) + + if new_tokens: + self.total_new_tokens = sum(new_tokens) + if cached_tokens: + self.total_cached_tokens = sum(cached_tokens) + + # Compute cache hit rate + if self.total_new_tokens is not None and self.total_cached_tokens is not None: + total = self.total_new_tokens + self.total_cached_tokens + if total > 0: + self.cache_hit_rate = (self.total_cached_tokens / total) * 100 + + if input_throughputs: + self.avg_input_throughput = sum(input_throughputs) / len(input_throughputs) + self.max_input_throughput = max(input_throughputs) + + if inflight_reqs: + self.max_inflight_requests = max(inflight_reqs) + + # Aggregate decode metrics + if decode_batches: + running_reqs = [] + gen_throughputs = [] + queue_reqs = [] + transfer_reqs = [] + + for batch in decode_batches: + if 
batch.running_req is not None: + running_reqs.append(batch.running_req) + if batch.gen_throughput is not None: + gen_throughputs.append(batch.gen_throughput) + if batch.queue_req is not None: + queue_reqs.append(batch.queue_req) + if batch.transfer_req is not None: + transfer_reqs.append(batch.transfer_req) + + if running_reqs: + self.avg_running_requests = sum(running_reqs) / len(running_reqs) + self.max_running_requests = max(running_reqs) + + if gen_throughputs: + self.avg_gen_throughput = sum(gen_throughputs) / len(gen_throughputs) + self.max_gen_throughput = max(gen_throughputs) + + if queue_reqs: + self.max_queue_requests = max(queue_reqs) + if transfer_reqs: + self.max_transfer_requests = max(transfer_reqs) + + +@dataclass +class NodesSummary: + """Summary of all worker nodes in the experiment. + + Source: Aggregated from logs/{node}_{worker_type}_{worker_id}.out and .err files + """ + + # Counts + total_prefill_nodes: int = 0 + total_decode_nodes: int = 0 + total_agg_nodes: int = 0 + + # Aggregated prefill stats (from prefill + agg nodes) + total_prefill_tokens: int | None = None + total_cached_tokens: int | None = None + overall_cache_hit_rate: float | None = None # Percentage + avg_prefill_input_throughput: float | None = None # tokens/s per node + max_prefill_input_throughput: float | None = None # tokens/s peak + + # Aggregated decode stats (from decode + agg nodes) + avg_decode_gen_throughput: float | None = None # tokens/s per node + max_decode_gen_throughput: float | None = None # tokens/s peak + + # Memory summary + total_kv_cache_gb: float | None = None + + # Per-node details + nodes: list[NodeRollup] = field(default_factory=list) + + @classmethod + def from_node_metrics_list(cls, nodes: list[NodeMetrics]) -> NodesSummary: + """Create NodesSummary from a list of NodeMetrics. 
+ + Args: + nodes: List of NodeMetrics from NodeAnalyzer.parse_run_logs() + + Returns: + NodesSummary with aggregated statistics + """ + summary = cls() + + # Convert each NodeMetrics to NodeRollup + for node in nodes: + rollup = NodeRollup.from_node_metrics(node) + summary.nodes.append(rollup) + + worker_type = node.node_info.get("worker_type", "unknown") + if worker_type == "agg": + summary.total_agg_nodes += 1 + elif node.is_prefill: + summary.total_prefill_nodes += 1 + elif node.is_decode: + summary.total_decode_nodes += 1 + + # Aggregate across all nodes + summary._compute_aggregate_stats() + + return summary + + def _compute_aggregate_stats(self) -> None: + """Compute aggregate statistics across all nodes.""" + # Prefill aggregation (includes both prefill and agg nodes) + prefill_capable_nodes = [n for n in self.nodes if n.worker_type in ("prefill", "agg")] + if prefill_capable_nodes: + total_new = sum(n.total_new_tokens or 0 for n in prefill_capable_nodes) + total_cached = sum(n.total_cached_tokens or 0 for n in prefill_capable_nodes) + + if total_new > 0 or total_cached > 0: + self.total_prefill_tokens = total_new + self.total_cached_tokens = total_cached + total = total_new + total_cached + if total > 0: + self.overall_cache_hit_rate = (total_cached / total) * 100 + + throughputs = [n.avg_input_throughput for n in prefill_capable_nodes if n.avg_input_throughput] + if throughputs: + self.avg_prefill_input_throughput = sum(throughputs) / len(throughputs) + + max_throughputs = [n.max_input_throughput for n in prefill_capable_nodes if n.max_input_throughput] + if max_throughputs: + self.max_prefill_input_throughput = max(max_throughputs) + + # Decode aggregation (includes both decode and agg nodes) + decode_capable_nodes = [n for n in self.nodes if n.worker_type in ("decode", "agg")] + if decode_capable_nodes: + throughputs = [n.avg_gen_throughput for n in decode_capable_nodes if n.avg_gen_throughput] + if throughputs: + self.avg_decode_gen_throughput = 
sum(throughputs) / len(throughputs) + + max_throughputs = [n.max_gen_throughput for n in decode_capable_nodes if n.max_gen_throughput] + if max_throughputs: + self.max_decode_gen_throughput = max(max_throughputs) + + # Memory aggregation + kv_caches = [n.kv_cache_gb for n in self.nodes if n.kv_cache_gb] + if kv_caches: + self.total_kv_cache_gb = sum(kv_caches) + + +@dataclass +class RollupResult: + """Consolidated benchmark result for a single concurrency level. + + Source: logs/*_isl_*_osl_*/result.json or benchmark.out + """ + + concurrency: int + output_tps: float + total_tps: float | None = None + request_throughput: float | None = None + request_goodput: float | None = None + request_rate: float | str | None = None + + # Mean latencies + mean_ttft_ms: float | None = None + mean_tpot_ms: float | None = None + mean_itl_ms: float | None = None + mean_e2el_ms: float | None = None + + # Median latencies + median_ttft_ms: float | None = None + median_tpot_ms: float | None = None + median_itl_ms: float | None = None + median_e2el_ms: float | None = None + + # P99 latencies + p99_ttft_ms: float | None = None + p99_tpot_ms: float | None = None + p99_itl_ms: float | None = None + p99_e2el_ms: float | None = None + + # Token counts + total_input_tokens: int | None = None + total_output_tokens: int | None = None + + # Run metadata + duration: float | None = None + completed: int | None = None + num_prompts: int | None = None + + +@dataclass +class EnvironmentConfig: + """Environment variables and engine configuration for prefill/decode/agg workers. 
+ + Source: config.yaml (backend.{prefill,decode,aggregated}_environment) + Source: logs/trtllm_config_{prefill,decode,agg}.yaml (engine config) + """ + + # Environment variables from config.yaml + prefill_environment: dict[str, str] = field(default_factory=dict) + decode_environment: dict[str, str] = field(default_factory=dict) + aggregated_environment: dict[str, str] = field(default_factory=dict) + + # Engine config from YAML files (TRTLLM) or parsed from logs + prefill_engine_config: dict[str, Any] = field(default_factory=dict) + decode_engine_config: dict[str, Any] = field(default_factory=dict) + aggregated_engine_config: dict[str, Any] = field(default_factory=dict) + + +@dataclass +class RollupSummary: + """Complete rollup summary for an experiment. + + Output: logs/rollup.json + + Aggregates data from: + - config.yaml (experiment configuration) + - logs/benchmark.out (benchmark command and results) + - logs/*_isl_*_osl_*/result.json (benchmark results) + - logs/{node}_{worker_type}_{worker_id}.out/err (node metrics and commands) + - logs/trtllm_config_*.yaml (engine configuration) + """ + + # Experiment identification + job_id: str + job_name: str + generated_at: str + + # Configuration + model_path: str + model_name: str + precision: str + gpu_type: str + gpus_per_node: int + backend_type: str + frontend_type: str + + # Resource allocation + is_disaggregated: bool + total_nodes: int + total_gpus: int + prefill_nodes: int | None = None + decode_nodes: int | None = None + prefill_workers: int | None = None + decode_workers: int | None = None + prefill_gpus: int | None = None + decode_gpus: int | None = None + agg_nodes: int | None = None + agg_workers: int | None = None + + # Benchmark configuration + benchmark_type: str = "" + isl: int | None = None + osl: int | None = None + concurrencies: list[int] = field(default_factory=list) + + # Aggregated results + results: list[RollupResult] = field(default_factory=list) + + # Summary statistics (computed from 
results) + max_output_tps: float | None = None + max_total_tps: float | None = None + min_mean_ttft_ms: float | None = None + min_mean_itl_ms: float | None = None + + # Node-level metrics + nodes_summary: NodesSummary | None = None + + # Environment and engine configuration + environment_config: EnvironmentConfig | None = None + + # Launch commands + benchmark_command: LaunchCommandRollup | None = None + + # Tags + tags: list[str] = field(default_factory=list) + + def compute_summary_stats(self) -> None: + """Compute summary statistics from results.""" + if not self.results: + return + + output_tps_values = [r.output_tps for r in self.results if r.output_tps is not None] + total_tps_values = [r.total_tps for r in self.results if r.total_tps is not None] + ttft_values = [r.mean_ttft_ms for r in self.results if r.mean_ttft_ms is not None] + itl_values = [r.mean_itl_ms for r in self.results if r.mean_itl_ms is not None] + + if output_tps_values: + self.max_output_tps = max(output_tps_values) + if total_tps_values: + self.max_total_tps = max(total_tps_values) + if ttft_values: + self.min_mean_ttft_ms = min(ttft_values) + if itl_values: + self.min_mean_itl_ms = min(itl_values) + diff --git a/src/srtctl/cli/mixins/rollup_stage.py b/src/srtctl/cli/mixins/rollup_stage.py new file mode 100644 index 00000000..325b960d --- /dev/null +++ b/src/srtctl/cli/mixins/rollup_stage.py @@ -0,0 +1,612 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +""" +Rollup stage mixin for SweepOrchestrator. + +Aggregates experiment data from multiple benchmark runs into a single consolidated summary. +Includes node-level metrics parsed from prefill/decode .out and .err files using analysis.srtlog. 
class RollupStageMixin:
    """Mixin for rollup stage that consolidates experiment data.

    The public entry point is :meth:`run_rollup`; every other method is a
    private collection/formatting step it drives.

    Requires:
        self.config: SrtConfig
        self.runtime: RuntimeContext
        self.endpoints: list[Endpoint]
    """

    # Type hints for mixin dependencies
    config: SrtConfig
    runtime: RuntimeContext

    # Worker roles that have their own environment / engine-config slot on
    # EnvironmentConfig (``<role>_environment`` and ``<role>_engine_config``).
    _CONFIG_ROLES = ("prefill", "decode", "aggregated")

    # Benchmark-result keys copied verbatim onto the same-named RollupResult field.
    _PASSTHROUGH_RESULT_KEYS = (
        "request_throughput",
        "request_goodput",
        "request_rate",
        # Mean latencies
        "mean_ttft_ms",
        "mean_tpot_ms",
        "mean_itl_ms",
        "mean_e2el_ms",
        # Median latencies
        "median_ttft_ms",
        "median_tpot_ms",
        "median_itl_ms",
        "median_e2el_ms",
        # P99 latencies
        "p99_ttft_ms",
        "p99_tpot_ms",
        "p99_itl_ms",
        "p99_e2el_ms",
        # Token counts
        "total_input_tokens",
        "total_output_tokens",
        # Metadata
        "duration",
        "completed",
        "num_prompts",
    )

    @property
    def endpoints(self) -> list[Endpoint]:
        """Endpoint allocation topology."""
        ...

    def run_rollup(self, tags: list[str] | None = None) -> Path | None:
        """Run the rollup stage to consolidate experiment data.

        Args:
            tags: Optional list of tags for the experiment

        Returns:
            Path to the generated rollup.json file, or None if rollup failed
            or no benchmark results were found
        """
        logger.info("Running rollup stage")

        try:
            # Collect benchmark results
            results = self._collect_benchmark_results()

            if not results:
                logger.warning("No benchmark results found to rollup")
                return None

            # Collect node metrics using analysis.srtlog
            nodes_summary = self._collect_node_metrics()

            # Collect benchmark launch command
            benchmark_command = self._collect_benchmark_command()

            # Collect environment and engine configuration
            environment_config = self._collect_environment_config()

            # Build rollup summary
            summary = self._build_rollup_summary(results, tags, nodes_summary, benchmark_command, environment_config)

            # Write rollup.json
            rollup_path = self.runtime.log_dir / "rollup.json"
            self._write_rollup(summary, rollup_path)

            logger.info("Rollup complete: %s", rollup_path)
            logger.info(
                "Summary: %d results, max output TPS: %.2f, %d nodes",
                len(summary.results),
                summary.max_output_tps or 0,
                len(nodes_summary.nodes) if nodes_summary else 0,
            )

            return rollup_path

        except Exception as e:
            # Rollup is best-effort: never let it take the caller down, but
            # keep the traceback so the failure can actually be diagnosed.
            logger.exception("Rollup failed: %s", e)
            return None

    def _benchmark_result_dirs(self) -> list[Path]:
        """Return benchmark result directories found directly under the log dir.

        A directory qualifies when its name contains both ``_isl_`` and
        ``_osl_`` (e.g. ``sa-bench_isl_X_osl_Y``). The list is sorted by path
        so collection order does not depend on filesystem enumeration order.
        """
        return sorted(
            entry
            for entry in self.runtime.log_dir.iterdir()
            if entry.is_dir() and "_isl_" in entry.name and "_osl_" in entry.name
        )

    def _collect_benchmark_results(self) -> list[dict[str, Any]]:
        """Collect all benchmark result JSON files from the log directory.

        Uses the appropriate benchmark parser based on config.benchmark.type.
        When the parser package is unavailable, no parser is registered for
        the type, or the parser yields nothing, falls back to loading every
        ``*.json`` file in the result directories directly.

        Returns:
            List of parsed benchmark result dicts, sorted by ``max_concurrency``
        """
        results = []
        benchmark_type = self.config.benchmark.type

        try:
            from analysis.srtlog.parsers import get_benchmark_parser, list_benchmark_parsers

            # Get the appropriate parser
            try:
                parser = get_benchmark_parser(benchmark_type)
                logger.debug("Using %s benchmark parser", benchmark_type)
            except ValueError:
                logger.warning(
                    "No parser for benchmark type '%s', available: %s. Using fallback.",
                    benchmark_type,
                    list_benchmark_parsers(),
                )
                parser = None

            # Try parser-specific result collection first
            if parser is not None:
                # For mooncake-router, look for AIPerf results
                if hasattr(parser, "find_aiperf_results"):
                    for aiperf_path in parser.find_aiperf_results(self.runtime.log_dir):
                        result = parser.parse_result_json(aiperf_path)
                        if result.get("output_tps") is not None:
                            results.append(result)
                            logger.debug("Loaded AIPerf result: %s", aiperf_path)

                # For sa-bench style, look for result directories
                if hasattr(parser, "parse_result_directory"):
                    for entry in self._benchmark_result_dirs():
                        logger.debug("Found benchmark results directory: %s", entry.name)
                        results.extend(parser.parse_result_directory(entry))

        except ImportError:
            logger.debug("analysis.srtlog.parsers not available, using fallback")

        # Fallback: direct JSON parsing
        if not results:
            for entry in self._benchmark_result_dirs():
                logger.debug("Found benchmark results directory: %s", entry.name)

                # Parse all JSON files in the directory
                for json_file in sorted(entry.glob("*.json")):
                    try:
                        with open(json_file, encoding="utf-8") as f:
                            data = json.load(f)
                    except Exception as e:
                        logger.warning("Failed to parse %s: %s", json_file, e)
                        continue

                    # Everything downstream calls .get() on each result; a JSON
                    # list or scalar would abort the whole rollup, so skip it.
                    if not isinstance(data, dict):
                        logger.warning(
                            "Skipping %s: expected a JSON object, got %s",
                            json_file,
                            type(data).__name__,
                        )
                        continue

                    results.append(data)
                    logger.debug("Loaded result: %s", json_file.name)

        # Sort by concurrency (missing or null concurrency sorts first)
        results.sort(key=lambda x: x.get("max_concurrency", 0) or 0)

        logger.info("Collected %d benchmark results", len(results))
        return results

    def _collect_node_metrics(self) -> NodesSummary | None:
        """Collect node metrics from prefill/decode log files.

        Uses the appropriate node parser based on config.backend_type, trying
        each entry of :meth:`_get_parser_order` until one yields batches or a
        node config.

        Returns:
            NodesSummary with aggregated node statistics, or None if parsing fails
        """
        backend_type = self.config.backend_type
        log_dir = self.runtime.log_dir

        try:
            from analysis.srtlog.parsers import get_node_parser

            # Try parsers in order of preference
            parser_order = self._get_parser_order(backend_type)
            logger.debug("Parser order for %s: %s", backend_type, parser_order)

            nodes = []
            used_parser = None
            parser = None

            for parser_type in parser_order:
                try:
                    parser = get_node_parser(parser_type)
                    nodes = parser.parse_logs(log_dir)

                    # Check if we got meaningful results (batches or config)
                    total_batches = sum(len(n.batches) for n in nodes)
                    has_config = any(n.config for n in nodes)
                    if total_batches > 0 or has_config:
                        used_parser = parser_type
                        logger.info(
                            "Using %s parser: found %d nodes with %d batches",
                            parser_type,
                            len(nodes),
                            total_batches,
                        )
                        break
                    else:
                        logger.debug("%s parser found no batches, trying next", parser_type)

                except ValueError:
                    logger.debug("Parser %s not available", parser_type)
                    continue

            if not nodes:
                logger.warning("No node metrics found in %s with any parser", log_dir)
                return None

            # Build summary from parsed nodes
            summary = NodesSummary.from_node_metrics_list(nodes)

            # Parse launch commands for each node
            if parser is not None and hasattr(parser, "parse_launch_command"):
                self._add_launch_commands_to_summary(summary, parser, log_dir)

            if summary.total_agg_nodes > 0:
                logger.info("Node summary (%s): %d agg nodes", used_parser, summary.total_agg_nodes)
            else:
                logger.info(
                    "Node summary (%s): %d prefill, %d decode nodes",
                    used_parser,
                    summary.total_prefill_nodes,
                    summary.total_decode_nodes,
                )

            return summary

        except ImportError:
            logger.warning("analysis.srtlog.parsers not available, skipping node metrics")
            return None
        except Exception as e:
            # Node metrics are optional; the rollup proceeds without them.
            logger.warning("Failed to collect node metrics: %s", e)
            return None

    def _add_launch_commands_to_summary(self, summary: NodesSummary, parser: Any, log_dir: Path) -> None:
        """Parse and add launch commands to each node in the summary.

        For every node the ``.out`` log is tried first, then ``.err``; the
        first file that yields a command wins.

        Args:
            summary: NodesSummary to update
            parser: Node parser with parse_launch_command method
            log_dir: Directory containing log files
        """
        for node_rollup in summary.nodes:
            # Find the log file for this node
            node_name = node_rollup.node_name
            worker_type = node_rollup.worker_type
            worker_id = node_rollup.worker_id

            # Try both .out and .err files
            for ext in (".out", ".err"):
                log_file = log_dir / f"{node_name}_{worker_type}_{worker_id}{ext}"
                if not log_file.exists():
                    continue
                try:
                    content = log_file.read_text(errors="replace")
                    cmd = parser.parse_launch_command(content, worker_type=worker_type)
                    if cmd:
                        args = cmd.extra_args
                        node_rollup.launch_command = LaunchCommandRollup(
                            raw_command=cmd.raw_command,
                            command_type="worker",
                            model_path=args.get("model_path"),
                            served_model_name=args.get("served_model_name"),
                            worker_type=worker_type,
                            backend_type=cmd.backend_type,
                            disaggregation_mode=args.get("disaggregation_mode"),
                            tp_size=args.get("tp_size"),
                            pp_size=args.get("pp_size"),
                            dp_size=args.get("dp_size"),
                            ep_size=args.get("ep_size"),
                            port=args.get("port"),
                            max_num_seqs=args.get("max_num_seqs"),
                            max_model_len=args.get("max_model_len"),
                        )
                        logger.debug("Parsed launch command for %s_%s_%s", node_name, worker_type, worker_id)
                        break
                except Exception as e:
                    logger.debug("Failed to parse launch command from %s: %s", log_file, e)

    def _collect_benchmark_command(self) -> LaunchCommandRollup | None:
        """Parse the benchmark launch command from benchmark.out.

        Returns:
            LaunchCommandRollup with benchmark parameters, or None if not found
        """
        benchmark_type = self.config.benchmark.type
        log_dir = self.runtime.log_dir

        try:
            from analysis.srtlog.parsers import get_benchmark_parser

            parser = get_benchmark_parser(benchmark_type)

            # Look for benchmark.out file
            benchmark_out = log_dir / "benchmark.out"
            if not benchmark_out.exists():
                logger.debug("benchmark.out not found in %s", log_dir)
                return None

            content = benchmark_out.read_text(errors="replace")
            cmd = parser.parse_launch_command(content)

            if cmd:
                args = cmd.extra_args
                return LaunchCommandRollup(
                    raw_command=cmd.raw_command,
                    command_type="benchmark",
                    model_path=args.get("model"),
                    benchmark_type=cmd.benchmark_type,
                    base_url=args.get("base_url"),
                    max_concurrency=args.get("max_concurrency"),
                    num_prompts=args.get("num_prompts"),
                    input_len=args.get("input_len"),
                    output_len=args.get("output_len"),
                )

        except ImportError:
            logger.debug("analysis.srtlog.parsers not available")
        except ValueError as e:
            logger.debug("No benchmark parser for %s: %s", benchmark_type, e)
        except Exception as e:
            logger.debug("Failed to parse benchmark command: %s", e)

        return None

    @classmethod
    def _apply_backend_section(cls, config: Any, backend_section: Any) -> list[str]:
        """Copy environment and engine settings from a ``backend`` mapping onto ``config``.

        Empty or non-mapping values (e.g. a bare ``prefill_environment:`` key,
        which YAML loads as None) are stored as ``{}`` so the dict-typed
        fields never end up holding None. ``sglang_config`` is applied after
        ``trtllm_config`` and therefore wins when both define a role.

        Args:
            config: Object exposing ``<role>_environment`` and
                ``<role>_engine_config`` attributes (an EnvironmentConfig)
            backend_section: Value of the ``backend`` key of config.yaml;
                anything that is not a dict is ignored

        Returns:
            Names of the ``*_environment`` keys present in the section, in
            prefill / decode / aggregated order
        """
        if not isinstance(backend_section, dict):
            return []

        def as_mapping(value: Any) -> dict[str, Any]:
            return value if isinstance(value, dict) else {}

        found = []
        for role in cls._CONFIG_ROLES:
            env_key = f"{role}_environment"
            if env_key in backend_section:
                setattr(config, env_key, as_mapping(backend_section[env_key]))
                found.append(env_key)

        # Inline engine configuration: TRTLLM first, then SGLang.
        for engine_key in ("trtllm_config", "sglang_config"):
            engine_section = as_mapping(backend_section.get(engine_key))
            for role in cls._CONFIG_ROLES:
                if role in engine_section:
                    setattr(config, f"{role}_engine_config", as_mapping(engine_section[role]))

        return found

    def _collect_environment_config(self) -> EnvironmentConfig | None:
        """Collect environment variables and engine config from config files.

        Parses:
        1. config.yaml (searched in the log dir and its two nearest ancestors)
           for ``backend.{prefill,decode,aggregated}_environment`` and inline
           ``trtllm_config`` / ``sglang_config`` engine settings
        2. ``trtllm_config_{prefill,decode,agg}.yaml`` in the log dir, used
           only for roles whose engine config is still empty after step 1

        Returns:
            EnvironmentConfig with environment variables and engine config, or None
            if PyYAML is unavailable or nothing was found
        """
        log_dir = self.runtime.log_dir

        try:
            import yaml
        except ImportError:
            logger.debug("PyYAML not available, skipping environment config collection")
            return None

        config = EnvironmentConfig()

        # Try to find config.yaml in the job output directory
        # It could be in log_dir, log_dir.parent, or a sibling directory
        config_paths = [
            log_dir / "config.yaml",
            log_dir.parent / "config.yaml",
            log_dir.parent.parent / "config.yaml",
        ]
        config_yaml = next((path for path in config_paths if path.exists()), None)

        if config_yaml:
            try:
                with open(config_yaml, encoding="utf-8") as f:
                    job_config = yaml.safe_load(f)

                # An empty file loads as None and an empty ``backend:`` key as
                # None; _apply_backend_section treats both as "nothing here".
                backend_section = job_config.get("backend") if isinstance(job_config, dict) else None
                for env_key in self._apply_backend_section(config, backend_section):
                    logger.debug("Found %s with %d vars", env_key, len(getattr(config, env_key)))

            except Exception as e:
                logger.debug("Failed to parse config.yaml: %s", e)

        # Also look for separate YAML config files (e.g., trtllm_config_prefill.yaml)
        engine_files = (
            ("prefill", log_dir / "trtllm_config_prefill.yaml"),
            ("decode", log_dir / "trtllm_config_decode.yaml"),
            ("aggregated", log_dir / "trtllm_config_agg.yaml"),
        )
        for role, yaml_path in engine_files:
            attr = f"{role}_engine_config"
            # Inline config from config.yaml takes precedence over the file.
            if getattr(config, attr) or not yaml_path.exists():
                continue
            try:
                with open(yaml_path, encoding="utf-8") as f:
                    loaded = yaml.safe_load(f)
            except Exception as e:
                logger.debug("Failed to parse %s: %s", yaml_path, e)
                continue

            if isinstance(loaded, dict):
                setattr(config, attr, loaded)
                logger.debug("Loaded %s engine config from %s", role, yaml_path)
            else:
                logger.debug("Ignoring %s: expected a YAML mapping", yaml_path)

        # Return None if we didn't find anything
        if not any(
            [
                config.prefill_environment,
                config.decode_environment,
                config.aggregated_environment,
                config.prefill_engine_config,
                config.decode_engine_config,
                config.aggregated_engine_config,
            ]
        ):
            logger.debug("No environment or engine config found")
            return None

        # Log what we found
        env_counts = []
        if config.prefill_environment:
            env_counts.append(f"{len(config.prefill_environment)} prefill")
        if config.decode_environment:
            env_counts.append(f"{len(config.decode_environment)} decode")
        if config.aggregated_environment:
            env_counts.append(f"{len(config.aggregated_environment)} agg")

        if env_counts:
            logger.info("Collected environment vars: %s", ", ".join(env_counts))

        return config

    def _get_parser_order(self, backend_type: str) -> list[str]:
        """Get the order of parsers to try for a given backend type.

        Args:
            backend_type: Backend type from config (e.g., "sglang", "trtllm")

        Returns:
            List of parser types to try in order; unknown backends map to a
            single-entry list containing the backend type itself
        """
        parser_orders = {
            "sglang": ["sglang"],
            "trtllm": ["trtllm"],
        }

        return parser_orders.get(backend_type, [backend_type])

    @classmethod
    def _result_fields(cls, data: dict[str, Any]) -> dict[str, Any]:
        """Map one benchmark result dict onto RollupResult keyword arguments.

        ``output_throughput`` is the primary source for ``output_tps``. When
        it is missing or null, the ``output_tps`` key is used instead; that is
        the key ``_collect_benchmark_results`` filters parser output on.
        NOTE(review): confirm against the benchmark parsers that ``output_tps``
        is the only normalised key needing this fallback.

        Args:
            data: Parsed benchmark result

        Returns:
            Dict suitable for ``RollupResult(**fields)``
        """
        output_tps = data.get("output_throughput", 0)
        if data.get("output_throughput") is None and data.get("output_tps") is not None:
            output_tps = data["output_tps"]

        fields = {
            "concurrency": data.get("max_concurrency", 0),
            "output_tps": output_tps,
            "total_tps": data.get("total_token_throughput"),
        }
        for key in cls._PASSTHROUGH_RESULT_KEYS:
            fields[key] = data.get(key)
        return fields

    def _build_rollup_summary(
        self,
        results: list[dict[str, Any]],
        tags: list[str] | None = None,
        nodes_summary: NodesSummary | None = None,
        benchmark_command: LaunchCommandRollup | None = None,
        environment_config: EnvironmentConfig | None = None,
    ) -> RollupSummary:
        """Build a RollupSummary from collected results.

        Args:
            results: List of parsed benchmark result dicts
            tags: Optional tags for the experiment
            nodes_summary: Optional node-level metrics summary
            benchmark_command: Optional parsed benchmark launch command
            environment_config: Optional environment and engine configuration

        Returns:
            RollupSummary instance with summary statistics already computed
        """
        r = self.config.resources
        b = self.config.benchmark

        # Determine topology
        is_disaggregated = r.is_disaggregated

        if is_disaggregated:
            total_gpus = r.prefill_gpus + r.decode_gpus
        else:
            # An unset agg_nodes is treated as a single node.
            total_gpus = (r.agg_nodes or 1) * r.gpus_per_node

        # Build summary
        summary = RollupSummary(
            # Identification
            job_id=self.runtime.job_id,
            job_name=self.config.name,
            generated_at=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            # Model config
            model_path=str(self.runtime.model_path),
            model_name=self.config.served_model_name,
            precision=self.config.model.precision,
            gpu_type=r.gpu_type,
            gpus_per_node=r.gpus_per_node,
            backend_type=self.config.backend_type,
            frontend_type=self.config.frontend.type,
            # Resource allocation
            is_disaggregated=is_disaggregated,
            total_nodes=r.total_nodes,
            total_gpus=total_gpus,
            # Benchmark config
            benchmark_type=b.type,
            isl=b.isl,
            osl=b.osl,
            concurrencies=b.get_concurrency_list(),
            # Node metrics
            nodes_summary=nodes_summary,
            # Environment and engine configuration
            environment_config=environment_config,
            # Launch commands
            benchmark_command=benchmark_command,
            # Tags
            tags=tags or [],
        )

        # Add disaggregated-specific fields
        if is_disaggregated:
            summary.prefill_nodes = r.prefill_nodes
            summary.decode_nodes = r.decode_nodes
            summary.prefill_workers = r.num_prefill
            summary.decode_workers = r.num_decode
            summary.prefill_gpus = r.prefill_gpus
            summary.decode_gpus = r.decode_gpus
        else:
            summary.agg_nodes = r.agg_nodes
            summary.agg_workers = r.num_agg

        # Convert results to RollupResult objects
        for data in results:
            summary.results.append(RollupResult(**self._result_fields(data)))

        # Compute summary statistics
        summary.compute_summary_stats()

        return summary

    def _write_rollup(self, summary: RollupSummary, path: Path) -> None:
        """Write rollup summary to JSON file.

        Args:
            summary: RollupSummary to write
            path: Output file path
        """
        # Convert to dict, handling nested dataclasses
        data = asdict(summary)

        # Write with nice formatting. default=str stringifies any value json
        # cannot encode natively (e.g. Path objects) instead of raising.
        with open(path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, default=str)

        logger.debug("Wrote rollup to %s", path)
def test_config_yaml_copied_to_output_dir(self, tmp_path, monkeypatch):
    """The submitted config file must land at outputs/{job_id}/config.yaml unchanged."""
    from unittest.mock import MagicMock, patch

    from srtctl.cli.submit import submit_with_orchestrator
    from srtctl.core.schema import (
        ModelConfig,
        ResourceConfig,
        SrtConfig,
    )

    model_cfg = ModelConfig(path="/model", container="/container.sqsh", precision="fp8")
    resource_cfg = ResourceConfig(gpu_type="h100", gpus_per_node=8, agg_nodes=1)
    config = SrtConfig(name="test-config-copy", model=model_cfg, resources=resource_cfg)

    # Source config deliberately uses a name other than config.yaml so the
    # assertion below proves the copy is renamed, not merely moved.
    config_content = "name: test-config-copy\nmodel:\n path: /model"
    config_file = tmp_path / "my_config.yaml"
    config_file.write_text(config_content)

    # Point srtctl_root at the temp dir; every other setting keeps its default.
    def fake_setting(key, default=None):
        return str(tmp_path) if key == "srtctl_root" else default

    monkeypatch.setattr("srtctl.cli.submit.get_srtslurm_setting", fake_setting)

    # Pretend sbatch accepted the job and printed its ID.
    sbatch_result = MagicMock()
    sbatch_result.stdout = "Submitted batch job 99999"
    sbatch_result.returncode = 0

    with patch("subprocess.run", return_value=sbatch_result):
        submit_with_orchestrator(config_path=config_file, config=config, dry_run=False)

    # Verify config.yaml was copied
    copied_config = tmp_path / "outputs" / "99999" / "config.yaml"
    assert copied_config.exists(), "config.yaml should be copied to output dir"
    assert copied_config.read_text() == config_content, "config.yaml content should match original"
def test_metadata_json_created_in_output_dir(self, tmp_path, monkeypatch):
    """A disaggregated submission must write outputs/{job_id}/{job_id}.json with full metadata."""
    import json
    from unittest.mock import MagicMock, patch

    from srtctl.cli.submit import submit_with_orchestrator
    from srtctl.core.schema import (
        ModelConfig,
        ResourceConfig,
        SrtConfig,
    )

    resource_cfg = ResourceConfig(
        gpu_type="h100",
        gpus_per_node=8,
        prefill_nodes=1,
        decode_nodes=2,
        prefill_workers=1,
        decode_workers=4,
    )
    config = SrtConfig(
        name="test-metadata",
        model=ModelConfig(path="/model", container="/container.sqsh", precision="fp8"),
        resources=resource_cfg,
    )

    config_file = tmp_path / "config.yaml"
    config_file.write_text("name: test")

    # Point srtctl_root at the temp dir; every other setting keeps its default.
    def fake_setting(key, default=None):
        return str(tmp_path) if key == "srtctl_root" else default

    monkeypatch.setattr("srtctl.cli.submit.get_srtslurm_setting", fake_setting)

    # Pretend sbatch accepted the job and printed its ID.
    sbatch_result = MagicMock()
    sbatch_result.stdout = "Submitted batch job 77777"
    sbatch_result.returncode = 0

    with patch("subprocess.run", return_value=sbatch_result):
        submit_with_orchestrator(config_path=config_file, config=config, dry_run=False)

    # Verify {job_id}.json was created
    metadata_file = tmp_path / "outputs" / "77777" / "77777.json"
    assert metadata_file.exists(), "{job_id}.json should be created in output dir"

    # Verify metadata content
    metadata = json.loads(metadata_file.read_text())
    assert metadata["version"] == "2.0"
    assert metadata["orchestrator"] is True
    assert metadata["job_id"] == "77777"
    assert metadata["job_name"] == "test-metadata"

    model_meta = metadata["model"]
    assert model_meta["path"] == "/model"
    assert model_meta["container"] == "/container.sqsh"
    assert model_meta["precision"] == "fp8"

    resource_meta = metadata["resources"]
    assert resource_meta["gpu_type"] == "h100"
    assert resource_meta["prefill_nodes"] == 1
    assert resource_meta["decode_nodes"] == 2
    assert resource_meta["prefill_workers"] == 1
    assert resource_meta["decode_workers"] == 4
def test_complete_output_directory_structure(self, tmp_path, monkeypatch):
    """One submission must produce config.yaml, sbatch_script.sh and {job_id}.json together."""
    import json
    from unittest.mock import MagicMock, patch

    from srtctl.cli.submit import submit_with_orchestrator
    from srtctl.core.schema import (
        ModelConfig,
        ResourceConfig,
        SrtConfig,
    )

    config = SrtConfig(
        name="test-complete-structure",
        model=ModelConfig(path="/model", container="/container.sqsh", precision="fp4"),
        resources=ResourceConfig(gpu_type="gb200", gpus_per_node=4, agg_nodes=2, agg_workers=2),
        setup_script="my-setup.sh",
    )

    config_file = tmp_path / "config.yaml"
    config_file.write_text("name: test-complete-structure")

    # Point srtctl_root at the temp dir; every other setting keeps its default.
    def fake_setting(key, default=None):
        return str(tmp_path) if key == "srtctl_root" else default

    monkeypatch.setattr("srtctl.cli.submit.get_srtslurm_setting", fake_setting)

    # Pretend sbatch accepted the job and printed its ID.
    sbatch_result = MagicMock()
    sbatch_result.stdout = "Submitted batch job 55555"
    sbatch_result.returncode = 0

    with patch("subprocess.run", return_value=sbatch_result):
        submit_with_orchestrator(
            config_path=config_file,
            config=config,
            dry_run=False,
            tags=["production"],
        )

    output_dir = tmp_path / "outputs" / "55555"

    # Verify all expected files exist
    for file_name in ("config.yaml", "sbatch_script.sh", "55555.json"):
        expected_file = output_dir / file_name
        assert expected_file.exists(), f"{expected_file.name} should exist in output dir"

    # Verify metadata includes setup_script
    metadata = json.loads((output_dir / "55555.json").read_text())
    assert metadata["setup_script"] == "my-setup.sh"
    assert metadata["tags"] == ["production"]
    assert metadata["resources"]["agg_workers"] == 2
def test_minimal_node_rollup(self):
    """A NodeRollup built from only its identifying fields falls back to defaults elsewhere."""
    identity = {
        "node_name": "node-01",
        "worker_type": "prefill",
        "worker_id": "w0",
    }
    node = NodeRollup(**identity)

    # Identifying fields round-trip unchanged.
    assert node.node_name == "node-01"
    assert node.worker_type == "prefill"
    assert node.worker_id == "w0"

    # Everything not supplied keeps its default.
    assert node.total_batches == 0
    assert node.tp_size is None
def test_from_node_metrics(self):
    """NodeRollup.from_node_metrics aggregates prefill batches and memory snapshots."""
    from analysis.srtlog.models import BatchMetrics, MemoryMetrics, NodeMetrics

    # Every record in this test belongs to rank (dp=0, tp=0, ep=0).
    rank = {"dp": 0, "tp": 0, "ep": 0}

    first_batch = BatchMetrics(
        timestamp="2025-01-22 10:00:00",
        batch_type="prefill",
        new_token=1000,
        cached_token=200,
        input_throughput=5000.0,
        queue_req=2,
        inflight_req=5,
        **rank,
    )
    second_batch = BatchMetrics(
        timestamp="2025-01-22 10:00:01",
        batch_type="prefill",
        new_token=1500,
        cached_token=300,
        input_throughput=6000.0,
        queue_req=3,
        inflight_req=8,
        **rank,
    )
    memory_snapshot = MemoryMetrics(
        timestamp="2025-01-22 10:00:00",
        metric_type="memory",
        avail_mem_gb=75.0,
        mem_usage_gb=107.0,
        kv_cache_gb=17.16,
        kv_tokens=524288,
        **rank,
    )

    # Create a mock NodeMetrics with prefill batches
    node_metrics = NodeMetrics(
        node_info={"node": "test-node", "worker_type": "prefill", "worker_id": "w0"},
        batches=[first_batch, second_batch],
        memory_snapshots=[memory_snapshot],
        config={"tp_size": 8, "dp_size": 1, "ep_size": 1},
    )

    rollup = NodeRollup.from_node_metrics(node_metrics)

    assert rollup.node_name == "test-node"
    assert rollup.worker_type == "prefill"
    assert rollup.tp_size == 8
    assert rollup.total_batches == 2
    assert rollup.total_new_tokens == 2500  # 1000 + 1500
    assert rollup.total_cached_tokens == 500  # 200 + 300
    assert rollup.avg_input_throughput == 5500.0  # (5000 + 6000) / 2
    assert rollup.max_input_throughput == 6000.0
    assert rollup.max_queue_requests == 3
    assert rollup.max_inflight_requests == 8
    assert rollup.kv_cache_gb == 17.16

    # Check cache hit rate: 500 / (2500 + 500) = 16.67%
    assert rollup.cache_hit_rate == pytest.approx(16.67, rel=0.01)
+ # Prefill stats + assert rollup.total_new_tokens == 2500 # 1000 + 1500 + assert rollup.total_cached_tokens == 500 # 200 + 300 + assert rollup.avg_input_throughput == 5500.0 # (5000 + 6000) / 2 + assert rollup.max_input_throughput == 6000.0 + assert rollup.max_inflight_requests == 8 + + # Decode stats + assert rollup.avg_running_requests == 55.0 # (50 + 60) / 2 + assert rollup.max_running_requests == 60 + assert rollup.avg_gen_throughput == 165.0 # (150 + 180) / 2 + assert rollup.max_gen_throughput == 180.0 + assert rollup.max_queue_requests == 5 + + +class TestNodesSummary: + """Tests for NodesSummary dataclass.""" + + def test_empty_summary(self): + """Test creating an empty NodesSummary.""" + summary = NodesSummary() + assert summary.total_prefill_nodes == 0 + assert summary.total_decode_nodes == 0 + assert summary.nodes == [] + + def test_from_node_metrics_list(self): + """Test creating NodesSummary from NodeMetrics list.""" + from analysis.srtlog.models import BatchMetrics, NodeMetrics + + nodes = [ + NodeMetrics( + node_info={"node": "node-01", "worker_type": "prefill", "worker_id": "w0"}, + batches=[ + BatchMetrics( + timestamp="2025-01-22 10:00:00", + dp=0, + tp=0, + ep=0, + batch_type="prefill", + new_token=1000, + cached_token=200, + input_throughput=5000.0, + ), + ], + config={"tp_size": 8}, + ), + NodeMetrics( + node_info={"node": "node-02", "worker_type": "decode", "worker_id": "w0"}, + batches=[ + BatchMetrics( + timestamp="2025-01-22 10:00:00", + dp=0, + tp=0, + ep=0, + batch_type="decode", + running_req=50, + gen_throughput=150.0, + ), + ], + config={"tp_size": 8}, + ), + NodeMetrics( + node_info={"node": "node-03", "worker_type": "decode", "worker_id": "w0"}, + batches=[ + BatchMetrics( + timestamp="2025-01-22 10:00:00", + dp=0, + tp=0, + ep=0, + batch_type="decode", + running_req=60, + gen_throughput=180.0, + ), + ], + config={"tp_size": 8}, + ), + ] + + summary = NodesSummary.from_node_metrics_list(nodes) + + assert summary.total_prefill_nodes 
== 1 + assert summary.total_decode_nodes == 2 + assert len(summary.nodes) == 3 + assert summary.total_prefill_tokens == 1000 + assert summary.total_cached_tokens == 200 + assert summary.avg_prefill_input_throughput == 5000.0 + assert summary.avg_decode_gen_throughput == 165.0 # (150 + 180) / 2 + assert summary.max_decode_gen_throughput == 180.0 + + def test_from_node_metrics_list_with_agg(self): + """Test creating NodesSummary from NodeMetrics list including agg workers.""" + from analysis.srtlog.models import BatchMetrics, NodeMetrics + + nodes = [ + # One agg worker + NodeMetrics( + node_info={"node": "agg-node-01", "worker_type": "agg", "worker_id": "w0"}, + batches=[ + BatchMetrics( + timestamp="2025-01-22 10:00:00", + dp=0, + tp=0, + ep=0, + batch_type="prefill", + new_token=1000, + cached_token=200, + input_throughput=5000.0, + ), + BatchMetrics( + timestamp="2025-01-22 10:00:01", + dp=0, + tp=0, + ep=0, + batch_type="decode", + running_req=50, + gen_throughput=150.0, + ), + ], + config={"tp_size": 8}, + ), + # Another agg worker + NodeMetrics( + node_info={"node": "agg-node-02", "worker_type": "agg", "worker_id": "w0"}, + batches=[ + BatchMetrics( + timestamp="2025-01-22 10:00:00", + dp=0, + tp=0, + ep=0, + batch_type="prefill", + new_token=1500, + cached_token=300, + input_throughput=6000.0, + ), + BatchMetrics( + timestamp="2025-01-22 10:00:01", + dp=0, + tp=0, + ep=0, + batch_type="decode", + running_req=60, + gen_throughput=180.0, + ), + ], + config={"tp_size": 8}, + ), + ] + + summary = NodesSummary.from_node_metrics_list(nodes) + + # Check counts + assert summary.total_prefill_nodes == 0 + assert summary.total_decode_nodes == 0 + assert summary.total_agg_nodes == 2 + assert len(summary.nodes) == 2 + + # Aggregated stats should include agg nodes + assert summary.total_prefill_tokens == 2500 # 1000 + 1500 + assert summary.total_cached_tokens == 500 # 200 + 300 + assert summary.avg_prefill_input_throughput == 5500.0 # (5000 + 6000) / 2 + assert 
summary.max_prefill_input_throughput == 6000.0 + assert summary.avg_decode_gen_throughput == 165.0 # (150 + 180) / 2 + assert summary.max_decode_gen_throughput == 180.0 + + +class TestRollupResult: + """Tests for RollupResult dataclass.""" + + def test_minimal_result(self): + """Test creating a result with minimal required fields.""" + result = RollupResult(concurrency=100, output_tps=5000.0) + assert result.concurrency == 100 + assert result.output_tps == 5000.0 + assert result.mean_ttft_ms is None + assert result.total_tps is None + + def test_full_result(self): + """Test creating a result with all fields populated.""" + result = RollupResult( + concurrency=100, + output_tps=5000.0, + total_tps=6000.0, + request_throughput=50.0, + mean_ttft_ms=150.0, + mean_tpot_ms=20.0, + mean_itl_ms=18.0, + p99_ttft_ms=300.0, + p99_itl_ms=25.0, + total_input_tokens=100000, + total_output_tokens=200000, + duration=60.0, + completed=1000, + num_prompts=1000, + ) + assert result.concurrency == 100 + assert result.output_tps == 5000.0 + assert result.total_tps == 6000.0 + assert result.mean_ttft_ms == 150.0 + assert result.p99_ttft_ms == 300.0 + + +class TestRollupSummary: + """Tests for RollupSummary dataclass.""" + + def test_compute_summary_stats_empty(self): + """Test summary stats with no results.""" + summary = RollupSummary( + job_id="12345", + job_name="test-job", + generated_at="2025-01-22 10:00:00", + model_path="/models/test", + model_name="test-model", + precision="fp8", + gpu_type="B200", + gpus_per_node=8, + backend_type="sglang", + frontend_type="sglang", + is_disaggregated=True, + total_nodes=4, + total_gpus=32, + benchmark_type="sa-bench", + isl=1024, + osl=1024, + ) + summary.compute_summary_stats() + assert summary.max_output_tps is None + assert summary.min_mean_ttft_ms is None + + def test_compute_summary_stats_with_results(self): + """Test summary stats computation from results.""" + summary = RollupSummary( + job_id="12345", + job_name="test-job", + 
generated_at="2025-01-22 10:00:00", + model_path="/models/test", + model_name="test-model", + precision="fp8", + gpu_type="B200", + gpus_per_node=8, + backend_type="sglang", + frontend_type="sglang", + is_disaggregated=True, + total_nodes=4, + total_gpus=32, + benchmark_type="sa-bench", + isl=1024, + osl=1024, + results=[ + RollupResult(concurrency=50, output_tps=3000.0, mean_ttft_ms=100.0, mean_itl_ms=20.0), + RollupResult(concurrency=100, output_tps=5000.0, mean_ttft_ms=150.0, mean_itl_ms=25.0), + RollupResult(concurrency=200, output_tps=4500.0, mean_ttft_ms=250.0, mean_itl_ms=30.0), + ], + ) + summary.compute_summary_stats() + + assert summary.max_output_tps == 5000.0 + assert summary.min_mean_ttft_ms == 100.0 + assert summary.min_mean_itl_ms == 20.0 + + +class TestRollupStageMixin: + """Tests for RollupStageMixin functionality.""" + + def test_collect_benchmark_results(self, tmp_path): + """Test collecting benchmark results from directories.""" + # Create mock benchmark result directories + bench_dir = tmp_path / "sa-bench_isl_1024_osl_1024" + bench_dir.mkdir() + + # Create mock result JSONs + for concurrency in [50, 100, 200]: + result_file = bench_dir / f"result_c{concurrency}.json" + result_file.write_text( + json.dumps( + { + "max_concurrency": concurrency, + "output_throughput": 1000.0 * concurrency / 50, + "total_token_throughput": 1200.0 * concurrency / 50, + "mean_ttft_ms": 100.0 + concurrency, + "mean_itl_ms": 15.0 + concurrency / 10, + "request_rate": f"c{concurrency}", + } + ) + ) + + # Create a mock mixin instance + class MockBenchmarkConfig: + type = "sa-bench" + + class MockConfig: + benchmark = MockBenchmarkConfig() + + class MockOrchestrator(RollupStageMixin): + def __init__(self, log_dir): + self._log_dir = log_dir + + @property + def config(self): + return MockConfig() + + @property + def runtime(self): + class MockRuntime: + log_dir = self._log_dir + + return MockRuntime() + + @property + def endpoints(self): + return [] + + orchestrator = 
MockOrchestrator(tmp_path) + results = orchestrator._collect_benchmark_results() + + assert len(results) == 3 + # Results should be sorted by concurrency + assert results[0]["max_concurrency"] == 50 + assert results[1]["max_concurrency"] == 100 + assert results[2]["max_concurrency"] == 200 + + def test_collect_benchmark_results_empty(self, tmp_path): + """Test collecting when no benchmark results exist.""" + + class MockBenchmarkConfig: + type = "sa-bench" + + class MockConfig: + benchmark = MockBenchmarkConfig() + + class MockOrchestrator(RollupStageMixin): + def __init__(self, log_dir): + self._log_dir = log_dir + + @property + def config(self): + return MockConfig() + + @property + def runtime(self): + class MockRuntime: + log_dir = self._log_dir + + return MockRuntime() + + @property + def endpoints(self): + return [] + + orchestrator = MockOrchestrator(tmp_path) + results = orchestrator._collect_benchmark_results() + + assert len(results) == 0 + + def test_write_rollup(self, tmp_path): + """Test writing rollup summary to JSON.""" + summary = RollupSummary( + job_id="12345", + job_name="test-job", + generated_at="2025-01-22 10:00:00", + model_path="/models/test", + model_name="test-model", + precision="fp8", + gpu_type="B200", + gpus_per_node=8, + backend_type="sglang", + frontend_type="sglang", + is_disaggregated=True, + total_nodes=4, + total_gpus=32, + benchmark_type="sa-bench", + isl=1024, + osl=1024, + prefill_nodes=1, + decode_nodes=3, + prefill_workers=1, + decode_workers=3, + prefill_gpus=8, + decode_gpus=24, + results=[ + RollupResult(concurrency=100, output_tps=5000.0, mean_ttft_ms=150.0), + ], + tags=["test", "example"], + ) + summary.compute_summary_stats() + + class MockOrchestrator(RollupStageMixin): + @property + def runtime(self): + return None + + @property + def endpoints(self): + return [] + + orchestrator = MockOrchestrator() + rollup_path = tmp_path / "rollup.json" + orchestrator._write_rollup(summary, rollup_path) + + # Verify the file was 
written + assert rollup_path.exists() + + # Verify the content + with open(rollup_path) as f: + data = json.load(f) + + assert data["job_id"] == "12345" + assert data["job_name"] == "test-job" + assert data["model_name"] == "test-model" + assert data["is_disaggregated"] is True + assert data["total_gpus"] == 32 + assert data["prefill_nodes"] == 1 + assert data["decode_nodes"] == 3 + assert len(data["results"]) == 1 + assert data["results"][0]["concurrency"] == 100 + assert data["results"][0]["output_tps"] == 5000.0 + assert data["max_output_tps"] == 5000.0 + assert data["tags"] == ["test", "example"] + + +class TestRollupIntegration: + """Integration tests for rollup with full mock config.""" + + def test_full_rollup_workflow(self, tmp_path): + """Test the complete rollup workflow with mocked config.""" + from dataclasses import dataclass, field + + # Create mock benchmark results + bench_dir = tmp_path / "sa-bench_isl_1024_osl_1024" + bench_dir.mkdir() + + for concurrency in [50, 100, 200]: + result_file = bench_dir / f"result_c{concurrency}.json" + result_file.write_text( + json.dumps( + { + "max_concurrency": concurrency, + "output_throughput": 1000.0 * concurrency / 50, + "total_token_throughput": 1200.0 * concurrency / 50, + "mean_ttft_ms": 100.0 + concurrency, + "mean_itl_ms": 15.0 + concurrency / 10, + "p99_ttft_ms": 200.0 + concurrency * 2, + "p99_itl_ms": 30.0 + concurrency / 5, + "duration": 60.0, + "completed": concurrency * 10, + "num_prompts": concurrency * 10, + } + ) + ) + + # Create mock orchestrator with full config + @dataclass + class MockResourceConfig: + is_disaggregated: bool = True + prefill_gpus: int = 8 + decode_gpus: int = 24 + agg_nodes: int | None = None + gpus_per_node: int = 8 + gpu_type: str = "B200" + total_nodes: int = 4 + prefill_nodes: int = 1 + decode_nodes: int = 3 + num_prefill: int = 1 + num_decode: int = 3 + num_agg: int | None = None + + @dataclass + class MockBenchmarkConfig: + type: str = "sa-bench" + isl: int = 1024 + 
osl: int = 1024 + concurrencies: str = "50x100x200" + + def get_concurrency_list(self): + return [int(c) for c in self.concurrencies.split("x")] + + @dataclass + class MockModelConfig: + precision: str = "fp8" + + @dataclass + class MockFrontendConfig: + type: str = "sglang" + + @dataclass + class MockConfig: + name: str = "test-job" + served_model_name: str = "deepseek-v3" + backend_type: str = "sglang" + resources: MockResourceConfig = field(default_factory=MockResourceConfig) + benchmark: MockBenchmarkConfig = field(default_factory=MockBenchmarkConfig) + model: MockModelConfig = field(default_factory=MockModelConfig) + frontend: MockFrontendConfig = field(default_factory=MockFrontendConfig) + + @dataclass + class MockRuntime: + job_id: str = "12345" + log_dir: Path = field(default_factory=Path) + model_path: Path = field(default_factory=lambda: Path("/models/deepseek-v3")) + + class MockOrchestrator(RollupStageMixin): + def __init__(self, config, runtime): + self._config = config + self._runtime = runtime + + @property + def config(self): + return self._config + + @property + def runtime(self): + return self._runtime + + @property + def endpoints(self): + return [] + + config = MockConfig() + runtime = MockRuntime(log_dir=tmp_path) + orchestrator = MockOrchestrator(config, runtime) + + # Run rollup + rollup_path = orchestrator.run_rollup(tags=["integration-test"]) + + # Verify + assert rollup_path is not None + assert rollup_path.exists() + + with open(rollup_path) as f: + data = json.load(f) + + # Verify summary + assert data["job_id"] == "12345" + assert data["job_name"] == "test-job" + assert data["model_name"] == "deepseek-v3" + assert data["is_disaggregated"] is True + assert data["total_gpus"] == 32 # 8 + 24 + assert data["benchmark_type"] == "sa-bench" + assert data["isl"] == 1024 + assert data["osl"] == 1024 + assert data["concurrencies"] == [50, 100, 200] + + # Verify results + assert len(data["results"]) == 3 + assert data["max_output_tps"] == 4000.0 # 
1000 * 200/50 + + # Verify tags + assert data["tags"] == ["integration-test"] + + def test_rollup_with_node_logs(self, tmp_path): + """Test rollup with actual node log files parsed by NodeAnalyzer.""" + from dataclasses import dataclass, field + + # Create mock benchmark results + bench_dir = tmp_path / "sa-bench_isl_1024_osl_1024" + bench_dir.mkdir() + + result_file = bench_dir / "result_c100.json" + result_file.write_text( + json.dumps( + { + "max_concurrency": 100, + "output_throughput": 5000.0, + "mean_ttft_ms": 150.0, + "mean_itl_ms": 20.0, + } + ) + ) + + # Create mock prefill log file (matches SGLang parser expected format) + prefill_log = tmp_path / "node-01_prefill_w0.err" + prefill_log.write_text( + """[2m2025-01-22T10:00:00.000000Z[0m [32m INFO[0m Prefill batch, #new-seq: 10, #new-token: 1024, #cached-token: 256, token usage: 0.50, #running-req: 5, #queue-req: 2, #prealloc-req: 0, #inflight-req: 3, input throughput (token/s): 5000.00, +[2m2025-01-22T10:00:01.000000Z[0m [32m INFO[0m Load weight end. type=DeepseekV3ForCausalLM, dtype=torch.bfloat16, avail mem=75.11 GB, mem usage=107.07 GB. +[2m2025-01-22T10:00:02.000000Z[0m [32m INFO[0m KV Cache is allocated. 
#tokens: 524288, KV size: 17.16 GB +""" + ) + + # Create mock decode log file + decode_log = tmp_path / "node-02_decode_w0.err" + decode_log.write_text( + """[2m2025-01-22T10:00:00.000000Z[0m [32m INFO[0m Decode batch, #running-req: 50, #token: 5000, token usage: 0.50, pre-allocated usage: 0.10, #prealloc-req: 2, #transfer-req: 1, #queue-req: 3, gen throughput (token/s): 150.00, +""" + ) + + # Mock config classes + @dataclass + class MockResourceConfig: + is_disaggregated: bool = True + prefill_gpus: int = 8 + decode_gpus: int = 8 + agg_nodes: int | None = None + gpus_per_node: int = 8 + gpu_type: str = "B200" + total_nodes: int = 2 + prefill_nodes: int = 1 + decode_nodes: int = 1 + num_prefill: int = 1 + num_decode: int = 1 + num_agg: int | None = None + + @dataclass + class MockBenchmarkConfig: + type: str = "sa-bench" + isl: int = 1024 + osl: int = 1024 + concurrencies: str = "100" + + def get_concurrency_list(self): + return [100] + + @dataclass + class MockModelConfig: + precision: str = "fp8" + + @dataclass + class MockFrontendConfig: + type: str = "sglang" + + @dataclass + class MockConfig: + name: str = "test-job" + served_model_name: str = "deepseek-v3" + backend_type: str = "sglang" + resources: MockResourceConfig = field(default_factory=MockResourceConfig) + benchmark: MockBenchmarkConfig = field(default_factory=MockBenchmarkConfig) + model: MockModelConfig = field(default_factory=MockModelConfig) + frontend: MockFrontendConfig = field(default_factory=MockFrontendConfig) + + @dataclass + class MockRuntime: + job_id: str = "12345" + log_dir: Path = field(default_factory=Path) + model_path: Path = field(default_factory=lambda: Path("/models/deepseek-v3")) + + class MockOrchestrator(RollupStageMixin): + def __init__(self, config, runtime): + self._config = config + self._runtime = runtime + + @property + def config(self): + return self._config + + @property + def runtime(self): + return self._runtime + + @property + def endpoints(self): + return [] + + 
config = MockConfig() + runtime = MockRuntime(log_dir=tmp_path) + orchestrator = MockOrchestrator(config, runtime) + + # Run rollup + rollup_path = orchestrator.run_rollup(tags=["node-test"]) + + assert rollup_path is not None + assert rollup_path.exists() + + with open(rollup_path) as f: + data = json.load(f) + + # Verify node summary is present + assert data["nodes_summary"] is not None + nodes_summary = data["nodes_summary"] + + assert nodes_summary["total_prefill_nodes"] == 1 + assert nodes_summary["total_decode_nodes"] == 1 + assert len(nodes_summary["nodes"]) == 2 + + # Find prefill and decode nodes + prefill_node = next((n for n in nodes_summary["nodes"] if n["worker_type"] == "prefill"), None) + decode_node = next((n for n in nodes_summary["nodes"] if n["worker_type"] == "decode"), None) + + assert prefill_node is not None + assert prefill_node["node_name"] == "node-01" + assert prefill_node["total_new_tokens"] == 1024 + assert prefill_node["total_cached_tokens"] == 256 + assert prefill_node["kv_cache_gb"] == 17.16 + + assert decode_node is not None + assert decode_node["node_name"] == "node-02" + assert decode_node["max_running_requests"] == 50 + assert decode_node["avg_gen_throughput"] == 150.0 +