diff --git a/.ci/scripts/cuda_benchmark.py b/.ci/scripts/cuda_benchmark.py new file mode 100644 index 00000000000..b135925d4b4 --- /dev/null +++ b/.ci/scripts/cuda_benchmark.py @@ -0,0 +1,939 @@ +""" +Benchmark script for CUDA model runners. +Runs model runner commands multiple times and collects performance metrics. +Supports whisper, voxtral, gemma3, and other CUDA models. +""" + +import argparse +import json +import statistics +import subprocess +import sys +from dataclasses import dataclass +from typing import List, Optional, Tuple + + +@dataclass +class RunMetrics: + """Metrics from a single run.""" + + generated_tokens: int + tokens_per_sec: float + model_load_time_ms: float + total_inference_time_ms: float + encoder_time_ms: float + generation_time_ms: float + first_token_latency_ms: float + + def __repr__(self): + return ( + f"Tokens: {self.generated_tokens}, " + f"Throughput: {self.tokens_per_sec:.2f} t/s, " + f"Model load: {self.model_load_time_ms:.0f}ms, " + f"Total inference: {self.total_inference_time_ms:.0f}ms, " + f"Encoder: {self.encoder_time_ms:.0f}ms, " + f"Generation: {self.generation_time_ms:.0f}ms, " + f"First token: {self.first_token_latency_ms:.0f}ms" + ) + + +def parse_pytorch_observer_log(log_line: str) -> Optional[RunMetrics]: + """Parse PyTorchObserver JSON output and compute metrics.""" + try: + # Find the JSON part in the log line + if "PyTorchObserver" not in log_line: + return None + + json_str = log_line.split("PyTorchObserver")[1].strip() + data = json.loads(json_str) + + # Extract values + generated_tokens = data.get("generated_tokens", 0) + inference_start_ms = data.get("inference_start_ms", 0) + inference_end_ms = data.get("inference_end_ms", 0) + prompt_eval_end_ms = data.get("prompt_eval_end_ms", 0) + first_token_ms = data.get("first_token_ms", 0) + model_load_start_ms = data.get("model_load_start_ms", 0) + model_load_end_ms = data.get("model_load_end_ms", 0) + + # Compute metrics + # Total inference time: from inference start to inference end + total_inference_time_ms = inference_end_ms - inference_start_ms + + # Encoder time: from inference start to prompt evaluation end + encoder_time_ms = prompt_eval_end_ms - inference_start_ms + + # Generation time: from prompt evaluation end to inference end + generation_time_ms = inference_end_ms - prompt_eval_end_ms + + # Calculate throughput based on generation time + tokens_per_sec = ( + (generated_tokens / generation_time_ms * 1000) + if generation_time_ms > 0 + else 0 + ) + model_load_time_ms = model_load_end_ms - model_load_start_ms + first_token_latency_ms = first_token_ms - prompt_eval_end_ms + + return RunMetrics( + generated_tokens=generated_tokens, + tokens_per_sec=tokens_per_sec, + model_load_time_ms=model_load_time_ms, + total_inference_time_ms=total_inference_time_ms, + encoder_time_ms=encoder_time_ms, + generation_time_ms=generation_time_ms, + first_token_latency_ms=first_token_latency_ms, + ) + except (json.JSONDecodeError, KeyError, ValueError) as e: + print(f"Error parsing PyTorchObserver log: {e}", file=sys.stderr) + return None + + +def get_gpu_clocks() -> Optional[Tuple[str, str]]: + """Get current GPU and memory clock frequencies.""" + try: + # Get GPU clock + result_gpu = subprocess.run( + [ + "nvidia-smi", + "--query-gpu=clocks.gr", + "--format=csv,noheader,nounits", + ], + capture_output=True, + text=True, + timeout=10, + ) + # Get memory clock + result_mem = subprocess.run( + [ + "nvidia-smi", + "--query-gpu=clocks.mem", + "--format=csv,noheader,nounits", + ], + capture_output=True, + 
text=True, + timeout=10, + ) + + if result_gpu.returncode == 0 and result_mem.returncode == 0: + gpu_clock = result_gpu.stdout.strip().split("\n")[0] + mem_clock = result_mem.stdout.strip().split("\n")[0] + return gpu_clock, mem_clock + except Exception as e: + print(f"Warning: Failed to get GPU clocks: {e}", file=sys.stderr) + return None + + +def set_gpu_clocks(gpu_clock: Optional[int] = None) -> bool: + """ + Set GPU clock frequency to a fixed value. + + Args: + gpu_clock: Target GPU clock frequency in MHz. + If None, will use max available. + + Returns: + True if successful, False otherwise + """ + try: + print("\n[GPU Clock Setup] Fixing GPU clock frequency...") + + # Enable persistence mode + result = subprocess.run( + ["sudo", "nvidia-smi", "-pm", "1"], + capture_output=True, + text=True, + timeout=10, + ) + if result.returncode != 0: + print( + f"Warning: Failed to enable persistence mode: {result.stderr}", + file=sys.stderr, + ) + return False + print("✓ Enabled persistence mode") + + # Lock GPU clocks + if gpu_clock is None: + # Get max GPU clock + result = subprocess.run( + [ + "nvidia-smi", + "--query-gpu=clocks.max.gr", + "--format=csv,noheader,nounits", + ], + capture_output=True, + text=True, + timeout=10, + ) + if result.returncode == 0: + gpu_clock = int(result.stdout.strip().split("\n")[0]) + print(f"✓ Detected max GPU clock: {gpu_clock} MHz") + + # Lock GPU clock to the target frequency + result = subprocess.run( + ["sudo", "nvidia-smi", "-lgc", f"{gpu_clock},{gpu_clock}"], + capture_output=True, + text=True, + timeout=10, + ) + if result.returncode != 0: + print( + f"Warning: Failed to lock GPU clock: {result.stderr}", + file=sys.stderr, + ) + return False + + print(f"✓ Locked GPU clock to {gpu_clock} MHz") + return True + + except Exception as e: + print(f"Error: Failed to set GPU clocks: {e}", file=sys.stderr) + return False + + +def reset_gpu_clocks() -> bool: + """Reset GPU clock frequencies to default.""" + try: + print("\n[GPU Clock Cleanup] Resetting GPU clock frequency...") + + # Reset GPU clocks + result = subprocess.run( + ["sudo", "nvidia-smi", "-rgc"], + capture_output=True, + text=True, + timeout=10, + ) + if result.returncode != 0: + print( + f"Warning: Failed to reset GPU clock: {result.stderr}", + file=sys.stderr, + ) + return False + print("✓ Reset GPU clock to default") + + # Disable persistence mode + result = subprocess.run( + ["sudo", "nvidia-smi", "-pm", "0"], + capture_output=True, + text=True, + timeout=10, + ) + if result.returncode != 0: + print( + "Warning: Failed to disable persistence mode: " f"{result.stderr}", + file=sys.stderr, + ) + return False + print("✓ Disabled persistence mode") + + return True + + except Exception as e: + print(f"Error: Failed to reset GPU clocks: {e}", file=sys.stderr) + return False + + +def _print_warmup_info(warmup_runs: int) -> None: + """Print warmup phase information.""" + if warmup_runs > 0: + print(f"\n{'='*70}") + print(f"WARMUP PHASE: Running {warmup_runs} warmup iterations...") + print(f"{'='*70}") + + +def _print_benchmark_info( + actual_benchmark_runs: int, trim_count: int, num_runs: int +) -> None: + """Print benchmark phase information.""" + print(f"\n{'='*70}") + print(f"BENCHMARK PHASE: Running {actual_benchmark_runs} iterations") + print(f"Will trim top and bottom {trim_count} results (10% of {num_runs})") + print(f"Final statistics will be based on middle {num_runs} results") + print(f"{'='*70}") + + +def _run_single_iteration( + command: str, run_num: int, verbose: bool +) -> 
Optional[RunMetrics]: + """ + Run a single benchmark iteration and return metrics. + + Args: + command: Command to execute + run_num: Current run number + verbose: Print verbose output + + Returns: + RunMetrics if successful, None otherwise + """ + try: + # Run command and capture output + result = subprocess.run( + command, + shell=True, + capture_output=True, + text=True, + timeout=300, # 5 minute timeout + ) + + if result.returncode != 0: + print( + f"Error: Command failed with return code {result.returncode}", + file=sys.stderr, + ) + if result.stderr: + print(f"stderr: {result.stderr}", file=sys.stderr) + return None + + # Search for PyTorchObserver line in output + observer_line = None + for line in result.stdout.split("\n"): + if "PyTorchObserver" in line: + observer_line = line + break + + if observer_line is None: + print( + f"Warning: No PyTorchObserver output found in run {run_num}", + file=sys.stderr, + ) + if verbose: + print(f"stdout:\n{result.stdout}", file=sys.stderr) + return None + + # Parse and return metrics + metrics = parse_pytorch_observer_log(observer_line) + if metrics is None: + print( + f"Warning: Failed to parse metrics from run {run_num}", + file=sys.stderr, + ) + return None + + print(f"✓ {metrics}") + return metrics + + except subprocess.TimeoutExpired: + print(f"Error: Command timed out on run {run_num}", file=sys.stderr) + return None + except Exception as e: + print(f"Error on run {run_num}: {e}", file=sys.stderr) + return None + + +def run_model_benchmark( + command: str, + num_runs: int = 5, + warmup_runs: int = 0, + verbose: bool = False, +) -> List[RunMetrics]: + """ + Run the model runner command multiple times and collect metrics. + + For trimmed mean calculation, this function runs extra iterations + to ensure we can trim outliers. Based on num_runs, we calculate + trim_count = num_runs * 0.1, then run num_runs + 2*trim_count total + iterations. The top and bottom trim_count results will be discarded. + + Args: + command: Full command to run + num_runs: Number of benchmark runs requested by user (after trim) + warmup_runs: Number of warmup runs (results will be discarded) + verbose: Print detailed output + + Returns: + List of RunMetrics from benchmark runs (excluding warmup). + """ + # Calculate trim count and total runs + trim_count = int(num_runs * 0.1) + actual_benchmark_runs = num_runs + 2 * trim_count + total_runs = warmup_runs + actual_benchmark_runs + + # Print phase information + _print_warmup_info(warmup_runs) + _print_benchmark_info(actual_benchmark_runs, trim_count, num_runs) + + # Execute all runs + results = [] + for run_num in range(1, total_runs + 1): + is_warmup = run_num <= warmup_runs + phase = "Warmup" if is_warmup else "Benchmark" + benchmark_run_num = run_num - warmup_runs if not is_warmup else run_num + + # Print run header + if is_warmup: + print(f"\n[{phase} {run_num}/{warmup_runs}] Executing: {command}") + else: + print( + f"\n[{phase} {benchmark_run_num}/{actual_benchmark_runs}] " + f"Executing: {command}" + ) + + # Run iteration and collect metrics + metrics = _run_single_iteration(command, run_num, verbose) + if metrics is not None and not is_warmup: + results.append(metrics) + + return results + + +def calculate_trimmed_stats( + values: List[float], trim_count: int +) -> Tuple[List[float], float, float, float, float]: + """ + Calculate statistics on trimmed data. 
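+
+    Example (illustrative): with trim_count=1, the single highest and
+    lowest values are dropped before the statistics are computed:
+
+        >>> calculate_trimmed_stats([10.0, 1.0, 2.0, 3.0, 4.0], 1)
+        ([2.0, 3.0, 4.0], 2.0, 4.0, 3.0, 1.0)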
+ + Args: + values: List of numeric values + trim_count: Number of values to trim from each end + + Returns: + Tuple of (trimmed_values, min, max, mean, stdev) + """ + if not values: + return [], 0.0, 0.0, 0.0, 0.0 + + # Sort values + sorted_values = sorted(values) + n = len(sorted_values) + + # Trim if we have enough data and trim_count > 0 + if trim_count > 0 and n > 2 * trim_count: + trimmed_values = sorted_values[trim_count : n - trim_count] + else: + trimmed_values = sorted_values + + # Calculate stats on trimmed data + min_val = min(trimmed_values) + max_val = max(trimmed_values) + mean_val = statistics.mean(trimmed_values) + stdev_val = statistics.stdev(trimmed_values) if len(trimmed_values) > 1 else 0.0 + + return trimmed_values, min_val, max_val, mean_val, stdev_val + + +@dataclass +class MetricStats: + """Statistics for a single metric with operations.""" + + name: str + mean: float + min_val: float + max_val: float + stdev: float + unit: str = "" + extra_info: dict | None = None + + def create_v3_record( + self, + model_name: str, + backend: str, + runner_name: str, + runner_type: str, + base_extra_info: dict, + ) -> dict: + """ + Create a v3 format record for this metric. + + Args: + model_name: Model name with quantization + backend: Backend name (e.g., "cuda-aoti") + runner_name: GPU device name + runner_type: CUDA driver version + base_extra_info: Base extra_info dict to copy + + Returns: + Complete v3 format metric record + """ + extra_stats = { + "min": self.min_val, + "max": self.max_val, + "stdev": self.stdev, + } + if self.extra_info: + extra_stats.update(self.extra_info) + + return { + "benchmark": { + "name": "ExecuTorch", + "mode": "inference", + "extra_info": base_extra_info.copy(), + }, + "model": { + "name": model_name, + "type": "OSS model", + "backend": backend, + }, + "metric": { + "name": self.name, + "benchmark_values": [self.mean], + "target_value": 0, + "extra_info": extra_stats, + }, + "runners": [{"name": runner_name, "type": runner_type}], + } + + def print_stats(self) -> None: + """Print formatted statistics for this metric.""" + # Determine precision based on metric type + is_throughput = "tokens" in self.name.lower() + precision = 2 if is_throughput else 0 + + # Format metric name for display + display_name = self.name.replace("_", " ").upper() + if self.unit: + display_name = f"{display_name} ({self.unit})" + + print(f"{display_name}:") + print(f" Min: {self.min_val:.{precision}f} {self.unit}") + print(f" Max: {self.max_val:.{precision}f} {self.unit}") + print(f" Mean: {self.mean:.{precision}f} {self.unit}") + print(f" Stdev: {self.stdev:.{precision}f} {self.unit}") + print() + + +@dataclass +class BenchmarkResults: + """Summary of benchmark results.""" + + model_name: str + total_runs: int + trimmed_runs: int + discarded_runs: int + generated_tokens: int + + # Metrics + throughput: MetricStats + model_load_time: MetricStats + total_inference_time: MetricStats + encoder_time: MetricStats + generation_time: MetricStats + first_token_latency: MetricStats + + def save_json(self, output_path: str) -> None: + """Save results to JSON file.""" + with open(output_path, "w") as f: + json.dump(self.to_dict(), f, indent=2) + print(f"\n✓ Results saved to: {output_path}") + + def to_dict(self) -> dict: + """Convert results to dictionary for JSON serialization.""" + return { + "model_name": self.model_name, + "total_runs": self.total_runs, + "trimmed_runs": self.trimmed_runs, + "discarded_runs": self.discarded_runs, + "generated_tokens": self.generated_tokens, + 
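+            # Flattened per-metric statistics follow: one mean/min/max/stdev
+            # block for each MetricStats field declared above.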
"throughput_mean": self.throughput.mean, + "throughput_min": self.throughput.min_val, + "throughput_max": self.throughput.max_val, + "throughput_stdev": self.throughput.stdev, + "model_load_time_mean": self.model_load_time.mean, + "model_load_time_min": self.model_load_time.min_val, + "model_load_time_max": self.model_load_time.max_val, + "model_load_time_stdev": self.model_load_time.stdev, + "total_inference_time_mean": self.total_inference_time.mean, + "total_inference_time_min": self.total_inference_time.min_val, + "total_inference_time_max": self.total_inference_time.max_val, + "total_inference_time_stdev": self.total_inference_time.stdev, + "encoder_time_mean": self.encoder_time.mean, + "encoder_time_min": self.encoder_time.min_val, + "encoder_time_max": self.encoder_time.max_val, + "encoder_time_stdev": self.encoder_time.stdev, + "generation_time_mean": self.generation_time.mean, + "generation_time_min": self.generation_time.min_val, + "generation_time_max": self.generation_time.max_val, + "generation_time_stdev": self.generation_time.stdev, + "first_token_latency_mean": self.first_token_latency.mean, + "first_token_latency_min": self.first_token_latency.min_val, + "first_token_latency_max": self.first_token_latency.max_val, + "first_token_latency_stdev": self.first_token_latency.stdev, + } + + def to_v3_format( + self, + model: str, + quantization: str, + git_sha: str, + workflow_run_id: str, + workflow_run_url: str = "", + gpu_name: str = "CUDA", + cuda_driver_version: str = "cuda", + ) -> List[dict]: + """ + Transform benchmark results to PyTorch benchmark database v3 format. + + Args: + model: Model name (e.g., "openai/whisper-small") + quantization: Quantization type (e.g., "non-quantized") + git_sha: Git commit SHA + workflow_run_id: GitHub workflow run ID + workflow_run_url: GitHub workflow run URL + gpu_name: GPU device name (e.g., "Tesla V100", "A100") + cuda_driver_version: CUDA driver version (e.g., "12.6", "535.104.05") + + Returns: + List of benchmark records in v3 format + """ + # Shared configuration + model_name_with_quant = f"{model}_{quantization}" + backend = "cuda-aoti" + runner_name = gpu_name + runner_type = cuda_driver_version + + # Create base extra_info + base_extra_info = { + "backend": "cuda", + "quantization": quantization, + "git_sha": git_sha, + "workflow_run_id": workflow_run_id, + } + if workflow_run_url: + base_extra_info["workflow_run_url"] = workflow_run_url + + # Create v3 records for all metrics + return [ + self.throughput.create_v3_record( + model_name_with_quant, + backend, + runner_name, + runner_type, + base_extra_info, + ), + self.model_load_time.create_v3_record( + model_name_with_quant, + backend, + runner_name, + runner_type, + base_extra_info, + ), + self.total_inference_time.create_v3_record( + model_name_with_quant, + backend, + runner_name, + runner_type, + base_extra_info, + ), + self.encoder_time.create_v3_record( + model_name_with_quant, + backend, + runner_name, + runner_type, + base_extra_info, + ), + self.generation_time.create_v3_record( + model_name_with_quant, + backend, + runner_name, + runner_type, + base_extra_info, + ), + self.first_token_latency.create_v3_record( + model_name_with_quant, + backend, + runner_name, + runner_type, + base_extra_info, + ), + ] + + +def compute_summary( + model_name: str, results: List[RunMetrics], requested_runs: int +) -> BenchmarkResults: + """ + Compute summary statistics using trimmed data. 
+ + All statistics (min, max, mean, stdev) are calculated based on + the trimmed dataset after removing outliers. + + Args: + model_name: Name of the model being benchmarked + results: List of all collected run metrics + requested_runs: Number of runs originally requested by user + + Returns: + BenchmarkResults object with all computed statistics + """ + if not results: + raise ValueError("No valid results to summarize.") + + # Calculate trim count based on requested runs (not actual runs) + trim_count = int(requested_runs * 0.1) + + # Helper to create MetricStats from values + def create_metric_stats( + name: str, values: List[float], unit: str = "", extra_info: dict | None = None + ) -> MetricStats: + _, min_val, max_val, mean_val, stdev_val = calculate_trimmed_stats( + values, trim_count + ) + return MetricStats( + name=name, + mean=mean_val, + min_val=min_val, + max_val=max_val, + stdev=stdev_val, + unit=unit, + extra_info=extra_info, + ) + + # Get the first trimmed result to get trimmed_runs count + trimmed_throughput, _, _, _, _ = calculate_trimmed_stats( + [r.tokens_per_sec for r in results], trim_count + ) + + return BenchmarkResults( + model_name=model_name, + total_runs=len(results), + trimmed_runs=len(trimmed_throughput), + discarded_runs=trim_count * 2, + generated_tokens=results[0].generated_tokens, + throughput=create_metric_stats( + "throughput(tokens/sec)", + [r.tokens_per_sec for r in results], + "t/s", + {"trimmed_runs": len(trimmed_throughput)}, + ), + model_load_time=create_metric_stats( + "model_load_time(ms)", + [r.model_load_time_ms for r in results], + "ms", + ), + total_inference_time=create_metric_stats( + "total_inference_time(ms)", + [r.total_inference_time_ms for r in results], + "ms", + ), + encoder_time=create_metric_stats( + "encoder_time(ms)", + [r.encoder_time_ms for r in results], + "ms", + ), + generation_time=create_metric_stats( + "generation_time(ms)", + [r.generation_time_ms for r in results], + "ms", + ), + first_token_latency=create_metric_stats( + "first_token_latency(ms)", + [r.first_token_latency_ms for r in results], + "ms", + ), + ) + + +def print_summary(summary: BenchmarkResults) -> None: + """Print formatted summary of benchmark results.""" + print("\n" + "=" * 70) + print(f"BENCHMARK SUMMARY for model: {summary.model_name}") + print("=" * 70) + print(f"Total runs collected: {summary.total_runs}") + print(f"Trimmed to: {summary.trimmed_runs} runs") + print( + f"(Discarded {summary.discarded_runs // 2} highest and " + f"{summary.discarded_runs // 2} lowest results)" + ) + print(f"Generated tokens per run: {summary.generated_tokens}") + print() + + # Print all metrics using their print_stats method + summary.throughput.print_stats() + summary.model_load_time.print_stats() + summary.total_inference_time.print_stats() + summary.encoder_time.print_stats() + summary.generation_time.print_stats() + summary.first_token_latency.print_stats() + + print("=" * 70) + + +def main(): + # Parse command-line arguments + parser = argparse.ArgumentParser( + description="Benchmark CUDA model runners and collect performance metrics" + ) + parser.add_argument( + "--runner_command", + type=str, + required=True, + help="Full command to run the model runner", + ) + parser.add_argument( + "--model_name", + type=str, + required=True, + help="Name of the model being benchmarked", + ) + parser.add_argument( + "--num_runs", + type=int, + default=50, + help="Number of benchmark runs (default: 50)", + ) + parser.add_argument( + "--warmup_runs", + type=int, + default=0, + 
help="Number of warmup runs before benchmark (default: 0.1 * num_runs)", + ) + parser.add_argument( + "--fix_gpu_clock", + type=bool, + default=True, + help="Fix GPU clock frequency to maximum before benchmarking", + ) + parser.add_argument( + "--gpu_clock", + type=int, + default=None, + help="Target GPU clock frequency in MHz (requires " + "--fix_gpu_clock). If not specified, uses max available.", + ) + parser.add_argument( + "--output_json", + type=str, + default=None, + help="Path to save JSON results", + ) + parser.add_argument( + "--output_v3", + type=str, + default=None, + help="Path to save v3 format JSON results for dashboard", + ) + parser.add_argument( + "--model", + type=str, + default=None, + help="Model ID (e.g., 'openai/whisper-small') - required for v3 format", + ) + parser.add_argument( + "--quantization", + type=str, + default=None, + help="Quantization type (e.g., 'non-quantized') - required for v3 format", + ) + parser.add_argument( + "--git_sha", + type=str, + default=None, + help="Git commit SHA - required for v3 format", + ) + parser.add_argument( + "--workflow_run_id", + type=str, + default=None, + help="GitHub workflow run ID - required for v3 format", + ) + parser.add_argument( + "--workflow_run_url", + type=str, + default="", + help="GitHub workflow run URL - optional for v3 format", + ) + parser.add_argument( + "--gpu_name", + type=str, + default=None, + help="GPU device name (e.g., 'Tesla V100', 'A100') - optional for v3 format", + ) + parser.add_argument( + "--cuda_driver_version", + type=str, + default=None, + help="CUDA driver version (e.g., '12.6', '535.104.05') - optional for v3 format", + ) + parser.add_argument("--verbose", action="store_true", help="Print verbose output") + + args = parser.parse_args() + + warmup_runs = ( + int(0.1 * args.num_runs) if args.warmup_runs == 0 else args.warmup_runs + ) + + print(f"Running benchmark for model: {args.model_name}") + print(f"Number of runs: {args.num_runs}") + if warmup_runs > 0: + print(f"Warmup runs: {warmup_runs}") + if args.fix_gpu_clock: + clock_str = f"{args.gpu_clock}" if args.gpu_clock else "max available" + print(f"GPU clock will be fixed to: {clock_str} MHz") + print(f"Command: {args.runner_command}\n") + + # Fix GPU clocks if requested + gpu_clock_fixed = False + if args.fix_gpu_clock: + # Get current clocks before fixing + initial_clocks = get_gpu_clocks() + if initial_clocks: + print( + f"Current GPU clocks - GPU: {initial_clocks[0]} MHz, " + f"Memory: {initial_clocks[1]} MHz" + ) + + gpu_clock_fixed = set_gpu_clocks(args.gpu_clock) + if not gpu_clock_fixed: + print( + "Warning: Failed to fix GPU clocks. 
" + "Continuing without fixed clocks...", + file=sys.stderr, + ) + + try: + # Run benchmark + results = run_model_benchmark( + command=args.runner_command, + num_runs=args.num_runs, + warmup_runs=warmup_runs, + verbose=args.verbose, + ) + + # Compute and print summary + summary = compute_summary(args.model_name, results, args.num_runs) + print_summary(summary) + + # Save JSON results if requested + if args.output_json: + summary.save_json(args.output_json) + + # Save v3 format if requested + if args.output_v3: + # Validate required parameters for v3 format + if not all( + [args.model, args.quantization, args.git_sha, args.workflow_run_id] + ): + print( + "Error: --output_v3 requires --model, --quantization, " + "--git_sha, and --workflow_run_id", + file=sys.stderr, + ) + sys.exit(1) + + v3_records = summary.to_v3_format( + model=args.model, + quantization=args.quantization, + git_sha=args.git_sha, + workflow_run_id=args.workflow_run_id, + workflow_run_url=args.workflow_run_url, + gpu_name=args.gpu_name if args.gpu_name else "UNKNOWN GPU", + cuda_driver_version=( + args.cuda_driver_version if args.cuda_driver_version else "cuda" + ), + ) + + with open(args.output_v3, "w") as f: + json.dump(v3_records, f, indent=2) + + print(f"✓ v3 format results saved to: {args.output_v3}") + print(f"✓ Generated {len(v3_records)} v3 records for dashboard upload") + + finally: + # Reset GPU clocks if they were fixed + if gpu_clock_fixed: + reset_gpu_clocks() + + +if __name__ == "__main__": + main() diff --git a/.github/scripts/trigger_cuda_perf.sh b/.github/scripts/trigger_cuda_perf.sh new file mode 100755 index 00000000000..402dd009673 --- /dev/null +++ b/.github/scripts/trigger_cuda_perf.sh @@ -0,0 +1,100 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Quick script to trigger cuda-perf workflow via GitHub CLI +# Usage: +# ./trigger_cuda_perf.sh # Use defaults (random model + quant) +# ./trigger_cuda_perf.sh --all # Run ALL models with ALL quantizations +# ./trigger_cuda_perf.sh "openai/whisper-medium" # Single model +# ./trigger_cuda_perf.sh "openai/whisper-small,google/gemma-3-4b-it" "non-quantized,quantized-int4-tile-packed" "100" + +set -e + +# All available models and quantizations +ALL_MODELS="mistralai/Voxtral-Mini-3B-2507,openai/whisper-small,openai/whisper-medium,openai/whisper-large-v3-turbo,google/gemma-3-4b-it" +ALL_QUANTIZATIONS="non-quantized,quantized-int4-tile-packed,quantized-int4-weight-only" + +# Check if gh CLI is installed +if ! command -v gh &> /dev/null; then + echo "Error: GitHub CLI (gh) is not installed." 
+ echo "Install it from: https://cli.github.com/" + echo "" + echo "Quick install:" + echo " macOS: brew install gh" + echo " Linux: See https://github.com/cli/cli/blob/trunk/docs/install_linux.md" + exit 1 +fi + +# Check for --all flag +RUN_ALL=false +if [ "${1:-}" = "--all" ] || [ "${1:-}" = "-a" ]; then + RUN_ALL=true + shift # Remove the flag from arguments +fi + +# Default parameters +if [ "$RUN_ALL" = true ]; then + MODELS="$ALL_MODELS" + QUANT="$ALL_QUANTIZATIONS" + NUM_RUNS="${1:-50}" + RANDOM_MODEL="false" + echo "=========================================" + echo "Triggering cuda-perf workflow" + echo "Mode: RUN ALL MODELS AND QUANTIZATIONS" + echo "=========================================" + echo "Models: ALL (5 models)" + echo "Quantizations: ALL (3 quantizations)" + echo "Total configs: 15 combinations" + echo "Num runs: $NUM_RUNS" + echo "=========================================" +else + MODELS="${1:-}" + QUANT="${2:-}" + NUM_RUNS="${3:-50}" + RANDOM_MODEL="${4:-false}" + + # Display configuration + echo "=========================================" + echo "Triggering cuda-perf workflow" + echo "=========================================" + if [ -z "$MODELS" ]; then + echo "Models: (random selection)" + else + echo "Models: $MODELS" + fi + if [ -z "$QUANT" ]; then + echo "Quantizations: (random selection)" + else + echo "Quantizations: $QUANT" + fi + echo "Num runs: $NUM_RUNS" + echo "Random model: $RANDOM_MODEL" + echo "=========================================" +fi + +echo "" + +# Trigger workflow +gh workflow run cuda-perf.yml \ + -R pytorch/executorch \ + -f models="$MODELS" \ + -f quantizations="$QUANT" \ + -f num_runs="$NUM_RUNS" \ + -f random_model="$RANDOM_MODEL" + +if [ $? -eq 0 ]; then + echo "✓ Workflow triggered successfully!" 
+ echo "" + echo "View status:" + echo " gh run list --workflow=cuda-perf.yml" + echo "" + echo "Watch the latest run:" + echo " gh run watch \$(gh run list --workflow=cuda-perf.yml --limit 1 --json databaseId --jq '.[0].databaseId')" +else + echo "✗ Failed to trigger workflow" + exit 1 +fi diff --git a/.github/workflows/cuda-perf.yml b/.github/workflows/cuda-perf.yml new file mode 100644 index 00000000000..13efd9eff84 --- /dev/null +++ b/.github/workflows/cuda-perf.yml @@ -0,0 +1,420 @@ +name: cuda-perf + +on: + schedule: + - cron: 0 8 * * * # 1am PST (8am UTC) + pull_request: + paths: + - .github/workflows/cuda-perf.yml + - .ci/scripts/cuda_benchmark.py + - .ci/scripts/export_model_artifact.sh + - .ci/scripts/test_model_e2e.sh + push: + branches: + - main + paths: + - .github/workflows/cuda-perf.yml + - .ci/scripts/cuda_benchmark.py + - .ci/scripts/export_model_artifact.sh + - .ci/scripts/test_model_e2e.sh + workflow_dispatch: + inputs: + models: + description: Models to be benchmarked (comma-separated HuggingFace model IDs) + required: false + type: string + default: openai/whisper-small + quantizations: + description: Quantization types (comma-separated) + required: false + type: string + default: non-quantized + num_runs: + description: Number of benchmark runs per model + required: false + type: string + default: "50" + random_model: + description: Run a random model instead of all models + required: false + type: boolean + default: false + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +jobs: + set-parameters: + runs-on: ubuntu-22.04 + outputs: + benchmark_configs: ${{ steps.set-parameters.outputs.benchmark_configs }} + steps: + - uses: actions/checkout@v3 + with: + submodules: 'false' + - uses: actions/setup-python@v4 + with: + python-version: '3.10' + - name: Set parameters + id: set-parameters + shell: bash + env: + # All available models and quantizations + ALL_MODELS: 'mistralai/Voxtral-Mini-3B-2507,openai/whisper-small,openai/whisper-medium,openai/whisper-large-v3-turbo,google/gemma-3-4b-it' + ALL_QUANTIZATIONS: 'non-quantized,quantized-int4-tile-packed,quantized-int4-weight-only' + NUM_RUNS: ${{ inputs.num_runs || '50' }} + RANDOM_MODEL: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' && 'true' || inputs.random_model || 'false' }} + run: | + set -eux + + MODELS="${{ inputs.models }}" + QUANTIZATIONS="${{ inputs.quantizations }}" + + # For non-schedule events (PR, manual trigger without inputs), randomly select one model and one quantization + if [ -z "$MODELS" ] && [ "${{ github.event_name }}" != "schedule" ]; then + # Split all models into array + IFS=',' read -ra ALL_MODEL_ARRAY <<< "$ALL_MODELS" + # Randomly select one model + RANDOM_MODEL_INDEX=$((RANDOM % ${#ALL_MODEL_ARRAY[@]})) + MODELS="${ALL_MODEL_ARRAY[$RANDOM_MODEL_INDEX]}" + echo "Randomly selected model for PR/push: $MODELS" + elif [ -z "$MODELS" ]; then + # Schedule event: use all models + MODELS="$ALL_MODELS" + fi + + if [ -z "$QUANTIZATIONS" ] && [ "${{ github.event_name }}" != "schedule" ]; then + # Split all quantizations into array + IFS=',' read -ra ALL_QUANT_ARRAY <<< "$ALL_QUANTIZATIONS" + # Randomly select one quantization + RANDOM_QUANT_INDEX=$((RANDOM % ${#ALL_QUANT_ARRAY[@]})) + QUANTIZATIONS="${ALL_QUANT_ARRAY[$RANDOM_QUANT_INDEX]}" + echo "Randomly selected 
quantization for PR/push: $QUANTIZATIONS" + elif [ -z "$QUANTIZATIONS" ]; then + # Schedule event: use all quantizations + QUANTIZATIONS="$ALL_QUANTIZATIONS" + fi + + # Split models and quantizations into arrays + IFS=',' read -ra MODEL_ARRAY <<< "$MODELS" + IFS=',' read -ra QUANT_ARRAY <<< "$QUANTIZATIONS" + + # If random model is requested (for main branch push), select one random model from the already selected models + if [ "$RANDOM_MODEL" = "true" ] && [ ${#MODEL_ARRAY[@]} -gt 1 ]; then + RANDOM_INDEX=$((RANDOM % ${#MODEL_ARRAY[@]})) + MODELS="${MODEL_ARRAY[$RANDOM_INDEX]}" + MODEL_ARRAY=("$MODELS") + echo "Random model selected for main branch push: $MODELS" + fi + + # Generate benchmark configs + CONFIGS='{"include":[' + FIRST=true + for MODEL in "${MODEL_ARRAY[@]}"; do + for QUANT in "${QUANT_ARRAY[@]}"; do + if [ "$FIRST" = true ]; then + FIRST=false + else + CONFIGS+=',' + fi + # Sanitize model name for use in artifact paths + MODEL_SAFE=$(echo "$MODEL" | sed 's/\//_/g') + CONFIGS+="{\"model\":\"$MODEL\",\"quant\":\"$QUANT\",\"model_safe\":\"$MODEL_SAFE\",\"num_runs\":\"$NUM_RUNS\"}" + done + done + CONFIGS+=']}' + + echo "benchmark_configs=$CONFIGS" >> $GITHUB_OUTPUT + echo "Generated benchmark configs:" + echo "$CONFIGS" | python -m json.tool + + export-models: + name: export-models + needs: set-parameters + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + secrets: inherit + strategy: + matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_configs) }} + fail-fast: false + with: + timeout: 90 + secrets-env: EXECUTORCH_HF_TOKEN + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "12.6" + use-custom-docker-registry: false + submodules: recursive + upload-artifact: model-${{ matrix.model_safe }}-${{ matrix.quant }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + script: | + set -eux + echo "::group::Setup ExecuTorch" + ./install_executorch.sh + echo "::endgroup::" + + echo "::group::Setup Huggingface" + pip install -U "huggingface_hub[cli]<1.0" accelerate + huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN + OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt) + pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION} + echo "::endgroup::" + + echo "::group::Exporting model ${{ matrix.model }} with quantization ${{ matrix.quant }}" + OUTPUT_DIR="model_artifacts" + mkdir -p "$OUTPUT_DIR" + + bash .ci/scripts/export_model_artifact.sh cuda "${{ matrix.model }}" "${{ matrix.quant }}" "$OUTPUT_DIR" + + # Move artifacts to RUNNER_ARTIFACT_DIR for upload + mv "$OUTPUT_DIR"/* "${RUNNER_ARTIFACT_DIR}/" + ls -lah "${RUNNER_ARTIFACT_DIR}" + echo "::endgroup::" + + benchmark-cuda: + name: benchmark-cuda + needs: + - set-parameters + - export-models + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + strategy: + matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_configs) }} + fail-fast: false + with: + timeout: 90 + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "12.6" + use-custom-docker-registry: false + submodules: recursive + download-artifact: model-${{ matrix.model_safe }}-${{ matrix.quant }} + upload-artifact: results-${{ matrix.model_safe }}-${{ matrix.quant }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + 
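+      # For pull_request events the PR head commit is checked out; otherwise the pushed SHA is used.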
script: | + set -eux + echo "::group::Setup environment" + ./install_requirements.sh + pip list + echo "::endgroup::" + + echo "::group::Prepare model artifacts" + mkdir -p model_artifacts + cp "${RUNNER_ARTIFACT_DIR}/model.pte" model_artifacts/model.pte + cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" model_artifacts/aoti_cuda_blob.ptd + + # Copy additional files if they exist + if [ -f "${RUNNER_ARTIFACT_DIR}/voxtral_preprocessor.pte" ]; then + cp "${RUNNER_ARTIFACT_DIR}/voxtral_preprocessor.pte" model_artifacts/ + fi + if [ -f "${RUNNER_ARTIFACT_DIR}/whisper_preprocessor.pte" ]; then + cp "${RUNNER_ARTIFACT_DIR}/whisper_preprocessor.pte" model_artifacts/ + fi + if [ -f "${RUNNER_ARTIFACT_DIR}/tekken.json" ]; then + cp "${RUNNER_ARTIFACT_DIR}/tekken.json" model_artifacts/ + fi + if [ -f "${RUNNER_ARTIFACT_DIR}/poem.wav" ]; then + cp "${RUNNER_ARTIFACT_DIR}/poem.wav" model_artifacts/ + fi + if [ -f "${RUNNER_ARTIFACT_DIR}/output.wav" ]; then + cp "${RUNNER_ARTIFACT_DIR}/output.wav" model_artifacts/ + fi + # Copy tokenizer files + for file in tokenizer.json tokenizer_config.json special_tokens_map.json; do + if [ -f "${RUNNER_ARTIFACT_DIR}/$file" ]; then + cp "${RUNNER_ARTIFACT_DIR}/$file" model_artifacts/ + fi + done + + ls -lah model_artifacts/ + echo "::endgroup::" + + echo "::group::Build runner" + bash .ci/scripts/test_model_e2e.sh cuda "${{ matrix.model }}" "${{ matrix.quant }}" model_artifacts + echo "::endgroup::" + + echo "::group::Running benchmark for ${{ matrix.model }} (${{ matrix.quant }}) with ${{ matrix.num_runs }} runs" + export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH + + # Get GPU name using nvidia-smi + GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader | head -1) + echo "Detected GPU: $GPU_NAME" + + # Get CUDA driver version + CUDA_DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -1) + echo "CUDA Driver Version: $CUDA_DRIVER_VERSION" + + # Create results directory + RESULTS_DIR="${RUNNER_ARTIFACT_DIR}" + mkdir -p "$RESULTS_DIR" + + # Determine model name and runner command based on model + case "${{ matrix.model }}" in + mistralai/Voxtral-Mini-3B-2507) + RUNNER="cmake-out/examples/models/voxtral/voxtral_runner" + PREPROCESSOR="model_artifacts/voxtral_preprocessor.pte" + TOKENIZER="model_artifacts/tekken.json" + AUDIO="model_artifacts/poem.wav" + RUNNER_CMD="$RUNNER --model_path model_artifacts/model.pte --data_path model_artifacts/aoti_cuda_blob.ptd --tokenizer_path $TOKENIZER --audio_path $AUDIO --processor_path $PREPROCESSOR --temperature 0" + MODEL_NAME="voxtral_${{ matrix.quant }}" + ;; + openai/whisper-*) + RUNNER="cmake-out/examples/models/whisper/whisper_runner" + PREPROCESSOR="model_artifacts/whisper_preprocessor.pte" + AUDIO="model_artifacts/output.wav" + RUNNER_CMD="$RUNNER --model_path model_artifacts/model.pte --data_path model_artifacts/aoti_cuda_blob.ptd --tokenizer_path model_artifacts/ --audio_path $AUDIO --processor_path $PREPROCESSOR --temperature 0" + MODEL_NAME=$(echo "${{ matrix.model }}" | sed 's/openai\///')_${{ matrix.quant }} + ;; + google/gemma-3-4b-it) + RUNNER="cmake-out/examples/models/gemma3/gemma3_e2e_runner" + IMAGE="docs/source/_static/img/et-logo.png" + RUNNER_CMD="$RUNNER --model_path model_artifacts/model.pte --data_path model_artifacts/aoti_cuda_blob.ptd --tokenizer_path model_artifacts/ --image_path $IMAGE --temperature 0" + MODEL_NAME="gemma3_${{ matrix.quant }}" + ;; + *) + echo "Error: Unsupported model '${{ matrix.model }}'" + exit 1 + ;; + esac + + # Run benchmark using 
cuda_benchmark.py + python .ci/scripts/cuda_benchmark.py \ + --runner_command "$RUNNER_CMD" \ + --model_name "$MODEL_NAME" \ + --num_runs "${{ matrix.num_runs }}" \ + --output_json "$RESULTS_DIR/benchmark_results.json" \ + --output_v3 "$RESULTS_DIR/benchmark_results_v3.json" \ + --model "${{ matrix.model }}" \ + --quantization "${{ matrix.quant }}" \ + --git_sha "${{ github.sha }}" \ + --workflow_run_id "${{ github.run_id }}" \ + --workflow_run_url "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" \ + --gpu_name "$GPU_NAME" \ + --cuda_driver_version "$CUDA_DRIVER_VERSION" + + # Save additional metadata + cat > "$RESULTS_DIR/metadata.json" < -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -namespace { - -using executorch::aten::ScalarType; -using executorch::aten::Tensor; -using executorch::extension::make_tensor_ptr; -using executorch::extension::TensorPtr; -using executorch::extension::module::Module; -using executorch::runtime::Error; -using executorch::runtime::EValue; -using executorch::runtime::Result; -using Clock = std::chrono::steady_clock; -using executorch::aten::TensorShapeDynamism; -using DurationMs = std::chrono::duration; - -enum class ModelType { GEMMA3, VOXTRAL, UNKNOWN }; - -struct ModelConfig { - std::string name; - size_t token_seq_len; - size_t text_embed_dim; - std::vector expected_methods; -}; - -const std::map model_configs = { - {ModelType::GEMMA3, - {"gemma3", - 128, - 2304, - {"vision_encoder", "token_embedding", "text_decoder"}}}, - {ModelType::VOXTRAL, - {"voxtral", - 1138, - 3072, - {"audio_encoder", "token_embedding", "text_decoder"}}}}; - -ModelType parse_model_type(const std::string& model_name) { - std::string lower_name = model_name; - std::transform( - lower_name.begin(), - lower_name.end(), - lower_name.begin(), - [](unsigned char c) { return std::tolower(c); }); - - if (lower_name.find("gemma3") != std::string::npos || - lower_name.find("gemma-3") != std::string::npos) { - return ModelType::GEMMA3; - } else if (lower_name.find("voxtral") != std::string::npos) { - return ModelType::VOXTRAL; - } - return ModelType::UNKNOWN; -} - -std::vector to_sizes( - std::initializer_list dims) { - return std::vector(dims.begin(), dims.end()); -} - -std::string format_shape(const Tensor& tensor) { - std::ostringstream oss; - oss << "["; - const auto& sizes = tensor.sizes(); - for (size_t i = 0; i < sizes.size(); ++i) { - if (i > 0) { - oss << ", "; - } - oss << sizes[i]; - } - oss << "]"; - return oss.str(); -} - -void print_tensor_summary(const std::string& label, const Tensor& tensor) { - std::cout << " " << label - << ": dtype=" << executorch::runtime::toString(tensor.scalar_type()) - << ", shape=" << format_shape(tensor) - << ", numel=" << tensor.numel() << std::endl; -} - -void dump_tensor_to_file(const std::string& filename, const Tensor& tensor) { - std::ofstream file(filename, std::ios::binary); - if (!file.is_open()) { - std::cerr << "Failed to open file for writing: " << filename << std::endl; - return; - } - - int32_t dtype = static_cast(tensor.scalar_type()); - file.write(reinterpret_cast(&dtype), sizeof(int32_t)); - - int32_t ndim = static_cast(tensor.sizes().size()); - file.write(reinterpret_cast(&ndim), sizeof(int32_t)); - - for (size_t i = 0; i < tensor.sizes().size(); ++i) { - int64_t dim_size = tensor.sizes()[i]; - file.write(reinterpret_cast(&dim_size), sizeof(int64_t)); - } - - const void* data_ptr = 
tensor.const_data_ptr(); - size_t element_size = 0; - - switch (tensor.scalar_type()) { - case ScalarType::Float: - element_size = sizeof(float); - break; - case ScalarType::BFloat16: - element_size = 2; - break; - case ScalarType::Half: - element_size = 2; - break; - case ScalarType::Long: - element_size = sizeof(int64_t); - break; - case ScalarType::Int: - element_size = sizeof(int32_t); - break; - default: - std::cerr << "Unsupported dtype for dumping: " - << executorch::runtime::toString(tensor.scalar_type()) - << std::endl; - return; - } - - size_t data_size = tensor.numel() * element_size; - file.write(reinterpret_cast(data_ptr), data_size); - file.close(); - - std::cout << "Dumped tensor to: " << filename << std::endl; -} - -TensorPtr create_vision_input() { - const auto sizes = to_sizes({1, 3, 896, 896}); - const size_t numel = 1ull * 3ull * 896ull * 896ull; - std::vector data(numel); - for (size_t i = 0; i < numel; ++i) { - data[i] = static_cast((i % 255) / 255.0); - } - return make_tensor_ptr( - sizes, - std::move(data), - {}, - {}, - ScalarType::BFloat16, - TensorShapeDynamism::DYNAMIC_UNBOUND); -} - -TensorPtr create_audio_input() { - const auto sizes = to_sizes({3, 128, 3000}); - const size_t numel = 3ull * 128ull * 3000ull; - std::vector data(numel, 0.5f); - return make_tensor_ptr( - sizes, std::move(data), {}, {}, ScalarType::BFloat16); -} - -TensorPtr create_token_ids_input(const ModelConfig& config) { - const auto sizes = to_sizes({1, static_cast(config.token_seq_len)}); - std::vector data(config.token_seq_len); - for (size_t i = 0; i < config.token_seq_len; ++i) { - data[i] = static_cast(i + 1); - } - return make_tensor_ptr(sizes, std::move(data)); -} - -TensorPtr create_positions_input(const ModelConfig& config) { - const auto sizes = to_sizes({static_cast(config.token_seq_len)}); - std::vector data(config.token_seq_len); - for (size_t i = 0; i < config.token_seq_len; ++i) { - data[i] = static_cast(i); - } - return make_tensor_ptr(sizes, std::move(data)); -} - -TensorPtr create_fallback_text_embedding(const ModelConfig& config) { - const auto sizes = to_sizes( - {1, - static_cast(config.token_seq_len), - static_cast(config.text_embed_dim)}); - const size_t numel = 1ull * config.token_seq_len * config.text_embed_dim; - std::vector data(numel, 0.0f); - return make_tensor_ptr( - sizes, std::move(data), {}, {}, ScalarType::BFloat16); -} - -struct MethodTiming { - double load_ms{0.0}; - double run_ms{0.0}; -}; - -enum class MethodCategory { ENCODER, TOKEN_EMBEDDING, TEXT_DECODER, UNKNOWN }; - -MethodCategory categorize_method(const std::string& method_name) { - std::string lower_name = method_name; - std::transform( - lower_name.begin(), - lower_name.end(), - lower_name.begin(), - [](unsigned char c) { return std::tolower(c); }); - - if (lower_name.find("vision") != std::string::npos || - lower_name.find("audio") != std::string::npos || - lower_name.find("encoder") != std::string::npos) { - return MethodCategory::ENCODER; - } else if ( - lower_name.find("token") != std::string::npos && - lower_name.find("embedding") != std::string::npos) { - return MethodCategory::TOKEN_EMBEDDING; - } else if ( - lower_name.find("text") != std::string::npos && - lower_name.find("decoder") != std::string::npos) { - return MethodCategory::TEXT_DECODER; - } - return MethodCategory::UNKNOWN; -} - -std::vector create_inputs_for_method( - const std::string& method_name, - MethodCategory category, - ModelType model_type, - const ModelConfig& config, - const EValue* token_output, - std::vector& 
owned_inputs) { - std::vector inputs; - - switch (category) { - case MethodCategory::ENCODER: { - if (method_name.find("vision") != std::string::npos) { - auto input = create_vision_input(); - owned_inputs.emplace_back(input); - inputs.emplace_back(*input); - } else if (method_name.find("audio") != std::string::npos) { - auto input = create_audio_input(); - owned_inputs.emplace_back(input); - inputs.emplace_back(*input); - } - break; - } - - case MethodCategory::TOKEN_EMBEDDING: { - auto token_ids = create_token_ids_input(config); - owned_inputs.emplace_back(token_ids); - inputs.emplace_back(*token_ids); - break; - } - - case MethodCategory::TEXT_DECODER: { - if (token_output && token_output->isTensor()) { - inputs.emplace_back(*token_output); - } else { - auto fallback_embedding = create_fallback_text_embedding(config); - owned_inputs.emplace_back(fallback_embedding); - inputs.emplace_back(*fallback_embedding); - } - - auto positions = create_positions_input(config); - owned_inputs.emplace_back(positions); - inputs.emplace_back(*positions); - break; - } - - default: - break; - } - - return inputs; -} - -Error execute_method( - Module& module, - const std::string& method_name, - MethodCategory category, - ModelType model_type, - const ModelConfig& config, - const EValue* token_output, - MethodTiming& timing, - EValue* output_storage = nullptr) { - ET_LOG(Info, "Loading %s...", method_name.c_str()); - - const auto load_start = Clock::now(); - const Error load_err = module.load_method(method_name); - const auto load_end = Clock::now(); - if (load_err != Error::Ok) { - std::cerr << "Failed to load method " << method_name << ": error code " - << static_cast(load_err) << std::endl; - return load_err; - } - timing.load_ms = DurationMs(load_end - load_start).count(); - - std::vector owned_inputs; - std::vector inputs = create_inputs_for_method( - method_name, category, model_type, config, token_output, owned_inputs); - - const auto run_start = Clock::now(); - ET_LOG(Info, "%s running", method_name.c_str()); - Result> output_result = - module.execute(method_name, inputs); - ET_LOG(Info, "%s done", method_name.c_str()); - const auto run_end = Clock::now(); - timing.run_ms = DurationMs(run_end - run_start).count(); - - if (output_result.error() != Error::Ok) { - std::cerr << method_name << " execution failed: error code " - << static_cast(output_result.error()) << std::endl; - return output_result.error(); - } - - const auto& outputs = output_result.get(); - if (!outputs.empty() && outputs[0].isTensor()) { - print_tensor_summary(method_name + " output", outputs[0].toTensor()); - - if (category == MethodCategory::ENCODER || - category == MethodCategory::TOKEN_EMBEDDING) { - dump_tensor_to_file(method_name + "_output.bin", outputs[0].toTensor()); - } - - if (output_storage) { - *output_storage = outputs[0]; - } - } - - return Error::Ok; -} - -} // namespace - -int main(int argc, char** argv) { - if (argc != 4) { - std::cerr - << "Usage: " << argv[0] - << " " - << std::endl; - std::cerr << " model_name: gemma3 or voxtral" << std::endl; - return 1; - } - - const std::string model_name = argv[1]; - const std::string program_path = argv[2]; - const std::string data_map_path = argv[3]; - - const ModelType model_type = parse_model_type(model_name); - if (model_type == ModelType::UNKNOWN) { - std::cerr << "Unknown model type: " << model_name << std::endl; - std::cerr << "Supported models: gemma3, voxtral" << std::endl; - return 1; - } - - const ModelConfig& config = model_configs.at(model_type); - std::cout 
<< "Running benchmark for model: " << config.name << std::endl; - - try { - Module module(program_path, data_map_path); - - const auto program_load_start = Clock::now(); - const Error program_load_error = module.load(); - const auto program_load_end = Clock::now(); - if (program_load_error != Error::Ok) { - std::cerr << "Failed to load ExecuTorch program: error code " - << static_cast(program_load_error) << std::endl; - return 1; - } - const DurationMs program_load_latency = - program_load_end - program_load_start; - - auto method_names_result = module.method_names(); - if (method_names_result.error() != Error::Ok) { - std::cerr << "Failed to get method names: error code " - << static_cast(method_names_result.error()) << std::endl; - return 1; - } - - const auto& available_methods = method_names_result.get(); - - std::cout << "Checking for expected methods..." << std::endl; - std::vector missing_methods; - for (const auto& expected : config.expected_methods) { - if (available_methods.find(expected) == available_methods.end()) { - missing_methods.push_back(expected); - } else { - std::cout << " ✓ " << expected << std::endl; - } - } - - if (!missing_methods.empty()) { - std::cerr << "\nError: Missing expected methods:" << std::endl; - for (const auto& missing : missing_methods) { - std::cerr << " ✗ " << missing << std::endl; - } - return 1; - } - - std::map timings; - EValue token_output; - bool token_executed = false; - - for (const auto& method_name : config.expected_methods) { - MethodCategory category = categorize_method(method_name); - MethodTiming timing; - - const EValue* input_token_ptr = - (category == MethodCategory::TEXT_DECODER && token_executed) - ? &token_output - : nullptr; - - EValue* output_storage = (category == MethodCategory::TOKEN_EMBEDDING) - ? &token_output - : nullptr; - - Error err = execute_method( - module, - method_name, - category, - model_type, - config, - input_token_ptr, - timing, - output_storage); - - if (err != Error::Ok) { - return 1; - } - - if (category == MethodCategory::TOKEN_EMBEDDING) { - token_executed = true; - } - - timings[method_name] = timing; - } - - std::cout << std::fixed << std::setprecision(3); - std::cout << "\n=== Benchmark Results ===" << std::endl; - std::cout << "Program load latency (ms): " << program_load_latency.count() - << std::endl; - - std::cout << "\nMethod load latency (ms):" << std::endl; - for (const auto& [name, timing] : timings) { - std::cout << " " << name << ": " << timing.load_ms << std::endl; - } - - std::cout << "\nRun latency (ms):" << std::endl; - for (const auto& [name, timing] : timings) { - std::cout << " " << name << ": " << timing.run_ms << std::endl; - } - - return 0; - } catch (const std::exception& ex) { - std::cerr << "Unhandled exception: " << ex.what() << std::endl; - return 1; - } -}