diff --git a/.ci/scripts/cuda_benchmark.py b/.ci/scripts/cuda_benchmark.py new file mode 100644 index 00000000000..b135925d4b4 --- /dev/null +++ b/.ci/scripts/cuda_benchmark.py @@ -0,0 +1,939 @@ +""" +Benchmark script for CUDA model runners. +Runs model runner commands multiple times and collects performance metrics. +Supports whisper, voxtral, gemma3, and other CUDA models. +""" + +import argparse +import json +import statistics +import subprocess +import sys +from dataclasses import dataclass +from typing import List, Optional, Tuple + + +@dataclass +class RunMetrics: + """Metrics from a single run.""" + + generated_tokens: int + tokens_per_sec: float + model_load_time_ms: float + total_inference_time_ms: float + encoder_time_ms: float + generation_time_ms: float + first_token_latency_ms: float + + def __repr__(self): + return ( + f"Tokens: {self.generated_tokens}, " + f"Throughput: {self.tokens_per_sec:.2f} t/s, " + f"Model load: {self.model_load_time_ms:.0f}ms, " + f"Total inference: {self.total_inference_time_ms:.0f}ms, " + f"Encoder: {self.encoder_time_ms:.0f}ms, " + f"Generation: {self.generation_time_ms:.0f}ms, " + f"First token: {self.first_token_latency_ms:.0f}ms" + ) + + +def parse_pytorch_observer_log(log_line: str) -> Optional[RunMetrics]: + """Parse PyTorchObserver JSON output and compute metrics.""" + try: + # Find the JSON part in the log line + if "PyTorchObserver" not in log_line: + return None + + json_str = log_line.split("PyTorchObserver")[1].strip() + data = json.loads(json_str) + + # Extract values + generated_tokens = data.get("generated_tokens", 0) + inference_start_ms = data.get("inference_start_ms", 0) + inference_end_ms = data.get("inference_end_ms", 0) + prompt_eval_end_ms = data.get("prompt_eval_end_ms", 0) + first_token_ms = data.get("first_token_ms", 0) + model_load_start_ms = data.get("model_load_start_ms", 0) + model_load_end_ms = data.get("model_load_end_ms", 0) + + # Compute metrics + # Total inference time: from inference start to inference end + total_inference_time_ms = inference_end_ms - inference_start_ms + + # Encoder time: from inference start to prompt evaluation end + encoder_time_ms = prompt_eval_end_ms - inference_start_ms + + # Generation time: from prompt evaluation end to inference end + generation_time_ms = inference_end_ms - prompt_eval_end_ms + + # Calculate throughput based on generation time + tokens_per_sec = ( + (generated_tokens / generation_time_ms * 1000) + if generation_time_ms > 0 + else 0 + ) + model_load_time_ms = model_load_end_ms - model_load_start_ms + first_token_latency_ms = first_token_ms - prompt_eval_end_ms + + return RunMetrics( + generated_tokens=generated_tokens, + tokens_per_sec=tokens_per_sec, + model_load_time_ms=model_load_time_ms, + total_inference_time_ms=total_inference_time_ms, + encoder_time_ms=encoder_time_ms, + generation_time_ms=generation_time_ms, + first_token_latency_ms=first_token_latency_ms, + ) + except (json.JSONDecodeError, KeyError, ValueError) as e: + print(f"Error parsing PyTorchObserver log: {e}", file=sys.stderr) + return None + + +def get_gpu_clocks() -> Optional[Tuple[str, str]]: + """Get current GPU and memory clock frequencies.""" + try: + # Get GPU clock + result_gpu = subprocess.run( + [ + "nvidia-smi", + "--query-gpu=clocks.gr", + "--format=csv,noheader,nounits", + ], + capture_output=True, + text=True, + timeout=10, + ) + # Get memory clock + result_mem = subprocess.run( + [ + "nvidia-smi", + "--query-gpu=clocks.mem", + "--format=csv,noheader,nounits", + ], + capture_output=True, + 
text=True, + timeout=10, + ) + + if result_gpu.returncode == 0 and result_mem.returncode == 0: + gpu_clock = result_gpu.stdout.strip().split("\n")[0] + mem_clock = result_mem.stdout.strip().split("\n")[0] + return gpu_clock, mem_clock + except Exception as e: + print(f"Warning: Failed to get GPU clocks: {e}", file=sys.stderr) + return None + + +def set_gpu_clocks(gpu_clock: Optional[int] = None) -> bool: + """ + Set GPU clock frequency to a fixed value. + + Args: + gpu_clock: Target GPU clock frequency in MHz. + If None, will use max available. + + Returns: + True if successful, False otherwise + """ + try: + print("\n[GPU Clock Setup] Fixing GPU clock frequency...") + + # Enable persistence mode + result = subprocess.run( + ["sudo", "nvidia-smi", "-pm", "1"], + capture_output=True, + text=True, + timeout=10, + ) + if result.returncode != 0: + print( + f"Warning: Failed to enable persistence mode: {result.stderr}", + file=sys.stderr, + ) + return False + print("✓ Enabled persistence mode") + + # Lock GPU clocks + if gpu_clock is None: + # Get max GPU clock + result = subprocess.run( + [ + "nvidia-smi", + "--query-gpu=clocks.max.gr", + "--format=csv,noheader,nounits", + ], + capture_output=True, + text=True, + timeout=10, + ) + if result.returncode == 0: + gpu_clock = int(result.stdout.strip().split("\n")[0]) + print(f"✓ Detected max GPU clock: {gpu_clock} MHz") + + # Lock GPU clock to the target frequency + result = subprocess.run( + ["sudo", "nvidia-smi", "-lgc", f"{gpu_clock},{gpu_clock}"], + capture_output=True, + text=True, + timeout=10, + ) + if result.returncode != 0: + print( + f"Warning: Failed to lock GPU clock: {result.stderr}", + file=sys.stderr, + ) + return False + + print(f"✓ Locked GPU clock to {gpu_clock} MHz") + return True + + except Exception as e: + print(f"Error: Failed to set GPU clocks: {e}", file=sys.stderr) + return False + + +def reset_gpu_clocks() -> bool: + """Reset GPU clock frequencies to default.""" + try: + print("\n[GPU Clock Cleanup] Resetting GPU clock frequency...") + + # Reset GPU clocks + result = subprocess.run( + ["sudo", "nvidia-smi", "-rgc"], + capture_output=True, + text=True, + timeout=10, + ) + if result.returncode != 0: + print( + f"Warning: Failed to reset GPU clock: {result.stderr}", + file=sys.stderr, + ) + return False + print("✓ Reset GPU clock to default") + + # Disable persistence mode + result = subprocess.run( + ["sudo", "nvidia-smi", "-pm", "0"], + capture_output=True, + text=True, + timeout=10, + ) + if result.returncode != 0: + print( + "Warning: Failed to disable persistence mode: " f"{result.stderr}", + file=sys.stderr, + ) + return False + print("✓ Disabled persistence mode") + + return True + + except Exception as e: + print(f"Error: Failed to reset GPU clocks: {e}", file=sys.stderr) + return False + + +def _print_warmup_info(warmup_runs: int) -> None: + """Print warmup phase information.""" + if warmup_runs > 0: + print(f"\n{'='*70}") + print(f"WARMUP PHASE: Running {warmup_runs} warmup iterations...") + print(f"{'='*70}") + + +def _print_benchmark_info( + actual_benchmark_runs: int, trim_count: int, num_runs: int +) -> None: + """Print benchmark phase information.""" + print(f"\n{'='*70}") + print(f"BENCHMARK PHASE: Running {actual_benchmark_runs} iterations") + print(f"Will trim top and bottom {trim_count} results (10% of {num_runs})") + print(f"Final statistics will be based on middle {num_runs} results") + print(f"{'='*70}") + + +def _run_single_iteration( + command: str, run_num: int, verbose: bool +) -> 
Optional[RunMetrics]: + """ + Run a single benchmark iteration and return metrics. + + Args: + command: Command to execute + run_num: Current run number + verbose: Print verbose output + + Returns: + RunMetrics if successful, None otherwise + """ + try: + # Run command and capture output + result = subprocess.run( + command, + shell=True, + capture_output=True, + text=True, + timeout=300, # 5 minute timeout + ) + + if result.returncode != 0: + print( + f"Error: Command failed with return code {result.returncode}", + file=sys.stderr, + ) + if result.stderr: + print(f"stderr: {result.stderr}", file=sys.stderr) + return None + + # Search for PyTorchObserver line in output + observer_line = None + for line in result.stdout.split("\n"): + if "PyTorchObserver" in line: + observer_line = line + break + + if observer_line is None: + print( + f"Warning: No PyTorchObserver output found in run {run_num}", + file=sys.stderr, + ) + if verbose: + print(f"stdout:\n{result.stdout}", file=sys.stderr) + return None + + # Parse and return metrics + metrics = parse_pytorch_observer_log(observer_line) + if metrics is None: + print( + f"Warning: Failed to parse metrics from run {run_num}", + file=sys.stderr, + ) + return None + + print(f"✓ {metrics}") + return metrics + + except subprocess.TimeoutExpired: + print(f"Error: Command timed out on run {run_num}", file=sys.stderr) + return None + except Exception as e: + print(f"Error on run {run_num}: {e}", file=sys.stderr) + return None + + +def run_model_benchmark( + command: str, + num_runs: int = 5, + warmup_runs: int = 0, + verbose: bool = False, +) -> List[RunMetrics]: + """ + Run the model runner command multiple times and collect metrics. + + For trimmed mean calculation, this function runs extra iterations + to ensure we can trim outliers. Based on num_runs, we calculate + trim_count = num_runs * 0.1, then run num_runs + 2*trim_count total + iterations. The top and bottom trim_count results will be discarded. + + Args: + command: Full command to run + num_runs: Number of benchmark runs requested by user (after trim) + warmup_runs: Number of warmup runs (results will be discarded) + verbose: Print detailed output + + Returns: + List of RunMetrics from benchmark runs (excluding warmup). + """ + # Calculate trim count and total runs + trim_count = int(num_runs * 0.1) + actual_benchmark_runs = num_runs + 2 * trim_count + total_runs = warmup_runs + actual_benchmark_runs + + # Print phase information + _print_warmup_info(warmup_runs) + _print_benchmark_info(actual_benchmark_runs, trim_count, num_runs) + + # Execute all runs + results = [] + for run_num in range(1, total_runs + 1): + is_warmup = run_num <= warmup_runs + phase = "Warmup" if is_warmup else "Benchmark" + benchmark_run_num = run_num - warmup_runs if not is_warmup else run_num + + # Print run header + if is_warmup: + print(f"\n[{phase} {run_num}/{warmup_runs}] Executing: {command}") + else: + print( + f"\n[{phase} {benchmark_run_num}/{actual_benchmark_runs}] " + f"Executing: {command}" + ) + + # Run iteration and collect metrics + metrics = _run_single_iteration(command, run_num, verbose) + if metrics is not None and not is_warmup: + results.append(metrics) + + return results + + +def calculate_trimmed_stats( + values: List[float], trim_count: int +) -> Tuple[List[float], float, float, float, float]: + """ + Calculate statistics on trimmed data. 
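+
+    Example (illustrative): with trim_count=1, the single highest and
+    lowest values are dropped before the statistics are computed:
+
+        >>> calculate_trimmed_stats([10.0, 1.0, 2.0, 3.0, 4.0], 1)
+        ([2.0, 3.0, 4.0], 2.0, 4.0, 3.0, 1.0)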
+ + Args: + values: List of numeric values + trim_count: Number of values to trim from each end + + Returns: + Tuple of (trimmed_values, min, max, mean, stdev) + """ + if not values: + return [], 0.0, 0.0, 0.0, 0.0 + + # Sort values + sorted_values = sorted(values) + n = len(sorted_values) + + # Trim if we have enough data and trim_count > 0 + if trim_count > 0 and n > 2 * trim_count: + trimmed_values = sorted_values[trim_count : n - trim_count] + else: + trimmed_values = sorted_values + + # Calculate stats on trimmed data + min_val = min(trimmed_values) + max_val = max(trimmed_values) + mean_val = statistics.mean(trimmed_values) + stdev_val = statistics.stdev(trimmed_values) if len(trimmed_values) > 1 else 0.0 + + return trimmed_values, min_val, max_val, mean_val, stdev_val + + +@dataclass +class MetricStats: + """Statistics for a single metric with operations.""" + + name: str + mean: float + min_val: float + max_val: float + stdev: float + unit: str = "" + extra_info: dict | None = None + + def create_v3_record( + self, + model_name: str, + backend: str, + runner_name: str, + runner_type: str, + base_extra_info: dict, + ) -> dict: + """ + Create a v3 format record for this metric. + + Args: + model_name: Model name with quantization + backend: Backend name (e.g., "cuda-aoti") + runner_name: GPU device name + runner_type: CUDA driver version + base_extra_info: Base extra_info dict to copy + + Returns: + Complete v3 format metric record + """ + extra_stats = { + "min": self.min_val, + "max": self.max_val, + "stdev": self.stdev, + } + if self.extra_info: + extra_stats.update(self.extra_info) + + return { + "benchmark": { + "name": "ExecuTorch", + "mode": "inference", + "extra_info": base_extra_info.copy(), + }, + "model": { + "name": model_name, + "type": "OSS model", + "backend": backend, + }, + "metric": { + "name": self.name, + "benchmark_values": [self.mean], + "target_value": 0, + "extra_info": extra_stats, + }, + "runners": [{"name": runner_name, "type": runner_type}], + } + + def print_stats(self) -> None: + """Print formatted statistics for this metric.""" + # Determine precision based on metric type + is_throughput = "tokens" in self.name.lower() + precision = 2 if is_throughput else 0 + + # Format metric name for display + display_name = self.name.replace("_", " ").upper() + if self.unit: + display_name = f"{display_name} ({self.unit})" + + print(f"{display_name}:") + print(f" Min: {self.min_val:.{precision}f} {self.unit}") + print(f" Max: {self.max_val:.{precision}f} {self.unit}") + print(f" Mean: {self.mean:.{precision}f} {self.unit}") + print(f" Stdev: {self.stdev:.{precision}f} {self.unit}") + print() + + +@dataclass +class BenchmarkResults: + """Summary of benchmark results.""" + + model_name: str + total_runs: int + trimmed_runs: int + discarded_runs: int + generated_tokens: int + + # Metrics + throughput: MetricStats + model_load_time: MetricStats + total_inference_time: MetricStats + encoder_time: MetricStats + generation_time: MetricStats + first_token_latency: MetricStats + + def save_json(self, output_path: str) -> None: + """Save results to JSON file.""" + with open(output_path, "w") as f: + json.dump(self.to_dict(), f, indent=2) + print(f"\n✓ Results saved to: {output_path}") + + def to_dict(self) -> dict: + """Convert results to dictionary for JSON serialization.""" + return { + "model_name": self.model_name, + "total_runs": self.total_runs, + "trimmed_runs": self.trimmed_runs, + "discarded_runs": self.discarded_runs, + "generated_tokens": self.generated_tokens, + 
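+            # Flattened per-metric statistics follow: one mean/min/max/stdev
+            # block for each MetricStats field declared above.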
"throughput_mean": self.throughput.mean, + "throughput_min": self.throughput.min_val, + "throughput_max": self.throughput.max_val, + "throughput_stdev": self.throughput.stdev, + "model_load_time_mean": self.model_load_time.mean, + "model_load_time_min": self.model_load_time.min_val, + "model_load_time_max": self.model_load_time.max_val, + "model_load_time_stdev": self.model_load_time.stdev, + "total_inference_time_mean": self.total_inference_time.mean, + "total_inference_time_min": self.total_inference_time.min_val, + "total_inference_time_max": self.total_inference_time.max_val, + "total_inference_time_stdev": self.total_inference_time.stdev, + "encoder_time_mean": self.encoder_time.mean, + "encoder_time_min": self.encoder_time.min_val, + "encoder_time_max": self.encoder_time.max_val, + "encoder_time_stdev": self.encoder_time.stdev, + "generation_time_mean": self.generation_time.mean, + "generation_time_min": self.generation_time.min_val, + "generation_time_max": self.generation_time.max_val, + "generation_time_stdev": self.generation_time.stdev, + "first_token_latency_mean": self.first_token_latency.mean, + "first_token_latency_min": self.first_token_latency.min_val, + "first_token_latency_max": self.first_token_latency.max_val, + "first_token_latency_stdev": self.first_token_latency.stdev, + } + + def to_v3_format( + self, + model: str, + quantization: str, + git_sha: str, + workflow_run_id: str, + workflow_run_url: str = "", + gpu_name: str = "CUDA", + cuda_driver_version: str = "cuda", + ) -> List[dict]: + """ + Transform benchmark results to PyTorch benchmark database v3 format. + + Args: + model: Model name (e.g., "openai/whisper-small") + quantization: Quantization type (e.g., "non-quantized") + git_sha: Git commit SHA + workflow_run_id: GitHub workflow run ID + workflow_run_url: GitHub workflow run URL + gpu_name: GPU device name (e.g., "Tesla V100", "A100") + cuda_driver_version: CUDA driver version (e.g., "12.6", "535.104.05") + + Returns: + List of benchmark records in v3 format + """ + # Shared configuration + model_name_with_quant = f"{model}_{quantization}" + backend = "cuda-aoti" + runner_name = gpu_name + runner_type = cuda_driver_version + + # Create base extra_info + base_extra_info = { + "backend": "cuda", + "quantization": quantization, + "git_sha": git_sha, + "workflow_run_id": workflow_run_id, + } + if workflow_run_url: + base_extra_info["workflow_run_url"] = workflow_run_url + + # Create v3 records for all metrics + return [ + self.throughput.create_v3_record( + model_name_with_quant, + backend, + runner_name, + runner_type, + base_extra_info, + ), + self.model_load_time.create_v3_record( + model_name_with_quant, + backend, + runner_name, + runner_type, + base_extra_info, + ), + self.total_inference_time.create_v3_record( + model_name_with_quant, + backend, + runner_name, + runner_type, + base_extra_info, + ), + self.encoder_time.create_v3_record( + model_name_with_quant, + backend, + runner_name, + runner_type, + base_extra_info, + ), + self.generation_time.create_v3_record( + model_name_with_quant, + backend, + runner_name, + runner_type, + base_extra_info, + ), + self.first_token_latency.create_v3_record( + model_name_with_quant, + backend, + runner_name, + runner_type, + base_extra_info, + ), + ] + + +def compute_summary( + model_name: str, results: List[RunMetrics], requested_runs: int +) -> BenchmarkResults: + """ + Compute summary statistics using trimmed data. 
+ + All statistics (min, max, mean, stdev) are calculated based on + the trimmed dataset after removing outliers. + + Args: + model_name: Name of the model being benchmarked + results: List of all collected run metrics + requested_runs: Number of runs originally requested by user + + Returns: + BenchmarkResults object with all computed statistics + """ + if not results: + raise ValueError("No valid results to summarize.") + + # Calculate trim count based on requested runs (not actual runs) + trim_count = int(requested_runs * 0.1) + + # Helper to create MetricStats from values + def create_metric_stats( + name: str, values: List[float], unit: str = "", extra_info: dict | None = None + ) -> MetricStats: + _, min_val, max_val, mean_val, stdev_val = calculate_trimmed_stats( + values, trim_count + ) + return MetricStats( + name=name, + mean=mean_val, + min_val=min_val, + max_val=max_val, + stdev=stdev_val, + unit=unit, + extra_info=extra_info, + ) + + # Get the first trimmed result to get trimmed_runs count + trimmed_throughput, _, _, _, _ = calculate_trimmed_stats( + [r.tokens_per_sec for r in results], trim_count + ) + + return BenchmarkResults( + model_name=model_name, + total_runs=len(results), + trimmed_runs=len(trimmed_throughput), + discarded_runs=trim_count * 2, + generated_tokens=results[0].generated_tokens, + throughput=create_metric_stats( + "throughput(tokens/sec)", + [r.tokens_per_sec for r in results], + "t/s", + {"trimmed_runs": len(trimmed_throughput)}, + ), + model_load_time=create_metric_stats( + "model_load_time(ms)", + [r.model_load_time_ms for r in results], + "ms", + ), + total_inference_time=create_metric_stats( + "total_inference_time(ms)", + [r.total_inference_time_ms for r in results], + "ms", + ), + encoder_time=create_metric_stats( + "encoder_time(ms)", + [r.encoder_time_ms for r in results], + "ms", + ), + generation_time=create_metric_stats( + "generation_time(ms)", + [r.generation_time_ms for r in results], + "ms", + ), + first_token_latency=create_metric_stats( + "first_token_latency(ms)", + [r.first_token_latency_ms for r in results], + "ms", + ), + ) + + +def print_summary(summary: BenchmarkResults) -> None: + """Print formatted summary of benchmark results.""" + print("\n" + "=" * 70) + print(f"BENCHMARK SUMMARY for model: {summary.model_name}") + print("=" * 70) + print(f"Total runs collected: {summary.total_runs}") + print(f"Trimmed to: {summary.trimmed_runs} runs") + print( + f"(Discarded {summary.discarded_runs // 2} highest and " + f"{summary.discarded_runs // 2} lowest results)" + ) + print(f"Generated tokens per run: {summary.generated_tokens}") + print() + + # Print all metrics using their print_stats method + summary.throughput.print_stats() + summary.model_load_time.print_stats() + summary.total_inference_time.print_stats() + summary.encoder_time.print_stats() + summary.generation_time.print_stats() + summary.first_token_latency.print_stats() + + print("=" * 70) + + +def main(): + # Parse command-line arguments + parser = argparse.ArgumentParser( + description="Benchmark CUDA model runners and collect performance metrics" + ) + parser.add_argument( + "--runner_command", + type=str, + required=True, + help="Full command to run the model runner", + ) + parser.add_argument( + "--model_name", + type=str, + required=True, + help="Name of the model being benchmarked", + ) + parser.add_argument( + "--num_runs", + type=int, + default=50, + help="Number of benchmark runs (default: 50)", + ) + parser.add_argument( + "--warmup_runs", + type=int, + default=0, + 
help="Number of warmup runs before benchmark (default: 0.1 * num_runs)", + ) + parser.add_argument( + "--fix_gpu_clock", + type=bool, + default=True, + help="Fix GPU clock frequency to maximum before benchmarking", + ) + parser.add_argument( + "--gpu_clock", + type=int, + default=None, + help="Target GPU clock frequency in MHz (requires " + "--fix_gpu_clock). If not specified, uses max available.", + ) + parser.add_argument( + "--output_json", + type=str, + default=None, + help="Path to save JSON results", + ) + parser.add_argument( + "--output_v3", + type=str, + default=None, + help="Path to save v3 format JSON results for dashboard", + ) + parser.add_argument( + "--model", + type=str, + default=None, + help="Model ID (e.g., 'openai/whisper-small') - required for v3 format", + ) + parser.add_argument( + "--quantization", + type=str, + default=None, + help="Quantization type (e.g., 'non-quantized') - required for v3 format", + ) + parser.add_argument( + "--git_sha", + type=str, + default=None, + help="Git commit SHA - required for v3 format", + ) + parser.add_argument( + "--workflow_run_id", + type=str, + default=None, + help="GitHub workflow run ID - required for v3 format", + ) + parser.add_argument( + "--workflow_run_url", + type=str, + default="", + help="GitHub workflow run URL - optional for v3 format", + ) + parser.add_argument( + "--gpu_name", + type=str, + default=None, + help="GPU device name (e.g., 'Tesla V100', 'A100') - optional for v3 format", + ) + parser.add_argument( + "--cuda_driver_version", + type=str, + default=None, + help="CUDA driver version (e.g., '12.6', '535.104.05') - optional for v3 format", + ) + parser.add_argument("--verbose", action="store_true", help="Print verbose output") + + args = parser.parse_args() + + warmup_runs = ( + int(0.1 * args.num_runs) if args.warmup_runs == 0 else args.warmup_runs + ) + + print(f"Running benchmark for model: {args.model_name}") + print(f"Number of runs: {args.num_runs}") + if warmup_runs > 0: + print(f"Warmup runs: {warmup_runs}") + if args.fix_gpu_clock: + clock_str = f"{args.gpu_clock}" if args.gpu_clock else "max available" + print(f"GPU clock will be fixed to: {clock_str} MHz") + print(f"Command: {args.runner_command}\n") + + # Fix GPU clocks if requested + gpu_clock_fixed = False + if args.fix_gpu_clock: + # Get current clocks before fixing + initial_clocks = get_gpu_clocks() + if initial_clocks: + print( + f"Current GPU clocks - GPU: {initial_clocks[0]} MHz, " + f"Memory: {initial_clocks[1]} MHz" + ) + + gpu_clock_fixed = set_gpu_clocks(args.gpu_clock) + if not gpu_clock_fixed: + print( + "Warning: Failed to fix GPU clocks. 
" + "Continuing without fixed clocks...", + file=sys.stderr, + ) + + try: + # Run benchmark + results = run_model_benchmark( + command=args.runner_command, + num_runs=args.num_runs, + warmup_runs=warmup_runs, + verbose=args.verbose, + ) + + # Compute and print summary + summary = compute_summary(args.model_name, results, args.num_runs) + print_summary(summary) + + # Save JSON results if requested + if args.output_json: + summary.save_json(args.output_json) + + # Save v3 format if requested + if args.output_v3: + # Validate required parameters for v3 format + if not all( + [args.model, args.quantization, args.git_sha, args.workflow_run_id] + ): + print( + "Error: --output_v3 requires --model, --quantization, " + "--git_sha, and --workflow_run_id", + file=sys.stderr, + ) + sys.exit(1) + + v3_records = summary.to_v3_format( + model=args.model, + quantization=args.quantization, + git_sha=args.git_sha, + workflow_run_id=args.workflow_run_id, + workflow_run_url=args.workflow_run_url, + gpu_name=args.gpu_name if args.gpu_name else "UNKNOWN GPU", + cuda_driver_version=( + args.cuda_driver_version if args.cuda_driver_version else "cuda" + ), + ) + + with open(args.output_v3, "w") as f: + json.dump(v3_records, f, indent=2) + + print(f"✓ v3 format results saved to: {args.output_v3}") + print(f"✓ Generated {len(v3_records)} v3 records for dashboard upload") + + finally: + # Reset GPU clocks if they were fixed + if gpu_clock_fixed: + reset_gpu_clocks() + + +if __name__ == "__main__": + main() diff --git a/.github/scripts/trigger_cuda_perf.sh b/.github/scripts/trigger_cuda_perf.sh new file mode 100755 index 00000000000..402dd009673 --- /dev/null +++ b/.github/scripts/trigger_cuda_perf.sh @@ -0,0 +1,100 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Quick script to trigger cuda-perf workflow via GitHub CLI +# Usage: +# ./trigger_cuda_perf.sh # Use defaults (random model + quant) +# ./trigger_cuda_perf.sh --all # Run ALL models with ALL quantizations +# ./trigger_cuda_perf.sh "openai/whisper-medium" # Single model +# ./trigger_cuda_perf.sh "openai/whisper-small,google/gemma-3-4b-it" "non-quantized,quantized-int4-tile-packed" "100" + +set -e + +# All available models and quantizations +ALL_MODELS="mistralai/Voxtral-Mini-3B-2507,openai/whisper-small,openai/whisper-medium,openai/whisper-large-v3-turbo,google/gemma-3-4b-it" +ALL_QUANTIZATIONS="non-quantized,quantized-int4-tile-packed,quantized-int4-weight-only" + +# Check if gh CLI is installed +if ! command -v gh &> /dev/null; then + echo "Error: GitHub CLI (gh) is not installed." 
+ echo "Install it from: https://cli.github.com/" + echo "" + echo "Quick install:" + echo " macOS: brew install gh" + echo " Linux: See https://github.com/cli/cli/blob/trunk/docs/install_linux.md" + exit 1 +fi + +# Check for --all flag +RUN_ALL=false +if [ "${1:-}" = "--all" ] || [ "${1:-}" = "-a" ]; then + RUN_ALL=true + shift # Remove the flag from arguments +fi + +# Default parameters +if [ "$RUN_ALL" = true ]; then + MODELS="$ALL_MODELS" + QUANT="$ALL_QUANTIZATIONS" + NUM_RUNS="${1:-50}" + RANDOM_MODEL="false" + echo "=========================================" + echo "Triggering cuda-perf workflow" + echo "Mode: RUN ALL MODELS AND QUANTIZATIONS" + echo "=========================================" + echo "Models: ALL (5 models)" + echo "Quantizations: ALL (3 quantizations)" + echo "Total configs: 15 combinations" + echo "Num runs: $NUM_RUNS" + echo "=========================================" +else + MODELS="${1:-}" + QUANT="${2:-}" + NUM_RUNS="${3:-50}" + RANDOM_MODEL="${4:-false}" + + # Display configuration + echo "=========================================" + echo "Triggering cuda-perf workflow" + echo "=========================================" + if [ -z "$MODELS" ]; then + echo "Models: (random selection)" + else + echo "Models: $MODELS" + fi + if [ -z "$QUANT" ]; then + echo "Quantizations: (random selection)" + else + echo "Quantizations: $QUANT" + fi + echo "Num runs: $NUM_RUNS" + echo "Random model: $RANDOM_MODEL" + echo "=========================================" +fi + +echo "" + +# Trigger workflow +gh workflow run cuda-perf.yml \ + -R pytorch/executorch \ + -f models="$MODELS" \ + -f quantizations="$QUANT" \ + -f num_runs="$NUM_RUNS" \ + -f random_model="$RANDOM_MODEL" + +if [ $? -eq 0 ]; then + echo "✓ Workflow triggered successfully!" 
+ echo "" + echo "View status:" + echo " gh run list --workflow=cuda-perf.yml" + echo "" + echo "Watch the latest run:" + echo " gh run watch \$(gh run list --workflow=cuda-perf.yml --limit 1 --json databaseId --jq '.[0].databaseId')" +else + echo "✗ Failed to trigger workflow" + exit 1 +fi diff --git a/.github/workflows/cuda-perf.yml b/.github/workflows/cuda-perf.yml new file mode 100644 index 00000000000..13efd9eff84 --- /dev/null +++ b/.github/workflows/cuda-perf.yml @@ -0,0 +1,420 @@ +name: cuda-perf + +on: + schedule: + - cron: 0 8 * * * # 1am PST (8am UTC) + pull_request: + paths: + - .github/workflows/cuda-perf.yml + - .ci/scripts/cuda_benchmark.py + - .ci/scripts/export_model_artifact.sh + - .ci/scripts/test_model_e2e.sh + push: + branches: + - main + paths: + - .github/workflows/cuda-perf.yml + - .ci/scripts/cuda_benchmark.py + - .ci/scripts/export_model_artifact.sh + - .ci/scripts/test_model_e2e.sh + workflow_dispatch: + inputs: + models: + description: Models to be benchmarked (comma-separated HuggingFace model IDs) + required: false + type: string + default: openai/whisper-small + quantizations: + description: Quantization types (comma-separated) + required: false + type: string + default: non-quantized + num_runs: + description: Number of benchmark runs per model + required: false + type: string + default: "50" + random_model: + description: Run a random model instead of all models + required: false + type: boolean + default: false + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +jobs: + set-parameters: + runs-on: ubuntu-22.04 + outputs: + benchmark_configs: ${{ steps.set-parameters.outputs.benchmark_configs }} + steps: + - uses: actions/checkout@v3 + with: + submodules: 'false' + - uses: actions/setup-python@v4 + with: + python-version: '3.10' + - name: Set parameters + id: set-parameters + shell: bash + env: + # All available models and quantizations + ALL_MODELS: 'mistralai/Voxtral-Mini-3B-2507,openai/whisper-small,openai/whisper-medium,openai/whisper-large-v3-turbo,google/gemma-3-4b-it' + ALL_QUANTIZATIONS: 'non-quantized,quantized-int4-tile-packed,quantized-int4-weight-only' + NUM_RUNS: ${{ inputs.num_runs || '50' }} + RANDOM_MODEL: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' && 'true' || inputs.random_model || 'false' }} + run: | + set -eux + + MODELS="${{ inputs.models }}" + QUANTIZATIONS="${{ inputs.quantizations }}" + + # For non-schedule events (PR, manual trigger without inputs), randomly select one model and one quantization + if [ -z "$MODELS" ] && [ "${{ github.event_name }}" != "schedule" ]; then + # Split all models into array + IFS=',' read -ra ALL_MODEL_ARRAY <<< "$ALL_MODELS" + # Randomly select one model + RANDOM_MODEL_INDEX=$((RANDOM % ${#ALL_MODEL_ARRAY[@]})) + MODELS="${ALL_MODEL_ARRAY[$RANDOM_MODEL_INDEX]}" + echo "Randomly selected model for PR/push: $MODELS" + elif [ -z "$MODELS" ]; then + # Schedule event: use all models + MODELS="$ALL_MODELS" + fi + + if [ -z "$QUANTIZATIONS" ] && [ "${{ github.event_name }}" != "schedule" ]; then + # Split all quantizations into array + IFS=',' read -ra ALL_QUANT_ARRAY <<< "$ALL_QUANTIZATIONS" + # Randomly select one quantization + RANDOM_QUANT_INDEX=$((RANDOM % ${#ALL_QUANT_ARRAY[@]})) + QUANTIZATIONS="${ALL_QUANT_ARRAY[$RANDOM_QUANT_INDEX]}" + echo "Randomly selected 
quantization for PR/push: $QUANTIZATIONS" + elif [ -z "$QUANTIZATIONS" ]; then + # Schedule event: use all quantizations + QUANTIZATIONS="$ALL_QUANTIZATIONS" + fi + + # Split models and quantizations into arrays + IFS=',' read -ra MODEL_ARRAY <<< "$MODELS" + IFS=',' read -ra QUANT_ARRAY <<< "$QUANTIZATIONS" + + # If random model is requested (for main branch push), select one random model from the already selected models + if [ "$RANDOM_MODEL" = "true" ] && [ ${#MODEL_ARRAY[@]} -gt 1 ]; then + RANDOM_INDEX=$((RANDOM % ${#MODEL_ARRAY[@]})) + MODELS="${MODEL_ARRAY[$RANDOM_INDEX]}" + MODEL_ARRAY=("$MODELS") + echo "Random model selected for main branch push: $MODELS" + fi + + # Generate benchmark configs + CONFIGS='{"include":[' + FIRST=true + for MODEL in "${MODEL_ARRAY[@]}"; do + for QUANT in "${QUANT_ARRAY[@]}"; do + if [ "$FIRST" = true ]; then + FIRST=false + else + CONFIGS+=',' + fi + # Sanitize model name for use in artifact paths + MODEL_SAFE=$(echo "$MODEL" | sed 's/\//_/g') + CONFIGS+="{\"model\":\"$MODEL\",\"quant\":\"$QUANT\",\"model_safe\":\"$MODEL_SAFE\",\"num_runs\":\"$NUM_RUNS\"}" + done + done + CONFIGS+=']}' + + echo "benchmark_configs=$CONFIGS" >> $GITHUB_OUTPUT + echo "Generated benchmark configs:" + echo "$CONFIGS" | python -m json.tool + + export-models: + name: export-models + needs: set-parameters + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + secrets: inherit + strategy: + matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_configs) }} + fail-fast: false + with: + timeout: 90 + secrets-env: EXECUTORCH_HF_TOKEN + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "12.6" + use-custom-docker-registry: false + submodules: recursive + upload-artifact: model-${{ matrix.model_safe }}-${{ matrix.quant }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + script: | + set -eux + echo "::group::Setup ExecuTorch" + ./install_executorch.sh + echo "::endgroup::" + + echo "::group::Setup Huggingface" + pip install -U "huggingface_hub[cli]<1.0" accelerate + huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN + OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt) + pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION} + echo "::endgroup::" + + echo "::group::Exporting model ${{ matrix.model }} with quantization ${{ matrix.quant }}" + OUTPUT_DIR="model_artifacts" + mkdir -p "$OUTPUT_DIR" + + bash .ci/scripts/export_model_artifact.sh cuda "${{ matrix.model }}" "${{ matrix.quant }}" "$OUTPUT_DIR" + + # Move artifacts to RUNNER_ARTIFACT_DIR for upload + mv "$OUTPUT_DIR"/* "${RUNNER_ARTIFACT_DIR}/" + ls -lah "${RUNNER_ARTIFACT_DIR}" + echo "::endgroup::" + + benchmark-cuda: + name: benchmark-cuda + needs: + - set-parameters + - export-models + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + strategy: + matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_configs) }} + fail-fast: false + with: + timeout: 90 + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "12.6" + use-custom-docker-registry: false + submodules: recursive + download-artifact: model-${{ matrix.model_safe }}-${{ matrix.quant }} + upload-artifact: results-${{ matrix.model_safe }}-${{ matrix.quant }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + 
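+      # For pull_request events the PR head commit is checked out; otherwise the pushed SHA is used.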
script: | + set -eux + echo "::group::Setup environment" + ./install_requirements.sh + pip list + echo "::endgroup::" + + echo "::group::Prepare model artifacts" + mkdir -p model_artifacts + cp "${RUNNER_ARTIFACT_DIR}/model.pte" model_artifacts/model.pte + cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" model_artifacts/aoti_cuda_blob.ptd + + # Copy additional files if they exist + if [ -f "${RUNNER_ARTIFACT_DIR}/voxtral_preprocessor.pte" ]; then + cp "${RUNNER_ARTIFACT_DIR}/voxtral_preprocessor.pte" model_artifacts/ + fi + if [ -f "${RUNNER_ARTIFACT_DIR}/whisper_preprocessor.pte" ]; then + cp "${RUNNER_ARTIFACT_DIR}/whisper_preprocessor.pte" model_artifacts/ + fi + if [ -f "${RUNNER_ARTIFACT_DIR}/tekken.json" ]; then + cp "${RUNNER_ARTIFACT_DIR}/tekken.json" model_artifacts/ + fi + if [ -f "${RUNNER_ARTIFACT_DIR}/poem.wav" ]; then + cp "${RUNNER_ARTIFACT_DIR}/poem.wav" model_artifacts/ + fi + if [ -f "${RUNNER_ARTIFACT_DIR}/output.wav" ]; then + cp "${RUNNER_ARTIFACT_DIR}/output.wav" model_artifacts/ + fi + # Copy tokenizer files + for file in tokenizer.json tokenizer_config.json special_tokens_map.json; do + if [ -f "${RUNNER_ARTIFACT_DIR}/$file" ]; then + cp "${RUNNER_ARTIFACT_DIR}/$file" model_artifacts/ + fi + done + + ls -lah model_artifacts/ + echo "::endgroup::" + + echo "::group::Build runner" + bash .ci/scripts/test_model_e2e.sh cuda "${{ matrix.model }}" "${{ matrix.quant }}" model_artifacts + echo "::endgroup::" + + echo "::group::Running benchmark for ${{ matrix.model }} (${{ matrix.quant }}) with ${{ matrix.num_runs }} runs" + export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH + + # Get GPU name using nvidia-smi + GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader | head -1) + echo "Detected GPU: $GPU_NAME" + + # Get CUDA driver version + CUDA_DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -1) + echo "CUDA Driver Version: $CUDA_DRIVER_VERSION" + + # Create results directory + RESULTS_DIR="${RUNNER_ARTIFACT_DIR}" + mkdir -p "$RESULTS_DIR" + + # Determine model name and runner command based on model + case "${{ matrix.model }}" in + mistralai/Voxtral-Mini-3B-2507) + RUNNER="cmake-out/examples/models/voxtral/voxtral_runner" + PREPROCESSOR="model_artifacts/voxtral_preprocessor.pte" + TOKENIZER="model_artifacts/tekken.json" + AUDIO="model_artifacts/poem.wav" + RUNNER_CMD="$RUNNER --model_path model_artifacts/model.pte --data_path model_artifacts/aoti_cuda_blob.ptd --tokenizer_path $TOKENIZER --audio_path $AUDIO --processor_path $PREPROCESSOR --temperature 0" + MODEL_NAME="voxtral_${{ matrix.quant }}" + ;; + openai/whisper-*) + RUNNER="cmake-out/examples/models/whisper/whisper_runner" + PREPROCESSOR="model_artifacts/whisper_preprocessor.pte" + AUDIO="model_artifacts/output.wav" + RUNNER_CMD="$RUNNER --model_path model_artifacts/model.pte --data_path model_artifacts/aoti_cuda_blob.ptd --tokenizer_path model_artifacts/ --audio_path $AUDIO --processor_path $PREPROCESSOR --temperature 0" + MODEL_NAME=$(echo "${{ matrix.model }}" | sed 's/openai\///')_${{ matrix.quant }} + ;; + google/gemma-3-4b-it) + RUNNER="cmake-out/examples/models/gemma3/gemma3_e2e_runner" + IMAGE="docs/source/_static/img/et-logo.png" + RUNNER_CMD="$RUNNER --model_path model_artifacts/model.pte --data_path model_artifacts/aoti_cuda_blob.ptd --tokenizer_path model_artifacts/ --image_path $IMAGE --temperature 0" + MODEL_NAME="gemma3_${{ matrix.quant }}" + ;; + *) + echo "Error: Unsupported model '${{ matrix.model }}'" + exit 1 + ;; + esac + + # Run benchmark using 
cuda_benchmark.py + python .ci/scripts/cuda_benchmark.py \ + --runner_command "$RUNNER_CMD" \ + --model_name "$MODEL_NAME" \ + --num_runs "${{ matrix.num_runs }}" \ + --output_json "$RESULTS_DIR/benchmark_results.json" \ + --output_v3 "$RESULTS_DIR/benchmark_results_v3.json" \ + --model "${{ matrix.model }}" \ + --quantization "${{ matrix.quant }}" \ + --git_sha "${{ github.sha }}" \ + --workflow_run_id "${{ github.run_id }}" \ + --workflow_run_url "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" \ + --gpu_name "$GPU_NAME" \ + --cuda_driver_version "$CUDA_DRIVER_VERSION" + + # Save additional metadata + cat > "$RESULTS_DIR/metadata.json" < -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -namespace { - -using executorch::aten::ScalarType; -using executorch::aten::Tensor; -using executorch::extension::make_tensor_ptr; -using executorch::extension::TensorPtr; -using executorch::extension::module::Module; -using executorch::runtime::Error; -using executorch::runtime::EValue; -using executorch::runtime::Result; -using Clock = std::chrono::steady_clock; -using executorch::aten::TensorShapeDynamism; -using DurationMs = std::chrono::duration; - -enum class ModelType { GEMMA3, VOXTRAL, UNKNOWN }; - -struct ModelConfig { - std::string name; - size_t token_seq_len; - size_t text_embed_dim; - std::vector expected_methods; -}; - -const std::map model_configs = { - {ModelType::GEMMA3, - {"gemma3", - 128, - 2304, - {"vision_encoder", "token_embedding", "text_decoder"}}}, - {ModelType::VOXTRAL, - {"voxtral", - 1138, - 3072, - {"audio_encoder", "token_embedding", "text_decoder"}}}}; - -ModelType parse_model_type(const std::string& model_name) { - std::string lower_name = model_name; - std::transform( - lower_name.begin(), - lower_name.end(), - lower_name.begin(), - [](unsigned char c) { return std::tolower(c); }); - - if (lower_name.find("gemma3") != std::string::npos || - lower_name.find("gemma-3") != std::string::npos) { - return ModelType::GEMMA3; - } else if (lower_name.find("voxtral") != std::string::npos) { - return ModelType::VOXTRAL; - } - return ModelType::UNKNOWN; -} - -std::vector to_sizes( - std::initializer_list dims) { - return std::vector(dims.begin(), dims.end()); -} - -std::string format_shape(const Tensor& tensor) { - std::ostringstream oss; - oss << "["; - const auto& sizes = tensor.sizes(); - for (size_t i = 0; i < sizes.size(); ++i) { - if (i > 0) { - oss << ", "; - } - oss << sizes[i]; - } - oss << "]"; - return oss.str(); -} - -void print_tensor_summary(const std::string& label, const Tensor& tensor) { - std::cout << " " << label - << ": dtype=" << executorch::runtime::toString(tensor.scalar_type()) - << ", shape=" << format_shape(tensor) - << ", numel=" << tensor.numel() << std::endl; -} - -void dump_tensor_to_file(const std::string& filename, const Tensor& tensor) { - std::ofstream file(filename, std::ios::binary); - if (!file.is_open()) { - std::cerr << "Failed to open file for writing: " << filename << std::endl; - return; - } - - int32_t dtype = static_cast(tensor.scalar_type()); - file.write(reinterpret_cast(&dtype), sizeof(int32_t)); - - int32_t ndim = static_cast(tensor.sizes().size()); - file.write(reinterpret_cast(&ndim), sizeof(int32_t)); - - for (size_t i = 0; i < tensor.sizes().size(); ++i) { - int64_t dim_size = tensor.sizes()[i]; - file.write(reinterpret_cast(&dim_size), sizeof(int64_t)); - } - - const void* data_ptr = 
tensor.const_data_ptr(); - size_t element_size = 0; - - switch (tensor.scalar_type()) { - case ScalarType::Float: - element_size = sizeof(float); - break; - case ScalarType::BFloat16: - element_size = 2; - break; - case ScalarType::Half: - element_size = 2; - break; - case ScalarType::Long: - element_size = sizeof(int64_t); - break; - case ScalarType::Int: - element_size = sizeof(int32_t); - break; - default: - std::cerr << "Unsupported dtype for dumping: " - << executorch::runtime::toString(tensor.scalar_type()) - << std::endl; - return; - } - - size_t data_size = tensor.numel() * element_size; - file.write(reinterpret_cast(data_ptr), data_size); - file.close(); - - std::cout << "Dumped tensor to: " << filename << std::endl; -} - -TensorPtr create_vision_input() { - const auto sizes = to_sizes({1, 3, 896, 896}); - const size_t numel = 1ull * 3ull * 896ull * 896ull; - std::vector data(numel); - for (size_t i = 0; i < numel; ++i) { - data[i] = static_cast((i % 255) / 255.0); - } - return make_tensor_ptr( - sizes, - std::move(data), - {}, - {}, - ScalarType::BFloat16, - TensorShapeDynamism::DYNAMIC_UNBOUND); -} - -TensorPtr create_audio_input() { - const auto sizes = to_sizes({3, 128, 3000}); - const size_t numel = 3ull * 128ull * 3000ull; - std::vector data(numel, 0.5f); - return make_tensor_ptr( - sizes, std::move(data), {}, {}, ScalarType::BFloat16); -} - -TensorPtr create_token_ids_input(const ModelConfig& config) { - const auto sizes = to_sizes({1, static_cast(config.token_seq_len)}); - std::vector data(config.token_seq_len); - for (size_t i = 0; i < config.token_seq_len; ++i) { - data[i] = static_cast(i + 1); - } - return make_tensor_ptr(sizes, std::move(data)); -} - -TensorPtr create_positions_input(const ModelConfig& config) { - const auto sizes = to_sizes({static_cast(config.token_seq_len)}); - std::vector data(config.token_seq_len); - for (size_t i = 0; i < config.token_seq_len; ++i) { - data[i] = static_cast(i); - } - return make_tensor_ptr(sizes, std::move(data)); -} - -TensorPtr create_fallback_text_embedding(const ModelConfig& config) { - const auto sizes = to_sizes( - {1, - static_cast(config.token_seq_len), - static_cast(config.text_embed_dim)}); - const size_t numel = 1ull * config.token_seq_len * config.text_embed_dim; - std::vector data(numel, 0.0f); - return make_tensor_ptr( - sizes, std::move(data), {}, {}, ScalarType::BFloat16); -} - -struct MethodTiming { - double load_ms{0.0}; - double run_ms{0.0}; -}; - -enum class MethodCategory { ENCODER, TOKEN_EMBEDDING, TEXT_DECODER, UNKNOWN }; - -MethodCategory categorize_method(const std::string& method_name) { - std::string lower_name = method_name; - std::transform( - lower_name.begin(), - lower_name.end(), - lower_name.begin(), - [](unsigned char c) { return std::tolower(c); }); - - if (lower_name.find("vision") != std::string::npos || - lower_name.find("audio") != std::string::npos || - lower_name.find("encoder") != std::string::npos) { - return MethodCategory::ENCODER; - } else if ( - lower_name.find("token") != std::string::npos && - lower_name.find("embedding") != std::string::npos) { - return MethodCategory::TOKEN_EMBEDDING; - } else if ( - lower_name.find("text") != std::string::npos && - lower_name.find("decoder") != std::string::npos) { - return MethodCategory::TEXT_DECODER; - } - return MethodCategory::UNKNOWN; -} - -std::vector create_inputs_for_method( - const std::string& method_name, - MethodCategory category, - ModelType model_type, - const ModelConfig& config, - const EValue* token_output, - std::vector& 
owned_inputs) { - std::vector inputs; - - switch (category) { - case MethodCategory::ENCODER: { - if (method_name.find("vision") != std::string::npos) { - auto input = create_vision_input(); - owned_inputs.emplace_back(input); - inputs.emplace_back(*input); - } else if (method_name.find("audio") != std::string::npos) { - auto input = create_audio_input(); - owned_inputs.emplace_back(input); - inputs.emplace_back(*input); - } - break; - } - - case MethodCategory::TOKEN_EMBEDDING: { - auto token_ids = create_token_ids_input(config); - owned_inputs.emplace_back(token_ids); - inputs.emplace_back(*token_ids); - break; - } - - case MethodCategory::TEXT_DECODER: { - if (token_output && token_output->isTensor()) { - inputs.emplace_back(*token_output); - } else { - auto fallback_embedding = create_fallback_text_embedding(config); - owned_inputs.emplace_back(fallback_embedding); - inputs.emplace_back(*fallback_embedding); - } - - auto positions = create_positions_input(config); - owned_inputs.emplace_back(positions); - inputs.emplace_back(*positions); - break; - } - - default: - break; - } - - return inputs; -} - -Error execute_method( - Module& module, - const std::string& method_name, - MethodCategory category, - ModelType model_type, - const ModelConfig& config, - const EValue* token_output, - MethodTiming& timing, - EValue* output_storage = nullptr) { - ET_LOG(Info, "Loading %s...", method_name.c_str()); - - const auto load_start = Clock::now(); - const Error load_err = module.load_method(method_name); - const auto load_end = Clock::now(); - if (load_err != Error::Ok) { - std::cerr << "Failed to load method " << method_name << ": error code " - << static_cast(load_err) << std::endl; - return load_err; - } - timing.load_ms = DurationMs(load_end - load_start).count(); - - std::vector owned_inputs; - std::vector inputs = create_inputs_for_method( - method_name, category, model_type, config, token_output, owned_inputs); - - const auto run_start = Clock::now(); - ET_LOG(Info, "%s running", method_name.c_str()); - Result> output_result = - module.execute(method_name, inputs); - ET_LOG(Info, "%s done", method_name.c_str()); - const auto run_end = Clock::now(); - timing.run_ms = DurationMs(run_end - run_start).count(); - - if (output_result.error() != Error::Ok) { - std::cerr << method_name << " execution failed: error code " - << static_cast(output_result.error()) << std::endl; - return output_result.error(); - } - - const auto& outputs = output_result.get(); - if (!outputs.empty() && outputs[0].isTensor()) { - print_tensor_summary(method_name + " output", outputs[0].toTensor()); - - if (category == MethodCategory::ENCODER || - category == MethodCategory::TOKEN_EMBEDDING) { - dump_tensor_to_file(method_name + "_output.bin", outputs[0].toTensor()); - } - - if (output_storage) { - *output_storage = outputs[0]; - } - } - - return Error::Ok; -} - -} // namespace - -int main(int argc, char** argv) { - if (argc != 4) { - std::cerr - << "Usage: " << argv[0] - << " " - << std::endl; - std::cerr << " model_name: gemma3 or voxtral" << std::endl; - return 1; - } - - const std::string model_name = argv[1]; - const std::string program_path = argv[2]; - const std::string data_map_path = argv[3]; - - const ModelType model_type = parse_model_type(model_name); - if (model_type == ModelType::UNKNOWN) { - std::cerr << "Unknown model type: " << model_name << std::endl; - std::cerr << "Supported models: gemma3, voxtral" << std::endl; - return 1; - } - - const ModelConfig& config = model_configs.at(model_type); - std::cout 
<< "Running benchmark for model: " << config.name << std::endl; - - try { - Module module(program_path, data_map_path); - - const auto program_load_start = Clock::now(); - const Error program_load_error = module.load(); - const auto program_load_end = Clock::now(); - if (program_load_error != Error::Ok) { - std::cerr << "Failed to load ExecuTorch program: error code " - << static_cast(program_load_error) << std::endl; - return 1; - } - const DurationMs program_load_latency = - program_load_end - program_load_start; - - auto method_names_result = module.method_names(); - if (method_names_result.error() != Error::Ok) { - std::cerr << "Failed to get method names: error code " - << static_cast(method_names_result.error()) << std::endl; - return 1; - } - - const auto& available_methods = method_names_result.get(); - - std::cout << "Checking for expected methods..." << std::endl; - std::vector missing_methods; - for (const auto& expected : config.expected_methods) { - if (available_methods.find(expected) == available_methods.end()) { - missing_methods.push_back(expected); - } else { - std::cout << " ✓ " << expected << std::endl; - } - } - - if (!missing_methods.empty()) { - std::cerr << "\nError: Missing expected methods:" << std::endl; - for (const auto& missing : missing_methods) { - std::cerr << " ✗ " << missing << std::endl; - } - return 1; - } - - std::map timings; - EValue token_output; - bool token_executed = false; - - for (const auto& method_name : config.expected_methods) { - MethodCategory category = categorize_method(method_name); - MethodTiming timing; - - const EValue* input_token_ptr = - (category == MethodCategory::TEXT_DECODER && token_executed) - ? &token_output - : nullptr; - - EValue* output_storage = (category == MethodCategory::TOKEN_EMBEDDING) - ? &token_output - : nullptr; - - Error err = execute_method( - module, - method_name, - category, - model_type, - config, - input_token_ptr, - timing, - output_storage); - - if (err != Error::Ok) { - return 1; - } - - if (category == MethodCategory::TOKEN_EMBEDDING) { - token_executed = true; - } - - timings[method_name] = timing; - } - - std::cout << std::fixed << std::setprecision(3); - std::cout << "\n=== Benchmark Results ===" << std::endl; - std::cout << "Program load latency (ms): " << program_load_latency.count() - << std::endl; - - std::cout << "\nMethod load latency (ms):" << std::endl; - for (const auto& [name, timing] : timings) { - std::cout << " " << name << ": " << timing.load_ms << std::endl; - } - - std::cout << "\nRun latency (ms):" << std::endl; - for (const auto& [name, timing] : timings) { - std::cout << " " << name << ": " << timing.run_ms << std::endl; - } - - return 0; - } catch (const std::exception& ex) { - std::cerr << "Unhandled exception: " << ex.what() << std::endl; - return 1; - } -}