From 0adcc00a08f67b251cf0d9b0a8cc2b0a4fac7cd6 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 14 Mar 2026 13:09:46 -0700 Subject: [PATCH 1/5] Add lm-eval benchmark runner for InferenceX evals Adds support for running lm-eval accuracy evaluations as a post-benchmark step, leveraging the InferenceX benchmark_lib.sh harness. - New LMEvalRunner registered as "lm-eval" benchmark type - bench.sh script sources benchmark_lib.sh and calls run_eval/append_lm_eval_summary - Post-benchmark eval hook in SweepOrchestrator.run() triggered by RUN_EVAL=true - Auto-mount INFMAX_WORKSPACE into container when env var is set Co-Authored-By: Claude Opus 4.6 --- docs/accuracy.md | 61 ++++++++++++++- src/srtctl/benchmarks/__init__.py | 3 +- src/srtctl/benchmarks/lm_eval.py | 58 ++++++++++++++ .../benchmarks/scripts/lm-eval/bench.sh | 76 +++++++++++++++++++ src/srtctl/cli/do_sweep.py | 66 ++++++++++++++++ src/srtctl/core/runtime.py | 8 ++ 6 files changed, 270 insertions(+), 2 deletions(-) create mode 100644 src/srtctl/benchmarks/lm_eval.py create mode 100755 src/srtctl/benchmarks/scripts/lm-eval/bench.sh diff --git a/docs/accuracy.md b/docs/accuracy.md index f5588c9fe..768c6cfd4 100644 --- a/docs/accuracy.md +++ b/docs/accuracy.md @@ -1,6 +1,6 @@ # Accuracy Benchmarks -In srt-slurm, users can run different accuracy benchmarks by setting the benchmark section in the config yaml file. Supported benchmarks include `mmlu`, `gpqa` and `longbenchv2`. +In srt-slurm, users can run different accuracy benchmarks by setting the benchmark section in the config yaml file. Supported benchmarks include `mmlu`, `gpqa`, `longbenchv2`, and `lm-eval`. 
## Table of Contents @@ -14,6 +14,7 @@ In srt-slurm, users can run different accuracy benchmarks by setting the benchma - [Example: Quick Validation](#example-quick-validation) - [Output](#output) - [Important Notes](#important-notes) +- [lm-eval (InferenceX)](#lm-eval-inferencex) --- @@ -191,3 +192,61 @@ The output includes per-category scores and aggregate metrics: 4. **Categories**: Running specific categories is useful for targeted validation (e.g., just testing summarization capabilities) +## lm-eval (InferenceX) + +The `lm-eval` benchmark runner integrates [EleutherAI/lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) via InferenceX's `benchmark_lib.sh`. Unlike the built-in benchmarks above, this runner sources evaluation logic from an external InferenceX workspace mounted at `/infmax-workspace`. + +This is used by InferenceX CI to run graded QnA (gsm8k, gpqa) against multi-node deployments on GB200/GB300. + +### How it works + +1. The runner script (`benchmarks/scripts/lm-eval/bench.sh`) auto-discovers the served model name from `/v1/models` +2. Sources `benchmark_lib.sh` from the InferenceX workspace +3. Runs `run_eval` and `append_lm_eval_summary` from benchmark_lib +4. Copies eval artifacts (`meta_env.json`, `results*.json`, `sample*.jsonl`) to `/logs/eval_results/` + +### EVAL_ONLY mode + +srt-slurm supports an `EVAL_ONLY` mode that skips the throughput benchmark entirely and runs only the lm-eval evaluation. 
This is controlled via environment variables: + +| Env var | Description | +|---------|-------------| +| `EVAL_ONLY` | Set to `true` to skip the throughput benchmark stage and run eval only | +| `RUN_EVAL` | Set to `true` to run eval after the throughput benchmark completes | +| `EVAL_CONC` | Concurrent requests for lm-eval (defaults to 256) | + +When `EVAL_ONLY=true`: +- **Stage 4 (Benchmark)** is skipped entirely — no throughput test runs +- **Health check** uses the full `wait_for_model()` check (polls for all prefill/decode workers to be ready) since the benchmark stage's health check was skipped +- **Stage 5 (Eval)** runs `_run_post_eval()` which launches the lm-eval benchmark runner +- Eval failure is **fatal** (non-zero exit) since eval is the only purpose of the job + +When `RUN_EVAL=true` (without `EVAL_ONLY`): +- Throughput benchmark runs normally +- After benchmark completes successfully, eval runs as a post-step +- Eval failure is **non-fatal** — the job still succeeds if throughput passed + +### Environment variables + +The following env vars are passed through to the lm-eval runner container: + +`FRAMEWORK`, `PRECISION`, `MODEL_PREFIX`, `RUNNER_TYPE`, `RESULT_FILENAME`, `SPEC_DECODING`, `ISL`, `OSL`, `PREFILL_TP`, `PREFILL_EP`, `PREFILL_DP_ATTN`, `DECODE_TP`, `DECODE_EP`, `DECODE_DP_ATTN`, `MODEL_NAME`, `EVAL_CONC`, `EVAL_ONLY`, `RUN_EVAL` + +### Concurrency + +Eval concurrency is set via the `EVAL_CONCURRENT_REQUESTS` environment variable (read by `benchmark_lib.sh`). The runner script sets this from `EVAL_CONC`: + +```bash +export EVAL_CONCURRENT_REQUESTS="${EVAL_CONC:-256}" +``` + +The InferenceX workflow derives `EVAL_CONC` from the highest value in the benchmark concurrency list. + +### Output + +Eval artifacts are written to `/logs/eval_results/` inside the container: +- `meta_env.json` — metadata (TP, conc, framework, precision, etc.) 
+- `results*.json` — lm-eval scores per task +- `sample*.jsonl` — per-sample outputs + +These are collected by the InferenceX launch scripts (`launch_gb200-nv.sh`, `launch_gb300-nv.sh`) and uploaded as workflow artifacts. diff --git a/src/srtctl/benchmarks/__init__.py b/src/srtctl/benchmarks/__init__.py index 84806b06b..b0c4ff7f1 100644 --- a/src/srtctl/benchmarks/__init__.py +++ b/src/srtctl/benchmarks/__init__.py @@ -4,7 +4,7 @@ """Benchmark runners for srtctl.""" # Import runners to trigger registration -from srtctl.benchmarks import gpqa, longbenchv2, mmlu, mooncake_router, profiling, router, sa_bench +from srtctl.benchmarks import gpqa, lm_eval, longbenchv2, mmlu, mooncake_router, profiling, router, sa_bench from srtctl.benchmarks.base import ( BenchmarkRunner, get_runner, @@ -18,6 +18,7 @@ "list_benchmarks", "register_benchmark", # Runners + "lm_eval", "sa_bench", "mmlu", "gpqa", diff --git a/src/srtctl/benchmarks/lm_eval.py b/src/srtctl/benchmarks/lm_eval.py new file mode 100644 index 000000000..b6fae05f0 --- /dev/null +++ b/src/srtctl/benchmarks/lm_eval.py @@ -0,0 +1,58 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""lm-eval benchmark runner for InferenceX evals.""" + +from __future__ import annotations + +import os +from typing import TYPE_CHECKING + +from srtctl.benchmarks.base import SCRIPTS_DIR, BenchmarkRunner, register_benchmark + +if TYPE_CHECKING: + from srtctl.core.runtime import RuntimeContext + from srtctl.core.schema import SrtConfig + + +@register_benchmark("lm-eval") +class LMEvalRunner(BenchmarkRunner): + """lm-eval accuracy evaluation using InferenceX benchmark_lib. + + Runs lm-eval via the InferenceX benchmark_lib.sh harness, + which handles task selection, result collection, and summary generation. 
+ """ + + @property + def name(self) -> str: + return "lm-eval" + + @property + def script_path(self) -> str: + return "/srtctl-benchmarks/lm-eval/bench.sh" + + @property + def local_script_dir(self) -> str: + return str(SCRIPTS_DIR / "lm-eval") + + def validate_config(self, config: SrtConfig) -> list[str]: + # lm-eval has sensible defaults + return [] + + def build_command( + self, + config: SrtConfig, + runtime: RuntimeContext, + ) -> list[str]: + endpoint = f"http://localhost:{runtime.frontend_port}" + # Always use the container mount path, not the host path. + # INFMAX_WORKSPACE env var contains the host path (used for mount setup + # in runtime.py), but inside the container it's at /infmax-workspace. + infmax_workspace = "/infmax-workspace" + + return [ + "bash", + self.script_path, + endpoint, + infmax_workspace, + ] diff --git a/src/srtctl/benchmarks/scripts/lm-eval/bench.sh b/src/srtctl/benchmarks/scripts/lm-eval/bench.sh new file mode 100755 index 000000000..bc251d559 --- /dev/null +++ b/src/srtctl/benchmarks/scripts/lm-eval/bench.sh @@ -0,0 +1,76 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# lm-eval accuracy evaluation using InferenceX benchmark_lib +# Expects: endpoint [infmax_workspace] + +set -e + +ENDPOINT=$1 +INFMAX_WORKSPACE=${2:-/infmax-workspace} + +# Extract HOST and PORT from endpoint (e.g., http://localhost:8000) +HOST=$(echo "$ENDPOINT" | sed -E 's|https?://||; s|:.*||') +PORT=$(echo "$ENDPOINT" | sed -E 's|.*:([0-9]+).*|\1|') + +echo "lm-eval Config: endpoint=${ENDPOINT}; host=${HOST}; port=${PORT}; workspace=${INFMAX_WORKSPACE}" + +# Auto-discover the served model name from /v1/models if MODEL_NAME is not set. +# This ensures we use the exact name the server recognizes, regardless of what +# $MODEL (the HuggingFace ID from the workflow) is set to. 
+if [[ -z "${MODEL_NAME:-}" ]]; then + DISCOVERED_MODEL=$(curl -sf "${ENDPOINT}/v1/models" 2>/dev/null \ + | python3 -c "import sys,json; d=json.load(sys.stdin); print(d['data'][0]['id'])" 2>/dev/null || true) + if [[ -n "$DISCOVERED_MODEL" ]]; then + export MODEL_NAME="$DISCOVERED_MODEL" + echo "Auto-discovered MODEL_NAME from /v1/models: ${MODEL_NAME}" + else + echo "WARNING: Could not discover model name from /v1/models, using MODEL_NAME=${MODEL_NAME:-$MODEL}" + fi +else + echo "Using MODEL_NAME from environment: ${MODEL_NAME}" +fi + +# cd to workspace so that relative paths (e.g., utils/evals/*.yaml) resolve +cd "${INFMAX_WORKSPACE}" + +# Source the InferenceX benchmark library +source "${INFMAX_WORKSPACE}/benchmarks/benchmark_lib.sh" + +# Run lm-eval via benchmark_lib +EVAL_CONC=256 +EVAL_LIMIT="${EVAL_LIMIT:-100}" +echo "Running lm-eval with concurrent-requests=${EVAL_CONC}, limit=${EVAL_LIMIT}..." +run_eval --framework lm-eval --port "$PORT" --concurrent-requests "$EVAL_CONC" --limit "$EVAL_LIMIT" + +# Set metadata env vars needed by append_lm_eval_summary +# These are passed through from the InferenceX environment +export TP="${TP:-${PREFILL_TP:-1}}" +export CONC="${CONC:-${EVAL_CONC}}" +export EP_SIZE="${EP_SIZE:-1}" +if [[ "${PREFILL_EP:-false}" == "true" ]]; then + EP_SIZE="${PREFILL_TP:-1}" +fi +export EP_SIZE +export DP_ATTENTION="${DP_ATTENTION:-${PREFILL_DP_ATTN:-false}}" +export ISL="${ISL:-}" +export OSL="${OSL:-}" +export FRAMEWORK="${FRAMEWORK:-}" +export PRECISION="${PRECISION:-}" +export MODEL_PREFIX="${MODEL_PREFIX:-}" +export RUNNER_TYPE="${RUNNER_TYPE:-}" +export RESULT_FILENAME="${RESULT_FILENAME:-}" + +# Generate the lm-eval summary +echo "Generating lm-eval summary..." +append_lm_eval_summary + +# Copy eval artifacts to /logs/eval_results/ +mkdir -p /logs/eval_results +echo "Copying eval artifacts to /logs/eval_results/..." 
+cp -v meta_env.json /logs/eval_results/ 2>/dev/null || true +cp -v results*.json /logs/eval_results/ 2>/dev/null || true +cp -v sample*.jsonl /logs/eval_results/ 2>/dev/null || true + +echo "lm-eval evaluation complete" diff --git a/src/srtctl/cli/do_sweep.py b/src/srtctl/cli/do_sweep.py index 77f3d8d37..9f6e5777c 100644 --- a/src/srtctl/cli/do_sweep.py +++ b/src/srtctl/cli/do_sweep.py @@ -18,6 +18,7 @@ import os import sys import threading +import time from dataclasses import dataclass from pathlib import Path @@ -178,6 +179,61 @@ def _print_connection_info(self) -> None: logger.info("=" * 60) logger.info("") + def _run_post_eval(self, stop_event: threading.Event) -> int: + """Run lm-eval after the main benchmark completes.""" + from srtctl.benchmarks import get_runner + + # Health check: verify server is still up + if not wait_for_port(self.runtime.nodes.head, 8000, timeout=30): + logger.error("Server health check failed before eval - skipping") + return 1 + + try: + runner = get_runner("lm-eval") + except ValueError as e: + logger.error("lm-eval runner not available: %s", e) + return 1 + + eval_log = self.runtime.log_dir / "eval.out" + cmd = runner.build_command(self.config, self.runtime) + + logger.info("Eval command: %s", " ".join(cmd)) + logger.info("Eval log: %s", eval_log) + + # Pass through eval-related env vars + env_to_set = {} + for var in ["RUN_EVAL", "FRAMEWORK", "PRECISION", "MODEL_PREFIX", "RUNNER_TYPE", + "RESULT_FILENAME", "SPEC_DECODING", "ISL", "OSL", + "PREFILL_TP", "PREFILL_EP", "PREFILL_DP_ATTN", + "DECODE_TP", "DECODE_EP", "DECODE_DP_ATTN"]: + val = os.environ.get(var) + if val: + env_to_set[var] = val + + # Set MODEL_NAME to the served model name so lm-eval uses the correct + # name for API requests. Without this, benchmark_lib.sh falls back to + # $MODEL (the HuggingFace ID) which the server doesn't recognize. 
+ env_to_set["MODEL_NAME"] = self.config.served_model_name + logger.info("Eval MODEL_NAME: %s", env_to_set["MODEL_NAME"]) + + proc = start_srun_process( + command=cmd, + nodelist=[self.runtime.nodes.head], + output=str(eval_log), + container_image=str(self.runtime.container_image), + container_mounts=self.runtime.container_mounts, + env_to_set=env_to_set, + ) + + while proc.poll() is None: + if stop_event.is_set(): + logger.info("Stop requested, terminating eval") + proc.terminate() + return 1 + time.sleep(1) + + return proc.returncode or 0 + def run(self) -> int: """Run the complete sweep.""" # Create status reporter (fire-and-forget, no-op if not configured) @@ -229,6 +285,16 @@ def run(self) -> int: # Stage 4: Benchmark (status reported AFTER health check passes) exit_code = self.run_benchmark(registry, stop_event, reporter) + # Stage 5: Post-benchmark eval (optional, non-fatal) + if os.environ.get("RUN_EVAL", "false").lower() == "true" and exit_code == 0: + reporter.report(JobStatus.BENCHMARK, JobStage.BENCHMARK, "Running post-benchmark evaluation") + logger.info("RUN_EVAL=true: Running post-benchmark lm-eval evaluation...") + eval_exit = self._run_post_eval(stop_event) + if eval_exit != 0: + logger.warning("Eval failed with exit code %d (benchmark result is still valid)", eval_exit) + else: + logger.info("Post-benchmark eval completed successfully") + except Exception as e: logger.exception("Error during sweep: %s", e) reporter.report(JobStatus.FAILED, JobStage.CLEANUP, str(e)) diff --git a/src/srtctl/core/runtime.py b/src/srtctl/core/runtime.py index 198d6ae2d..4b4351527 100644 --- a/src/srtctl/core/runtime.py +++ b/src/srtctl/core/runtime.py @@ -215,6 +215,14 @@ def from_config( host_path, container_path = mount_spec.split(":", 1) container_mounts[Path(host_path).resolve()] = Path(container_path) + # Mount InferenceX workspace if available (for lm-eval support). 
+ # Skip exists() check: the orchestrator runs on the SLURM head node + # where the GH Actions workspace path may not be directly accessible, + # but it IS accessible from compute nodes via shared filesystem. + infmax_ws = os.environ.get("INFMAX_WORKSPACE") + if infmax_ws: + container_mounts[Path(infmax_ws)] = Path("/infmax-workspace") + # Add FormattablePath mounts from config.container_mounts # These need to be expanded with the runtime context, so we create a # temporary context first and then update From 9b16cb443b6403d97e01bde17a74b53eec6f897f Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Mon, 30 Mar 2026 22:46:12 -0700 Subject: [PATCH 2/5] Fix EVAL_ONLY: use full health check before eval MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In eval-only mode the benchmark stage is skipped, which also skips its model health check. The 30s port check in _run_post_eval is insufficient — workers are still loading. Use wait_for_model() with the full health check config (same as benchmark stage) when EVAL_ONLY=true. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- src/srtctl/cli/do_sweep.py | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/src/srtctl/cli/do_sweep.py b/src/srtctl/cli/do_sweep.py index 9f6e5777c..568a8abc9 100644 --- a/src/srtctl/cli/do_sweep.py +++ b/src/srtctl/cli/do_sweep.py @@ -180,13 +180,36 @@ def _print_connection_info(self) -> None: logger.info("") def _run_post_eval(self, stop_event: threading.Event) -> int: - """Run lm-eval after the main benchmark completes.""" + """Run lm-eval after the main benchmark completes (or directly in eval-only mode).""" from srtctl.benchmarks import get_runner - - # Health check: verify server is still up - if not wait_for_port(self.runtime.nodes.head, 8000, timeout=30): - logger.error("Server health check failed before eval - skipping") - return 1 + from srtctl.core.health import wait_for_model + + # In eval-only mode the benchmark health check was skipped, so do the + # full model-ready wait here. In post-benchmark mode a quick port + # check is sufficient since the server already served traffic. 
+ if os.environ.get("EVAL_ONLY", "false").lower() == "true": + r = self.config.resources + n_prefill = 0 if r.num_agg > 0 else r.num_prefill + n_decode = r.num_agg if r.num_agg > 0 else r.num_decode + hc = self.config.health_check + logger.info("EVAL_ONLY: Waiting for server health before eval...") + if not wait_for_model( + host=self.runtime.nodes.head, + port=8000, + n_prefill=n_prefill, + n_decode=n_decode, + poll_interval=float(hc.interval_seconds), + timeout=float(hc.max_attempts * hc.interval_seconds), + report_every=60.0, + frontend_type=self.config.frontend.type, + stop_event=stop_event, + ): + logger.error("Server did not become healthy for eval") + return 1 + else: + if not wait_for_port(self.runtime.nodes.head, 8000, timeout=30): + logger.error("Server health check failed before eval - skipping") + return 1 try: runner = get_runner("lm-eval") From 211c454a73b447c2a65a5f17264b74f0f5a876f0 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 15 Mar 2026 07:43:11 -0700 Subject: [PATCH 3/5] Use max benchmark concurrency for eval instead of --limit Instead of capping eval examples with --limit to avoid timeouts, use the highest benchmark concurrency for eval requests. This runs the full eval set faster by matching the throughput the server was already benchmarked at. do_sweep.py computes max(config.benchmark.concurrencies) and passes it as EVAL_CONC to the lm-eval bench script. 
Co-Authored-By: Claude Opus 4.6 --- docs/accuracy.md | 4 ++-- src/srtctl/benchmarks/scripts/lm-eval/bench.sh | 9 +++++---- src/srtctl/cli/do_sweep.py | 12 ++++++++++++ 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/docs/accuracy.md b/docs/accuracy.md index 768c6cfd4..d6dc975d7 100644 --- a/docs/accuracy.md +++ b/docs/accuracy.md @@ -213,7 +213,7 @@ srt-slurm supports an `EVAL_ONLY` mode that skips the throughput benchmark entir |---------|-------------| | `EVAL_ONLY` | Set to `true` to skip the throughput benchmark stage and run eval only | | `RUN_EVAL` | Set to `true` to run eval after the throughput benchmark completes | -| `EVAL_CONC` | Concurrent requests for lm-eval (defaults to 256) | +| `EVAL_CONC` | Concurrent requests for lm-eval (set by InferenceX to median of conc list; defaults to 256 if unset) | When `EVAL_ONLY=true`: - **Stage 4 (Benchmark)** is skipped entirely — no throughput test runs @@ -240,7 +240,7 @@ Eval concurrency is set via the `EVAL_CONCURRENT_REQUESTS` environment variable export EVAL_CONCURRENT_REQUESTS="${EVAL_CONC:-256}" ``` -The InferenceX workflow derives `EVAL_CONC` from the highest value in the benchmark concurrency list. +The InferenceX workflow sets `EVAL_CONC` to the median of the benchmark concurrency list (chosen in `mark_eval_entries`). If `EVAL_CONC` is not set in the environment, `do_sweep.py` falls back to the max of the benchmark concurrency list. ### Output diff --git a/src/srtctl/benchmarks/scripts/lm-eval/bench.sh b/src/srtctl/benchmarks/scripts/lm-eval/bench.sh index bc251d559..880c974ca 100755 --- a/src/srtctl/benchmarks/scripts/lm-eval/bench.sh +++ b/src/srtctl/benchmarks/scripts/lm-eval/bench.sh @@ -39,10 +39,11 @@ cd "${INFMAX_WORKSPACE}" source "${INFMAX_WORKSPACE}/benchmarks/benchmark_lib.sh" # Run lm-eval via benchmark_lib -EVAL_CONC=256 -EVAL_LIMIT="${EVAL_LIMIT:-100}" -echo "Running lm-eval with concurrent-requests=${EVAL_CONC}, limit=${EVAL_LIMIT}..." 
-run_eval --framework lm-eval --port "$PORT" --concurrent-requests "$EVAL_CONC" --limit "$EVAL_LIMIT" +# EVAL_CONC is set by the InferenceX workflow (median of conc list). +# benchmark_lib reads concurrency from EVAL_CONCURRENT_REQUESTS env var. +export EVAL_CONCURRENT_REQUESTS="${EVAL_CONC:-256}" +echo "Running lm-eval with concurrent-requests=${EVAL_CONCURRENT_REQUESTS}..." +run_eval --framework lm-eval --port "$PORT" # Set metadata env vars needed by append_lm_eval_summary # These are passed through from the InferenceX environment diff --git a/src/srtctl/cli/do_sweep.py b/src/srtctl/cli/do_sweep.py index 568a8abc9..abc4a862b 100644 --- a/src/srtctl/cli/do_sweep.py +++ b/src/srtctl/cli/do_sweep.py @@ -239,6 +239,18 @@ def _run_post_eval(self, stop_event: threading.Event) -> int: env_to_set["MODEL_NAME"] = self.config.served_model_name logger.info("Eval MODEL_NAME: %s", env_to_set["MODEL_NAME"]) + # Use EVAL_CONC from workflow (median chosen by InferenceX mark_eval_entries), + # falling back to max of benchmark concurrency list. 
+ eval_conc = os.environ.get("EVAL_CONC") + if eval_conc: + env_to_set["EVAL_CONC"] = eval_conc + logger.info("Eval concurrency (from workflow): %s", eval_conc) + else: + conc_list = self.config.benchmark.get_concurrency_list() + if conc_list: + env_to_set["EVAL_CONC"] = str(max(conc_list)) + logger.info("Eval concurrency (max of %s): %s", conc_list, env_to_set["EVAL_CONC"]) + proc = start_srun_process( command=cmd, nodelist=[self.runtime.nodes.head], From 702ff0078c12fea80e33bcefc395a9305e2bca4a Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Mon, 6 Apr 2026 16:56:31 -0700 Subject: [PATCH 4/5] update docs, clean up code --- docs/accuracy.md | 63 +++++++++++++------ .../benchmarks/scripts/lm-eval/bench.sh | 29 ++++++--- src/srtctl/cli/do_sweep.py | 63 ++++++++++++++----- 3 files changed, 110 insertions(+), 45 deletions(-) diff --git a/docs/accuracy.md b/docs/accuracy.md index d6dc975d7..98b69b468 100644 --- a/docs/accuracy.md +++ b/docs/accuracy.md @@ -196,57 +196,80 @@ The output includes per-category scores and aggregate metrics: The `lm-eval` benchmark runner integrates [EleutherAI/lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) via InferenceX's `benchmark_lib.sh`. Unlike the built-in benchmarks above, this runner sources evaluation logic from an external InferenceX workspace mounted at `/infmax-workspace`. -This is used by InferenceX CI to run graded QnA (gsm8k, gpqa) against multi-node deployments on GB200/GB300. +This is used by InferenceX CI to run evals such as GSM8K and GPQA against NVIDIA multi-node disaggregated deployments on GB200, GB300, B200, B300, H100, and H200. AMD MI355X multi-node evals are handled by InferenceX's upstreamed AMD Slurm path, not by this srt-slurm runner. + +In InferenceX CI, recipes normally keep their throughput benchmark configuration. `do_sweep.py` invokes the registered `lm-eval` runner as a post-step when `RUN_EVAL=true`, or as the only benchmark-like step when `EVAL_ONLY=true`. 
There is no separate `infmax-eval` benchmark type. ### How it works -1. The runner script (`benchmarks/scripts/lm-eval/bench.sh`) auto-discovers the served model name from `/v1/models` -2. Sources `benchmark_lib.sh` from the InferenceX workspace -3. Runs `run_eval` and `append_lm_eval_summary` from benchmark_lib -4. Copies eval artifacts (`meta_env.json`, `results*.json`, `sample*.jsonl`) to `/logs/eval_results/` +1. `RuntimeContext` mounts the host path from `INFMAX_WORKSPACE` at `/infmax-workspace` inside the Slurm container. +2. `do_sweep.py` starts infrastructure, workers, and the frontend for the normal recipe topology. +3. For `EVAL_ONLY=true`, `do_sweep.py` skips the throughput benchmark stage and runs `_run_post_eval()` directly after frontend startup. +4. `_run_post_eval()` waits for the OpenAI-compatible endpoint on port 8000 and, in eval-only mode, performs the full `wait_for_model()` health check for the configured prefill/decode or aggregated topology. +5. `_run_post_eval()` launches the registered `lm-eval` runner on the head node and passes through InferenceX metadata such as framework, precision, sequence length, prefill/decode topology, and eval concurrency. +6. The runner script (`benchmarks/scripts/lm-eval/bench.sh`) uses `MODEL_NAME` from `do_sweep.py`, or auto-discovers the served model from `/v1/models` as a fallback. +7. The runner sources `/infmax-workspace/benchmarks/benchmark_lib.sh`, runs `run_eval --framework lm-eval`, and calls `append_lm_eval_summary`. +8. Eval artifacts are copied to `/logs/eval_results/` for InferenceX launcher-side artifact pickup. ### EVAL_ONLY mode -srt-slurm supports an `EVAL_ONLY` mode that skips the throughput benchmark entirely and runs only the lm-eval evaluation. This is controlled via environment variables: +srt-slurm supports an `EVAL_ONLY` mode for CI jobs that should only validate accuracy. 
This is controlled by environment variables from the InferenceX workflow: | Env var | Description | |---------|-------------| | `EVAL_ONLY` | Set to `true` to skip the throughput benchmark stage and run eval only | | `RUN_EVAL` | Set to `true` to run eval after the throughput benchmark completes | -| `EVAL_CONC` | Concurrent requests for lm-eval (set by InferenceX to median of conc list; defaults to 256 if unset) | +| `EVAL_CONC` | Concurrent requests for lm-eval, normally set by InferenceX from the generated `eval-conc` value | +| `INFMAX_WORKSPACE` | Host path to the InferenceX checkout that should be mounted at `/infmax-workspace` | +| `MODEL_NAME` | Served model alias for OpenAI-compatible requests; set by `do_sweep.py` from `config.served_model_name` | When `EVAL_ONLY=true`: -- **Stage 4 (Benchmark)** is skipped entirely — no throughput test runs -- **Health check** uses the full `wait_for_model()` check (polls for all prefill/decode workers to be ready) since the benchmark stage's health check was skipped -- **Stage 5 (Eval)** runs `_run_post_eval()` which launches the lm-eval benchmark runner -- Eval failure is **fatal** (non-zero exit) since eval is the only purpose of the job +- Stage 4 skips the throughput benchmark entirely. No throughput result JSON is expected from srt-slurm. +- The eval path uses the full `wait_for_model()` health check before starting lm-eval. +- `_run_post_eval()` launches the `lm-eval` runner and returns its exit code. +- Eval failure is fatal because eval is the only purpose of the job. 
When `RUN_EVAL=true` (without `EVAL_ONLY`): - Throughput benchmark runs normally - After benchmark completes successfully, eval runs as a post-step -- Eval failure is **non-fatal** — the job still succeeds if throughput passed +- Eval failure is non-fatal; the benchmark job still succeeds if throughput passed ### Environment variables The following env vars are passed through to the lm-eval runner container: -`FRAMEWORK`, `PRECISION`, `MODEL_PREFIX`, `RUNNER_TYPE`, `RESULT_FILENAME`, `SPEC_DECODING`, `ISL`, `OSL`, `PREFILL_TP`, `PREFILL_EP`, `PREFILL_DP_ATTN`, `DECODE_TP`, `DECODE_EP`, `DECODE_DP_ATTN`, `MODEL_NAME`, `EVAL_CONC`, `EVAL_ONLY`, `RUN_EVAL` +| Env var | Purpose | +|---------|---------| +| `RUN_EVAL`, `EVAL_ONLY`, `IS_MULTINODE` | Control whether eval runs and how InferenceX classifies the artifact | +| `FRAMEWORK`, `PRECISION`, `MODEL_PREFIX`, `RUNNER_TYPE`, `SPEC_DECODING` | Benchmark identity metadata for `meta_env.json` | +| `ISL`, `OSL`, `RESULT_FILENAME` | Sequence length and result-file metadata | +| `MODEL`, `MODEL_PATH`, `MODEL_NAME` | Model metadata and the served model alias used for requests | +| `MAX_MODEL_LEN`, `EVAL_MAX_MODEL_LEN` | Context-length metadata used by InferenceX eval helpers when available | +| `PREFILL_TP`, `PREFILL_EP`, `PREFILL_NUM_WORKERS`, `PREFILL_DP_ATTN` | Prefill-side topology metadata | +| `DECODE_TP`, `DECODE_EP`, `DECODE_NUM_WORKERS`, `DECODE_DP_ATTN` | Decode-side topology metadata | +| `EVAL_CONC`, `EVAL_CONCURRENT_REQUESTS` | Eval concurrency controls | + +The runner maps srt-slurm's `PREFILL_DP_ATTN` and `DECODE_DP_ATTN` names to InferenceX's `PREFILL_DP_ATTENTION` and `DECODE_DP_ATTENTION` names before calling `append_lm_eval_summary`. This is required for multi-node summary tables to preserve prefill/decode DPA state. ### Concurrency -Eval concurrency is set via the `EVAL_CONCURRENT_REQUESTS` environment variable (read by `benchmark_lib.sh`). 
The runner script sets this from `EVAL_CONC`: +Eval concurrency is ultimately read by InferenceX's `benchmark_lib.sh` from `EVAL_CONCURRENT_REQUESTS`. The runner script sets that value from `EVAL_CONC` when present, preserves an existing `EVAL_CONCURRENT_REQUESTS` otherwise, and falls back to `256` only if neither variable is set: ```bash -export EVAL_CONCURRENT_REQUESTS="${EVAL_CONC:-256}" +export EVAL_CONCURRENT_REQUESTS="${EVAL_CONC:-${EVAL_CONCURRENT_REQUESTS:-256}}" ``` -The InferenceX workflow sets `EVAL_CONC` to the median of the benchmark concurrency list (chosen in `mark_eval_entries`). If `EVAL_CONC` is not set in the environment, `do_sweep.py` falls back to the max of the benchmark concurrency list. +The InferenceX workflow sets `EVAL_CONC` from the generated `eval-conc` value. For multi-node configs, InferenceX selects the `8k1k` entry with the highest max eligible concurrency for each `(model, runner, framework, precision, spec-decoding, prefill-dp-attn, decode-dp-attn)` group, then sets `eval-conc` to the upper median of that config's eligible concurrency list. If `EVAL_CONC` is not set in the environment, `do_sweep.py` falls back to the max of the recipe benchmark concurrency list. ### Output Eval artifacts are written to `/logs/eval_results/` inside the container: -- `meta_env.json` — metadata (TP, conc, framework, precision, etc.) -- `results*.json` — lm-eval scores per task -- `sample*.jsonl` — per-sample outputs +- `meta_env.json` - metadata used by InferenceX aggregation and summary tables +- `results*.json` - lm-eval scores per task +- `sample*.jsonl` - per-sample outputs + +These are collected by the InferenceX NVIDIA launch scripts and uploaded as workflow artifacts. In eval-only mode the InferenceX workflow expects eval artifacts, not throughput benchmark artifacts. -These are collected by the InferenceX launch scripts (`launch_gb200-nv.sh`, `launch_gb300-nv.sh`) and uploaded as workflow artifacts. +### Intricacies +1. 
Eval floor of 16 + - There is one sweep config with `conc: [1]`, which causes evals to take more than 4 hours to complete. diff --git a/src/srtctl/benchmarks/scripts/lm-eval/bench.sh b/src/srtctl/benchmarks/scripts/lm-eval/bench.sh index 880c974ca..1cd47b8ad 100755 --- a/src/srtctl/benchmarks/scripts/lm-eval/bench.sh +++ b/src/srtctl/benchmarks/scripts/lm-eval/bench.sh @@ -41,20 +41,26 @@ source "${INFMAX_WORKSPACE}/benchmarks/benchmark_lib.sh" # Run lm-eval via benchmark_lib # EVAL_CONC is set by the InferenceX workflow (median of conc list). # benchmark_lib reads concurrency from EVAL_CONCURRENT_REQUESTS env var. -export EVAL_CONCURRENT_REQUESTS="${EVAL_CONC:-256}" +export EVAL_CONCURRENT_REQUESTS="${EVAL_CONC:-${EVAL_CONCURRENT_REQUESTS:-256}}" echo "Running lm-eval with concurrent-requests=${EVAL_CONCURRENT_REQUESTS}..." -run_eval --framework lm-eval --port "$PORT" +eval_rc=0 +run_eval --framework lm-eval --port "$PORT" || eval_rc=$? # Set metadata env vars needed by append_lm_eval_summary # These are passed through from the InferenceX environment +export IS_MULTINODE="${IS_MULTINODE:-true}" export TP="${TP:-${PREFILL_TP:-1}}" -export CONC="${CONC:-${EVAL_CONC}}" -export EP_SIZE="${EP_SIZE:-1}" -if [[ "${PREFILL_EP:-false}" == "true" ]]; then - EP_SIZE="${PREFILL_TP:-1}" -fi -export EP_SIZE +export CONC="${CONC:-${EVAL_CONC:-${EVAL_CONCURRENT_REQUESTS:-1}}}" +export EP_SIZE="${EP_SIZE:-${PREFILL_EP:-1}}" +export PREFILL_TP="${PREFILL_TP:-${TP:-1}}" +export PREFILL_EP="${PREFILL_EP:-${EP_SIZE:-1}}" +export PREFILL_NUM_WORKERS="${PREFILL_NUM_WORKERS:-1}" +export DECODE_TP="${DECODE_TP:-${TP:-1}}" +export DECODE_EP="${DECODE_EP:-${EP_SIZE:-1}}" +export DECODE_NUM_WORKERS="${DECODE_NUM_WORKERS:-1}" export DP_ATTENTION="${DP_ATTENTION:-${PREFILL_DP_ATTN:-false}}" +export PREFILL_DP_ATTENTION="${PREFILL_DP_ATTENTION:-${PREFILL_DP_ATTN:-${DP_ATTENTION:-false}}}" +export DECODE_DP_ATTENTION="${DECODE_DP_ATTENTION:-${DECODE_DP_ATTN:-${DP_ATTENTION:-false}}}" export ISL="${ISL:-}" export
OSL="${OSL:-}" export FRAMEWORK="${FRAMEWORK:-}" @@ -65,7 +71,7 @@ export RESULT_FILENAME="${RESULT_FILENAME:-}" # Generate the lm-eval summary echo "Generating lm-eval summary..." -append_lm_eval_summary +append_lm_eval_summary || true # Copy eval artifacts to /logs/eval_results/ mkdir -p /logs/eval_results @@ -74,4 +80,9 @@ cp -v meta_env.json /logs/eval_results/ 2>/dev/null || true cp -v results*.json /logs/eval_results/ 2>/dev/null || true cp -v sample*.jsonl /logs/eval_results/ 2>/dev/null || true +if [[ "$eval_rc" -ne 0 ]]; then + echo "lm-eval evaluation failed with exit code ${eval_rc}" + exit "$eval_rc" +fi + echo "lm-eval evaluation complete" diff --git a/src/srtctl/cli/do_sweep.py b/src/srtctl/cli/do_sweep.py index abc4a862b..0cf429ecd 100644 --- a/src/srtctl/cli/do_sweep.py +++ b/src/srtctl/cli/do_sweep.py @@ -223,12 +223,34 @@ def _run_post_eval(self, stop_event: threading.Event) -> int: logger.info("Eval command: %s", " ".join(cmd)) logger.info("Eval log: %s", eval_log) - # Pass through eval-related env vars + # Pass through eval-related env vars. InferenceX writes multi-node + # metadata from these variables in append_lm_eval_summary(). 
env_to_set = {} - for var in ["RUN_EVAL", "FRAMEWORK", "PRECISION", "MODEL_PREFIX", "RUNNER_TYPE", - "RESULT_FILENAME", "SPEC_DECODING", "ISL", "OSL", - "PREFILL_TP", "PREFILL_EP", "PREFILL_DP_ATTN", - "DECODE_TP", "DECODE_EP", "DECODE_DP_ATTN"]: + for var in [ + "RUN_EVAL", + "EVAL_ONLY", + "IS_MULTINODE", + "FRAMEWORK", + "PRECISION", + "MODEL_PREFIX", + "RUNNER_TYPE", + "RESULT_FILENAME", + "SPEC_DECODING", + "ISL", + "OSL", + "MODEL", + "MODEL_PATH", + "MAX_MODEL_LEN", + "EVAL_MAX_MODEL_LEN", + "PREFILL_TP", + "PREFILL_EP", + "PREFILL_DP_ATTN", + "PREFILL_NUM_WORKERS", + "DECODE_TP", + "DECODE_EP", + "DECODE_DP_ATTN", + "DECODE_NUM_WORKERS", + ]: val = os.environ.get(var) if val: env_to_set[var] = val @@ -317,18 +339,27 @@ def run(self) -> int: self._print_connection_info() - # Stage 4: Benchmark (status reported AFTER health check passes) - exit_code = self.run_benchmark(registry, stop_event, reporter) - - # Stage 5: Post-benchmark eval (optional, non-fatal) - if os.environ.get("RUN_EVAL", "false").lower() == "true" and exit_code == 0: - reporter.report(JobStatus.BENCHMARK, JobStage.BENCHMARK, "Running post-benchmark evaluation") - logger.info("RUN_EVAL=true: Running post-benchmark lm-eval evaluation...") - eval_exit = self._run_post_eval(stop_event) - if eval_exit != 0: - logger.warning("Eval failed with exit code %d (benchmark result is still valid)", eval_exit) + if os.environ.get("EVAL_ONLY", "false").lower() == "true": + reporter.report(JobStatus.BENCHMARK, JobStage.BENCHMARK, "Running eval-only evaluation") + logger.info("EVAL_ONLY=true: Skipping benchmark stage and running lm-eval evaluation...") + exit_code = self._run_post_eval(stop_event) + if exit_code != 0: + logger.error("Eval-only evaluation failed with exit code %d", exit_code) else: - logger.info("Post-benchmark eval completed successfully") + logger.info("Eval-only evaluation completed successfully") + else: + # Stage 4: Benchmark (status reported AFTER health check passes) + exit_code = 
self.run_benchmark(registry, stop_event, reporter) + + # Stage 5: Post-benchmark eval (optional, non-fatal) + if os.environ.get("RUN_EVAL", "false").lower() == "true" and exit_code == 0: + reporter.report(JobStatus.BENCHMARK, JobStage.BENCHMARK, "Running post-benchmark evaluation") + logger.info("RUN_EVAL=true: Running post-benchmark lm-eval evaluation...") + eval_exit = self._run_post_eval(stop_event) + if eval_exit != 0: + logger.warning("Eval failed with exit code %d (benchmark result is still valid)", eval_exit) + else: + logger.info("Post-benchmark eval completed successfully") except Exception as e: logger.exception("Error during sweep: %s", e) From 4fc6e27ffe7425db92a2079dc6863aa9b192dff4 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Mon, 6 Apr 2026 18:02:38 -0700 Subject: [PATCH 5/5] Clean up --- src/srtctl/benchmarks/lm_eval.py | 4 ---- .../benchmarks/scripts/lm-eval/bench.sh | 21 +++---------------- 2 files changed, 3 insertions(+), 22 deletions(-) diff --git a/src/srtctl/benchmarks/lm_eval.py b/src/srtctl/benchmarks/lm_eval.py index b6fae05f0..7667e97fd 100644 --- a/src/srtctl/benchmarks/lm_eval.py +++ b/src/srtctl/benchmarks/lm_eval.py @@ -1,11 +1,7 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - """lm-eval benchmark runner for InferenceX evals.""" from __future__ import annotations -import os from typing import TYPE_CHECKING from srtctl.benchmarks.base import SCRIPTS_DIR, BenchmarkRunner, register_benchmark diff --git a/src/srtctl/benchmarks/scripts/lm-eval/bench.sh b/src/srtctl/benchmarks/scripts/lm-eval/bench.sh index 1cd47b8ad..8a03333b3 100755 --- a/src/srtctl/benchmarks/scripts/lm-eval/bench.sh +++ b/src/srtctl/benchmarks/scripts/lm-eval/bench.sh @@ -1,7 +1,4 @@ #!/bin/bash -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 - # lm-eval accuracy evaluation using InferenceX benchmark_lib # Expects: endpoint [infmax_workspace] @@ -46,28 +43,16 @@ echo "Running lm-eval with concurrent-requests=${EVAL_CONCURRENT_REQUESTS}..." eval_rc=0 run_eval --framework lm-eval --port "$PORT" || eval_rc=$? -# Set metadata env vars needed by append_lm_eval_summary -# These are passed through from the InferenceX environment +# Derive metadata env vars that append_lm_eval_summary needs but do_sweep.py +# does not pass directly (it passes PREFILL_TP/EP/etc, not TP/EP_SIZE/CONC). export IS_MULTINODE="${IS_MULTINODE:-true}" export TP="${TP:-${PREFILL_TP:-1}}" export CONC="${CONC:-${EVAL_CONC:-${EVAL_CONCURRENT_REQUESTS:-1}}}" export EP_SIZE="${EP_SIZE:-${PREFILL_EP:-1}}" -export PREFILL_TP="${PREFILL_TP:-${TP:-1}}" -export PREFILL_EP="${PREFILL_EP:-${EP_SIZE:-1}}" -export PREFILL_NUM_WORKERS="${PREFILL_NUM_WORKERS:-1}" -export DECODE_TP="${DECODE_TP:-${TP:-1}}" -export DECODE_EP="${DECODE_EP:-${EP_SIZE:-1}}" -export DECODE_NUM_WORKERS="${DECODE_NUM_WORKERS:-1}" export DP_ATTENTION="${DP_ATTENTION:-${PREFILL_DP_ATTN:-false}}" +# Remap srt-slurm's DP_ATTN names to InferenceX's DP_ATTENTION names export PREFILL_DP_ATTENTION="${PREFILL_DP_ATTENTION:-${PREFILL_DP_ATTN:-${DP_ATTENTION:-false}}}" export DECODE_DP_ATTENTION="${DECODE_DP_ATTENTION:-${DECODE_DP_ATTN:-${DP_ATTENTION:-false}}}" -export ISL="${ISL:-}" -export OSL="${OSL:-}" -export FRAMEWORK="${FRAMEWORK:-}" -export PRECISION="${PRECISION:-}" -export MODEL_PREFIX="${MODEL_PREFIX:-}" -export RUNNER_TYPE="${RUNNER_TYPE:-}" -export RESULT_FILENAME="${RESULT_FILENAME:-}" # Generate the lm-eval summary echo "Generating lm-eval summary..."