From 0adcc00a08f67b251cf0d9b0a8cc2b0a4fac7cd6 Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Sat, 14 Mar 2026 13:09:46 -0700
Subject: [PATCH 1/5] Add lm-eval benchmark runner for InferenceX evals

Adds support for running lm-eval accuracy evaluations as a post-benchmark
step, leveraging the InferenceX benchmark_lib.sh harness.

- New LMEvalRunner registered as "lm-eval" benchmark type
- bench.sh script sources benchmark_lib.sh and calls run_eval/append_lm_eval_summary
- Post-benchmark eval hook in SweepOrchestrator.run() triggered by RUN_EVAL=true
- Auto-mount INFMAX_WORKSPACE into container when env var is set

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 docs/accuracy.md                              | 61 ++++++++++++++-
 src/srtctl/benchmarks/__init__.py             |  3 +-
 src/srtctl/benchmarks/lm_eval.py              | 58 ++++++++++++++
 .../benchmarks/scripts/lm-eval/bench.sh       | 76 +++++++++++++++++++
 src/srtctl/cli/do_sweep.py                    | 66 ++++++++++++++++
 src/srtctl/core/runtime.py                    |  8 ++
 6 files changed, 270 insertions(+), 2 deletions(-)
 create mode 100644 src/srtctl/benchmarks/lm_eval.py
 create mode 100755 src/srtctl/benchmarks/scripts/lm-eval/bench.sh

diff --git a/docs/accuracy.md b/docs/accuracy.md
index f5588c9fe..768c6cfd4 100644
--- a/docs/accuracy.md
+++ b/docs/accuracy.md
@@ -1,6 +1,6 @@
 # Accuracy Benchmarks
 
-In srt-slurm, users can run different accuracy benchmarks by setting the benchmark section in the config yaml file. Supported benchmarks include `mmlu`, `gpqa` and `longbenchv2`.
+In srt-slurm, users can run different accuracy benchmarks by setting the benchmark section in the config yaml file. Supported benchmarks include `mmlu`, `gpqa`, `longbenchv2`, and `lm-eval`.
 
 ## Table of Contents
 
@@ -14,6 +14,7 @@ In srt-slurm, users can run different accuracy benchmarks by setting the benchma
   - [Example: Quick Validation](#example-quick-validation)
   - [Output](#output)
   - [Important Notes](#important-notes)
+- [lm-eval (InferenceX)](#lm-eval-inferencex)
 
 ---
 
@@ -191,3 +192,61 @@ The output includes per-category scores and aggregate metrics:
 4. **Categories**: Running specific categories is useful for targeted validation (e.g., just testing summarization capabilities)
 
 
+## lm-eval (InferenceX)
+
+The `lm-eval` benchmark runner integrates [EleutherAI/lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) via InferenceX's `benchmark_lib.sh`. Unlike the built-in benchmarks above, this runner sources evaluation logic from an external InferenceX workspace mounted at `/infmax-workspace`.
+
+This is used by InferenceX CI to run graded QnA (gsm8k, gpqa) against multi-node deployments on GB200/GB300.
+
+### How it works
+
+1. The runner script (`benchmarks/scripts/lm-eval/bench.sh`) auto-discovers the served model name from `/v1/models`
+2. Sources `benchmark_lib.sh` from the InferenceX workspace
+3. Runs `run_eval` and `append_lm_eval_summary` from benchmark_lib
+4. Copies eval artifacts (`meta_env.json`, `results*.json`, `sample*.jsonl`) to `/logs/eval_results/`
+
+### EVAL_ONLY mode
+
+srt-slurm supports an `EVAL_ONLY` mode that skips the throughput benchmark entirely and runs only the lm-eval evaluation. This is controlled via environment variables:
+
+| Env var | Description |
+|---------|-------------|
+| `EVAL_ONLY` | Set to `true` to skip the throughput benchmark stage and run eval only |
+| `RUN_EVAL` | Set to `true` to run eval after the throughput benchmark completes |
+| `EVAL_CONC` | Concurrent requests for lm-eval (defaults to 256) |
+
+When `EVAL_ONLY=true`:
+- **Stage 4 (Benchmark)** is skipped entirely — no throughput test runs
+- **Health check** uses the full `wait_for_model()` check (polls for all prefill/decode workers to be ready) since the benchmark stage's health check was skipped
+- **Stage 5 (Eval)** runs `_run_post_eval()` which launches the lm-eval benchmark runner
+- Eval failure is **fatal** (non-zero exit) since eval is the only purpose of the job
+
+When `RUN_EVAL=true` (without `EVAL_ONLY`):
+- Throughput benchmark runs normally
+- After benchmark completes successfully, eval runs as a post-step
+- Eval failure is **non-fatal** — the job still succeeds if throughput passed
+
+### Environment variables
+
+The following env vars are passed through to the lm-eval runner container:
+
+`FRAMEWORK`, `PRECISION`, `MODEL_PREFIX`, `RUNNER_TYPE`, `RESULT_FILENAME`, `SPEC_DECODING`, `ISL`, `OSL`, `PREFILL_TP`, `PREFILL_EP`, `PREFILL_DP_ATTN`, `DECODE_TP`, `DECODE_EP`, `DECODE_DP_ATTN`, `MODEL_NAME`, `EVAL_CONC`, `EVAL_ONLY`, `RUN_EVAL`
+
+### Concurrency
+
+Eval concurrency is set via the `EVAL_CONCURRENT_REQUESTS` environment variable (read by `benchmark_lib.sh`). The runner script sets this from `EVAL_CONC`:
+
+```bash
+export EVAL_CONCURRENT_REQUESTS="${EVAL_CONC:-256}"
+```
+
+The InferenceX workflow derives `EVAL_CONC` from the highest value in the benchmark concurrency list.
+
+### Output
+
+Eval artifacts are written to `/logs/eval_results/` inside the container:
+- `meta_env.json` — metadata (TP, conc, framework, precision, etc.)
+- `results*.json` — lm-eval scores per task
+- `sample*.jsonl` — per-sample outputs
+
+These are collected by the InferenceX launch scripts (`launch_gb200-nv.sh`, `launch_gb300-nv.sh`) and uploaded as workflow artifacts.
diff --git a/src/srtctl/benchmarks/__init__.py b/src/srtctl/benchmarks/__init__.py
index 84806b06b..b0c4ff7f1 100644
--- a/src/srtctl/benchmarks/__init__.py
+++ b/src/srtctl/benchmarks/__init__.py
@@ -4,7 +4,7 @@
 """Benchmark runners for srtctl."""
 
 # Import runners to trigger registration
-from srtctl.benchmarks import gpqa, longbenchv2, mmlu, mooncake_router, profiling, router, sa_bench
+from srtctl.benchmarks import gpqa, lm_eval, longbenchv2, mmlu, mooncake_router, profiling, router, sa_bench
 from srtctl.benchmarks.base import (
     BenchmarkRunner,
     get_runner,
@@ -18,6 +18,7 @@
     "list_benchmarks",
     "register_benchmark",
     # Runners
+    "lm_eval",
     "sa_bench",
     "mmlu",
     "gpqa",
diff --git a/src/srtctl/benchmarks/lm_eval.py b/src/srtctl/benchmarks/lm_eval.py
new file mode 100644
index 000000000..b6fae05f0
--- /dev/null
+++ b/src/srtctl/benchmarks/lm_eval.py
@@ -0,0 +1,58 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""lm-eval benchmark runner for InferenceX evals."""
+
+from __future__ import annotations
+
+import os
+from typing import TYPE_CHECKING
+
+from srtctl.benchmarks.base import SCRIPTS_DIR, BenchmarkRunner, register_benchmark
+
+if TYPE_CHECKING:
+    from srtctl.core.runtime import RuntimeContext
+    from srtctl.core.schema import SrtConfig
+
+
+@register_benchmark("lm-eval")
+class LMEvalRunner(BenchmarkRunner):
+    """lm-eval accuracy evaluation using InferenceX benchmark_lib.
+
+    Runs lm-eval via the InferenceX benchmark_lib.sh harness,
+    which handles task selection, result collection, and summary generation.
+    """
+
+    @property
+    def name(self) -> str:
+        return "lm-eval"
+
+    @property
+    def script_path(self) -> str:
+        return "/srtctl-benchmarks/lm-eval/bench.sh"
+
+    @property
+    def local_script_dir(self) -> str:
+        return str(SCRIPTS_DIR / "lm-eval")
+
+    def validate_config(self, config: SrtConfig) -> list[str]:
+        # lm-eval has sensible defaults
+        return []
+
+    def build_command(
+        self,
+        config: SrtConfig,
+        runtime: RuntimeContext,
+    ) -> list[str]:
+        endpoint = f"http://localhost:{runtime.frontend_port}"
+        # Always use the container mount path, not the host path.
+        # INFMAX_WORKSPACE env var contains the host path (used for mount setup
+        # in runtime.py), but inside the container it's at /infmax-workspace.
+        infmax_workspace = "/infmax-workspace"
+
+        return [
+            "bash",
+            self.script_path,
+            endpoint,
+            infmax_workspace,
+        ]
diff --git a/src/srtctl/benchmarks/scripts/lm-eval/bench.sh b/src/srtctl/benchmarks/scripts/lm-eval/bench.sh
new file mode 100755
index 000000000..bc251d559
--- /dev/null
+++ b/src/srtctl/benchmarks/scripts/lm-eval/bench.sh
@@ -0,0 +1,76 @@
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# lm-eval accuracy evaluation using InferenceX benchmark_lib
+# Expects: endpoint [infmax_workspace]
+
+set -e
+
+ENDPOINT=$1
+INFMAX_WORKSPACE=${2:-/infmax-workspace}
+
+# Extract HOST and PORT from endpoint (e.g., http://localhost:8000)
+HOST=$(echo "$ENDPOINT" | sed -E 's|https?://||; s|:.*||')
+PORT=$(echo "$ENDPOINT" | sed -E 's|.*:([0-9]+).*|\1|')
+
+echo "lm-eval Config: endpoint=${ENDPOINT}; host=${HOST}; port=${PORT}; workspace=${INFMAX_WORKSPACE}"
+
+# Auto-discover the served model name from /v1/models if MODEL_NAME is not set.
+# This ensures we use the exact name the server recognizes, regardless of what
+# $MODEL (the HuggingFace ID from the workflow) is set to.
+if [[ -z "${MODEL_NAME:-}" ]]; then
+    DISCOVERED_MODEL=$(curl -sf "${ENDPOINT}/v1/models" 2>/dev/null \
+        | python3 -c "import sys,json; d=json.load(sys.stdin); print(d['data'][0]['id'])" 2>/dev/null || true)
+    if [[ -n "$DISCOVERED_MODEL" ]]; then
+        export MODEL_NAME="$DISCOVERED_MODEL"
+        echo "Auto-discovered MODEL_NAME from /v1/models: ${MODEL_NAME}"
+    else
+        echo "WARNING: Could not discover model name from /v1/models, using MODEL_NAME=${MODEL_NAME:-$MODEL}"
+    fi
+else
+    echo "Using MODEL_NAME from environment: ${MODEL_NAME}"
+fi
+
+# cd to workspace so that relative paths (e.g., utils/evals/*.yaml) resolve
+cd "${INFMAX_WORKSPACE}"
+
+# Source the InferenceX benchmark library
+source "${INFMAX_WORKSPACE}/benchmarks/benchmark_lib.sh"
+
+# Run lm-eval via benchmark_lib
+EVAL_CONC=256
+EVAL_LIMIT="${EVAL_LIMIT:-100}"
+echo "Running lm-eval with concurrent-requests=${EVAL_CONC}, limit=${EVAL_LIMIT}..."
+run_eval --framework lm-eval --port "$PORT" --concurrent-requests "$EVAL_CONC" --limit "$EVAL_LIMIT"
+
+# Set metadata env vars needed by append_lm_eval_summary
+# These are passed through from the InferenceX environment
+export TP="${TP:-${PREFILL_TP:-1}}"
+export CONC="${CONC:-${EVAL_CONC}}"
+export EP_SIZE="${EP_SIZE:-1}"
+if [[ "${PREFILL_EP:-false}" == "true" ]]; then
+    EP_SIZE="${PREFILL_TP:-1}"
+fi
+export EP_SIZE
+export DP_ATTENTION="${DP_ATTENTION:-${PREFILL_DP_ATTN:-false}}"
+export ISL="${ISL:-}"
+export OSL="${OSL:-}"
+export FRAMEWORK="${FRAMEWORK:-}"
+export PRECISION="${PRECISION:-}"
+export MODEL_PREFIX="${MODEL_PREFIX:-}"
+export RUNNER_TYPE="${RUNNER_TYPE:-}"
+export RESULT_FILENAME="${RESULT_FILENAME:-}"
+
+# Generate the lm-eval summary
+echo "Generating lm-eval summary..."
+append_lm_eval_summary
+
+# Copy eval artifacts to /logs/eval_results/
+mkdir -p /logs/eval_results
+echo "Copying eval artifacts to /logs/eval_results/..."
+cp -v meta_env.json /logs/eval_results/ 2>/dev/null || true
+cp -v results*.json /logs/eval_results/ 2>/dev/null || true
+cp -v sample*.jsonl /logs/eval_results/ 2>/dev/null || true
+
+echo "lm-eval evaluation complete"
diff --git a/src/srtctl/cli/do_sweep.py b/src/srtctl/cli/do_sweep.py
index 77f3d8d37..9f6e5777c 100644
--- a/src/srtctl/cli/do_sweep.py
+++ b/src/srtctl/cli/do_sweep.py
@@ -18,6 +18,7 @@
 import os
 import sys
 import threading
+import time
 from dataclasses import dataclass
 from pathlib import Path
 
@@ -178,6 +179,61 @@ def _print_connection_info(self) -> None:
         logger.info("=" * 60)
         logger.info("")
 
+    def _run_post_eval(self, stop_event: threading.Event) -> int:
+        """Run lm-eval after the main benchmark completes."""
+        from srtctl.benchmarks import get_runner
+
+        # Health check: verify server is still up
+        if not wait_for_port(self.runtime.nodes.head, 8000, timeout=30):
+            logger.error("Server health check failed before eval - skipping")
+            return 1
+
+        try:
+            runner = get_runner("lm-eval")
+        except ValueError as e:
+            logger.error("lm-eval runner not available: %s", e)
+            return 1
+
+        eval_log = self.runtime.log_dir / "eval.out"
+        cmd = runner.build_command(self.config, self.runtime)
+
+        logger.info("Eval command: %s", " ".join(cmd))
+        logger.info("Eval log: %s", eval_log)
+
+        # Pass through eval-related env vars
+        env_to_set = {}
+        for var in ["RUN_EVAL", "FRAMEWORK", "PRECISION", "MODEL_PREFIX", "RUNNER_TYPE",
+                    "RESULT_FILENAME", "SPEC_DECODING", "ISL", "OSL",
+                    "PREFILL_TP", "PREFILL_EP", "PREFILL_DP_ATTN",
+                    "DECODE_TP", "DECODE_EP", "DECODE_DP_ATTN"]:
+            val = os.environ.get(var)
+            if val:
+                env_to_set[var] = val
+
+        # Set MODEL_NAME to the served model name so lm-eval uses the correct
+        # name for API requests. Without this, benchmark_lib.sh falls back to
+        # $MODEL (the HuggingFace ID) which the server doesn't recognize.
+        env_to_set["MODEL_NAME"] = self.config.served_model_name
+        logger.info("Eval MODEL_NAME: %s", env_to_set["MODEL_NAME"])
+
+        proc = start_srun_process(
+            command=cmd,
+            nodelist=[self.runtime.nodes.head],
+            output=str(eval_log),
+            container_image=str(self.runtime.container_image),
+            container_mounts=self.runtime.container_mounts,
+            env_to_set=env_to_set,
+        )
+
+        while proc.poll() is None:
+            if stop_event.is_set():
+                logger.info("Stop requested, terminating eval")
+                proc.terminate()
+                return 1
+            time.sleep(1)
+
+        return proc.returncode or 0
+
     def run(self) -> int:
         """Run the complete sweep."""
         # Create status reporter (fire-and-forget, no-op if not configured)
@@ -229,6 +285,16 @@ def run(self) -> int:
             # Stage 4: Benchmark (status reported AFTER health check passes)
             exit_code = self.run_benchmark(registry, stop_event, reporter)
 
+            # Stage 5: Post-benchmark eval (optional, non-fatal)
+            if os.environ.get("RUN_EVAL", "false").lower() == "true" and exit_code == 0:
+                reporter.report(JobStatus.BENCHMARK, JobStage.BENCHMARK, "Running post-benchmark evaluation")
+                logger.info("RUN_EVAL=true: Running post-benchmark lm-eval evaluation...")
+                eval_exit = self._run_post_eval(stop_event)
+                if eval_exit != 0:
+                    logger.warning("Eval failed with exit code %d (benchmark result is still valid)", eval_exit)
+                else:
+                    logger.info("Post-benchmark eval completed successfully")
+
         except Exception as e:
             logger.exception("Error during sweep: %s", e)
             reporter.report(JobStatus.FAILED, JobStage.CLEANUP, str(e))
diff --git a/src/srtctl/core/runtime.py b/src/srtctl/core/runtime.py
index 198d6ae2d..4b4351527 100644
--- a/src/srtctl/core/runtime.py
+++ b/src/srtctl/core/runtime.py
@@ -215,6 +215,14 @@ def from_config(
                 host_path, container_path = mount_spec.split(":", 1)
                 container_mounts[Path(host_path).resolve()] = Path(container_path)
 
+        # Mount InferenceX workspace if available (for lm-eval support).
+        # Skip exists() check: the orchestrator runs on the SLURM head node
+        # where the GH Actions workspace path may not be directly accessible,
+        # but it IS accessible from compute nodes via shared filesystem.
+        infmax_ws = os.environ.get("INFMAX_WORKSPACE")
+        if infmax_ws:
+            container_mounts[Path(infmax_ws)] = Path("/infmax-workspace")
+
         # Add FormattablePath mounts from config.container_mounts
         # These need to be expanded with the runtime context, so we create a
         # temporary context first and then update

From 9b16cb443b6403d97e01bde17a74b53eec6f897f Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Mon, 30 Mar 2026 22:46:12 -0700
Subject: [PATCH 2/5] Fix EVAL_ONLY: use full health check before eval
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In eval-only mode the benchmark stage is skipped, which also skips
its model health check. The 30s port check in _run_post_eval is
insufficient — workers are still loading. Use wait_for_model() with
the full health check config (same as benchmark stage) when
EVAL_ONLY=true.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 src/srtctl/cli/do_sweep.py | 35 +++++++++++++++++++++++++++++------
 1 file changed, 29 insertions(+), 6 deletions(-)

diff --git a/src/srtctl/cli/do_sweep.py b/src/srtctl/cli/do_sweep.py
index 9f6e5777c..568a8abc9 100644
--- a/src/srtctl/cli/do_sweep.py
+++ b/src/srtctl/cli/do_sweep.py
@@ -180,13 +180,36 @@ def _print_connection_info(self) -> None:
         logger.info("")
 
     def _run_post_eval(self, stop_event: threading.Event) -> int:
-        """Run lm-eval after the main benchmark completes."""
+        """Run lm-eval after the main benchmark completes (or directly in eval-only mode)."""
         from srtctl.benchmarks import get_runner
-
-        # Health check: verify server is still up
-        if not wait_for_port(self.runtime.nodes.head, 8000, timeout=30):
-            logger.error("Server health check failed before eval - skipping")
-            return 1
+        from srtctl.core.health import wait_for_model
+
+        # In eval-only mode the benchmark health check was skipped, so do the
+        # full model-ready wait here.  In post-benchmark mode a quick port
+        # check is sufficient since the server already served traffic.
+        if os.environ.get("EVAL_ONLY", "false").lower() == "true":
+            r = self.config.resources
+            n_prefill = 0 if r.num_agg > 0 else r.num_prefill
+            n_decode = r.num_agg if r.num_agg > 0 else r.num_decode
+            hc = self.config.health_check
+            logger.info("EVAL_ONLY: Waiting for server health before eval...")
+            if not wait_for_model(
+                host=self.runtime.nodes.head,
+                port=8000,
+                n_prefill=n_prefill,
+                n_decode=n_decode,
+                poll_interval=float(hc.interval_seconds),
+                timeout=float(hc.max_attempts * hc.interval_seconds),
+                report_every=60.0,
+                frontend_type=self.config.frontend.type,
+                stop_event=stop_event,
+            ):
+                logger.error("Server did not become healthy for eval")
+                return 1
+        else:
+            if not wait_for_port(self.runtime.nodes.head, 8000, timeout=30):
+                logger.error("Server health check failed before eval - skipping")
+                return 1
 
         try:
             runner = get_runner("lm-eval")

From 211c454a73b447c2a65a5f17264b74f0f5a876f0 Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Sun, 15 Mar 2026 07:43:11 -0700
Subject: [PATCH 3/5] Use max benchmark concurrency for eval instead of --limit

Instead of capping eval examples with --limit to avoid timeouts,
use the highest benchmark concurrency for eval requests. This runs
the full eval set faster by matching the throughput the server was
already benchmarked at.

do_sweep.py forwards EVAL_CONC from the environment when it is set;
otherwise it computes max(config.benchmark.get_concurrency_list()) and
passes that value as EVAL_CONC to the lm-eval bench script.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 docs/accuracy.md                               |  4 ++--
 src/srtctl/benchmarks/scripts/lm-eval/bench.sh |  9 +++++----
 src/srtctl/cli/do_sweep.py                     | 12 ++++++++++++
 3 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/docs/accuracy.md b/docs/accuracy.md
index 768c6cfd4..d6dc975d7 100644
--- a/docs/accuracy.md
+++ b/docs/accuracy.md
@@ -213,7 +213,7 @@ srt-slurm supports an `EVAL_ONLY` mode that skips the throughput benchmark entir
 |---------|-------------|
 | `EVAL_ONLY` | Set to `true` to skip the throughput benchmark stage and run eval only |
 | `RUN_EVAL` | Set to `true` to run eval after the throughput benchmark completes |
-| `EVAL_CONC` | Concurrent requests for lm-eval (defaults to 256) |
+| `EVAL_CONC` | Concurrent requests for lm-eval (set by InferenceX to median of conc list; defaults to 256 if unset) |
 
 When `EVAL_ONLY=true`:
 - **Stage 4 (Benchmark)** is skipped entirely — no throughput test runs
@@ -240,7 +240,7 @@ Eval concurrency is set via the `EVAL_CONCURRENT_REQUESTS` environment variable
 export EVAL_CONCURRENT_REQUESTS="${EVAL_CONC:-256}"
 ```
 
-The InferenceX workflow derives `EVAL_CONC` from the highest value in the benchmark concurrency list.
+The InferenceX workflow sets `EVAL_CONC` to the median of the benchmark concurrency list (chosen in `mark_eval_entries`). If `EVAL_CONC` is not set in the environment, `do_sweep.py` falls back to the max of the benchmark concurrency list.
 
 ### Output
 
diff --git a/src/srtctl/benchmarks/scripts/lm-eval/bench.sh b/src/srtctl/benchmarks/scripts/lm-eval/bench.sh
index bc251d559..880c974ca 100755
--- a/src/srtctl/benchmarks/scripts/lm-eval/bench.sh
+++ b/src/srtctl/benchmarks/scripts/lm-eval/bench.sh
@@ -39,10 +39,11 @@ cd "${INFMAX_WORKSPACE}"
 source "${INFMAX_WORKSPACE}/benchmarks/benchmark_lib.sh"
 
 # Run lm-eval via benchmark_lib
-EVAL_CONC=256
-EVAL_LIMIT="${EVAL_LIMIT:-100}"
-echo "Running lm-eval with concurrent-requests=${EVAL_CONC}, limit=${EVAL_LIMIT}..."
-run_eval --framework lm-eval --port "$PORT" --concurrent-requests "$EVAL_CONC" --limit "$EVAL_LIMIT"
+# EVAL_CONC is set by the InferenceX workflow (median of conc list).
+# benchmark_lib reads concurrency from EVAL_CONCURRENT_REQUESTS env var.
+export EVAL_CONCURRENT_REQUESTS="${EVAL_CONC:-256}"
+echo "Running lm-eval with concurrent-requests=${EVAL_CONCURRENT_REQUESTS}..."
+run_eval --framework lm-eval --port "$PORT"
 
 # Set metadata env vars needed by append_lm_eval_summary
 # These are passed through from the InferenceX environment
diff --git a/src/srtctl/cli/do_sweep.py b/src/srtctl/cli/do_sweep.py
index 568a8abc9..abc4a862b 100644
--- a/src/srtctl/cli/do_sweep.py
+++ b/src/srtctl/cli/do_sweep.py
@@ -239,6 +239,18 @@ def _run_post_eval(self, stop_event: threading.Event) -> int:
         env_to_set["MODEL_NAME"] = self.config.served_model_name
         logger.info("Eval MODEL_NAME: %s", env_to_set["MODEL_NAME"])
 
+        # Use EVAL_CONC from workflow (median chosen by InferenceX mark_eval_entries),
+        # falling back to max of benchmark concurrency list.
+        eval_conc = os.environ.get("EVAL_CONC")
+        if eval_conc:
+            env_to_set["EVAL_CONC"] = eval_conc
+            logger.info("Eval concurrency (from workflow): %s", eval_conc)
+        else:
+            conc_list = self.config.benchmark.get_concurrency_list()
+            if conc_list:
+                env_to_set["EVAL_CONC"] = str(max(conc_list))
+                logger.info("Eval concurrency (max of %s): %s", conc_list, env_to_set["EVAL_CONC"])
+
         proc = start_srun_process(
             command=cmd,
             nodelist=[self.runtime.nodes.head],

From 702ff0078c12fea80e33bcefc395a9305e2bca4a Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Mon, 6 Apr 2026 16:56:31 -0700
Subject: [PATCH 4/5] update docs, clean up code

---
 docs/accuracy.md                              | 63 +++++++++++++------
 .../benchmarks/scripts/lm-eval/bench.sh       | 29 ++++++---
 src/srtctl/cli/do_sweep.py                    | 63 ++++++++++++++-----
 3 files changed, 110 insertions(+), 45 deletions(-)

diff --git a/docs/accuracy.md b/docs/accuracy.md
index d6dc975d7..98b69b468 100644
--- a/docs/accuracy.md
+++ b/docs/accuracy.md
@@ -196,57 +196,80 @@ The output includes per-category scores and aggregate metrics:
 
 The `lm-eval` benchmark runner integrates [EleutherAI/lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) via InferenceX's `benchmark_lib.sh`. Unlike the built-in benchmarks above, this runner sources evaluation logic from an external InferenceX workspace mounted at `/infmax-workspace`.
 
-This is used by InferenceX CI to run graded QnA (gsm8k, gpqa) against multi-node deployments on GB200/GB300.
+This is used by InferenceX CI to run evals such as GSM8K and GPQA against NVIDIA multi-node disaggregated deployments on GB200, GB300, B200, B300, H100, and H200. AMD MI355X multi-node evals are handled by InferenceX's upstreamed AMD Slurm path, not by this srt-slurm runner.
+
+In InferenceX CI, recipes normally keep their throughput benchmark configuration. `do_sweep.py` invokes the registered `lm-eval` runner as a post-step when `RUN_EVAL=true`, or as the only benchmark-like step when `EVAL_ONLY=true`. There is no separate `infmax-eval` benchmark type.
 
 ### How it works
 
-1. The runner script (`benchmarks/scripts/lm-eval/bench.sh`) auto-discovers the served model name from `/v1/models`
-2. Sources `benchmark_lib.sh` from the InferenceX workspace
-3. Runs `run_eval` and `append_lm_eval_summary` from benchmark_lib
-4. Copies eval artifacts (`meta_env.json`, `results*.json`, `sample*.jsonl`) to `/logs/eval_results/`
+1. `RuntimeContext` mounts the host path from `INFMAX_WORKSPACE` at `/infmax-workspace` inside the Slurm container.
+2. `do_sweep.py` starts infrastructure, workers, and the frontend for the normal recipe topology.
+3. For `EVAL_ONLY=true`, `do_sweep.py` skips the throughput benchmark stage and runs `_run_post_eval()` directly after frontend startup.
+4. `_run_post_eval()` waits for the OpenAI-compatible endpoint on port 8000 and, in eval-only mode, performs the full `wait_for_model()` health check for the configured prefill/decode or aggregated topology.
+5. `_run_post_eval()` launches the registered `lm-eval` runner on the head node and passes through InferenceX metadata such as framework, precision, sequence length, prefill/decode topology, and eval concurrency.
+6. The runner script (`benchmarks/scripts/lm-eval/bench.sh`) uses `MODEL_NAME` from `do_sweep.py`, or auto-discovers the served model from `/v1/models` as a fallback.
+7. The runner sources `/infmax-workspace/benchmarks/benchmark_lib.sh`, runs `run_eval --framework lm-eval`, and calls `append_lm_eval_summary`.
+8. Eval artifacts are copied to `/logs/eval_results/` for InferenceX launcher-side artifact pickup.
 
 ### EVAL_ONLY mode
 
-srt-slurm supports an `EVAL_ONLY` mode that skips the throughput benchmark entirely and runs only the lm-eval evaluation. This is controlled via environment variables:
+srt-slurm supports an `EVAL_ONLY` mode for CI jobs that should only validate accuracy. This is controlled by environment variables from the InferenceX workflow:
 
 | Env var | Description |
 |---------|-------------|
 | `EVAL_ONLY` | Set to `true` to skip the throughput benchmark stage and run eval only |
 | `RUN_EVAL` | Set to `true` to run eval after the throughput benchmark completes |
-| `EVAL_CONC` | Concurrent requests for lm-eval (set by InferenceX to median of conc list; defaults to 256 if unset) |
+| `EVAL_CONC` | Concurrent requests for lm-eval, normally set by InferenceX from the generated `eval-conc` value |
+| `INFMAX_WORKSPACE` | Host path to the InferenceX checkout that should be mounted at `/infmax-workspace` |
+| `MODEL_NAME` | Served model alias for OpenAI-compatible requests; set by `do_sweep.py` from `config.served_model_name` |
 
 When `EVAL_ONLY=true`:
-- **Stage 4 (Benchmark)** is skipped entirely — no throughput test runs
-- **Health check** uses the full `wait_for_model()` check (polls for all prefill/decode workers to be ready) since the benchmark stage's health check was skipped
-- **Stage 5 (Eval)** runs `_run_post_eval()` which launches the lm-eval benchmark runner
-- Eval failure is **fatal** (non-zero exit) since eval is the only purpose of the job
+- Stage 4 skips the throughput benchmark entirely. No throughput result JSON is expected from srt-slurm.
+- The eval path uses the full `wait_for_model()` health check before starting lm-eval.
+- `_run_post_eval()` launches the `lm-eval` runner and returns its exit code.
+- Eval failure is fatal because eval is the only purpose of the job.
 
 When `RUN_EVAL=true` (without `EVAL_ONLY`):
 - Throughput benchmark runs normally
 - After benchmark completes successfully, eval runs as a post-step
-- Eval failure is **non-fatal** — the job still succeeds if throughput passed
+- Eval failure is non-fatal; the benchmark job still succeeds if throughput passed
 
 ### Environment variables
 
 The following env vars are passed through to the lm-eval runner container:
 
-`FRAMEWORK`, `PRECISION`, `MODEL_PREFIX`, `RUNNER_TYPE`, `RESULT_FILENAME`, `SPEC_DECODING`, `ISL`, `OSL`, `PREFILL_TP`, `PREFILL_EP`, `PREFILL_DP_ATTN`, `DECODE_TP`, `DECODE_EP`, `DECODE_DP_ATTN`, `MODEL_NAME`, `EVAL_CONC`, `EVAL_ONLY`, `RUN_EVAL`
+| Env var | Purpose |
+|---------|---------|
+| `RUN_EVAL`, `EVAL_ONLY`, `IS_MULTINODE` | Control whether eval runs and how InferenceX classifies the artifact |
+| `FRAMEWORK`, `PRECISION`, `MODEL_PREFIX`, `RUNNER_TYPE`, `SPEC_DECODING` | Benchmark identity metadata for `meta_env.json` |
+| `ISL`, `OSL`, `RESULT_FILENAME` | Sequence length and result-file metadata |
+| `MODEL`, `MODEL_PATH`, `MODEL_NAME` | Model metadata and the served model alias used for requests |
+| `MAX_MODEL_LEN`, `EVAL_MAX_MODEL_LEN` | Context-length metadata used by InferenceX eval helpers when available |
+| `PREFILL_TP`, `PREFILL_EP`, `PREFILL_NUM_WORKERS`, `PREFILL_DP_ATTN` | Prefill-side topology metadata |
+| `DECODE_TP`, `DECODE_EP`, `DECODE_NUM_WORKERS`, `DECODE_DP_ATTN` | Decode-side topology metadata |
+| `EVAL_CONC`, `EVAL_CONCURRENT_REQUESTS` | Eval concurrency controls |
+
+The runner maps srt-slurm's `PREFILL_DP_ATTN` and `DECODE_DP_ATTN` names to InferenceX's `PREFILL_DP_ATTENTION` and `DECODE_DP_ATTENTION` names before calling `append_lm_eval_summary`. This is required for multi-node summary tables to preserve prefill/decode DPA state.
 
 ### Concurrency
 
-Eval concurrency is set via the `EVAL_CONCURRENT_REQUESTS` environment variable (read by `benchmark_lib.sh`). The runner script sets this from `EVAL_CONC`:
+Eval concurrency is ultimately read by InferenceX's `benchmark_lib.sh` from `EVAL_CONCURRENT_REQUESTS`. The runner script sets that value from `EVAL_CONC` when present, preserves an existing `EVAL_CONCURRENT_REQUESTS` otherwise, and falls back to `256` only if neither variable is set:
 
 ```bash
-export EVAL_CONCURRENT_REQUESTS="${EVAL_CONC:-256}"
+export EVAL_CONCURRENT_REQUESTS="${EVAL_CONC:-${EVAL_CONCURRENT_REQUESTS:-256}}"
 ```
 
-The InferenceX workflow sets `EVAL_CONC` to the median of the benchmark concurrency list (chosen in `mark_eval_entries`). If `EVAL_CONC` is not set in the environment, `do_sweep.py` falls back to the max of the benchmark concurrency list.
+The InferenceX workflow sets `EVAL_CONC` from the generated `eval-conc` value. For multi-node configs, InferenceX selects the `8k1k` entry with the highest max eligible concurrency for each `(model, runner, framework, precision, spec-decoding, prefill-dp-attn, decode-dp-attn)` group, then sets `eval-conc` to the upper median of that config's eligible concurrency list. If `EVAL_CONC` is not set in the environment, `do_sweep.py` falls back to the max of the recipe benchmark concurrency list.
 
 ### Output
 
 Eval artifacts are written to `/logs/eval_results/` inside the container:
-- `meta_env.json` — metadata (TP, conc, framework, precision, etc.)
-- `results*.json` — lm-eval scores per task
-- `sample*.jsonl` — per-sample outputs
+- `meta_env.json` - metadata used by InferenceX aggregation and summary tables
+- `results*.json` - lm-eval scores per task
+- `sample*.jsonl` - per-sample outputs
+
+These are collected by the InferenceX NVIDIA launch scripts and uploaded as workflow artifacts. In eval-only mode the InferenceX workflow expects eval artifacts, not throughput benchmark artifacts.
 
-These are collected by the InferenceX launch scripts (`launch_gb200-nv.sh`, `launch_gb300-nv.sh`) and uploaded as workflow artifacts.
+### Intricacies
+1. Eval concurrency floor of 16
+   - One sweep config has `conc: [1]`; evals run at that concurrency take more than 4 hours to complete, hence the floor.
diff --git a/src/srtctl/benchmarks/scripts/lm-eval/bench.sh b/src/srtctl/benchmarks/scripts/lm-eval/bench.sh
index 880c974ca..1cd47b8ad 100755
--- a/src/srtctl/benchmarks/scripts/lm-eval/bench.sh
+++ b/src/srtctl/benchmarks/scripts/lm-eval/bench.sh
@@ -41,20 +41,26 @@ source "${INFMAX_WORKSPACE}/benchmarks/benchmark_lib.sh"
 # Run lm-eval via benchmark_lib
 # EVAL_CONC is set by the InferenceX workflow (median of conc list).
 # benchmark_lib reads concurrency from EVAL_CONCURRENT_REQUESTS env var.
-export EVAL_CONCURRENT_REQUESTS="${EVAL_CONC:-256}"
+export EVAL_CONCURRENT_REQUESTS="${EVAL_CONC:-${EVAL_CONCURRENT_REQUESTS:-256}}"
 echo "Running lm-eval with concurrent-requests=${EVAL_CONCURRENT_REQUESTS}..."
-run_eval --framework lm-eval --port "$PORT"
+eval_rc=0
+run_eval --framework lm-eval --port "$PORT" || eval_rc=$?
 
 # Set metadata env vars needed by append_lm_eval_summary
 # These are passed through from the InferenceX environment
+export IS_MULTINODE="${IS_MULTINODE:-true}"
 export TP="${TP:-${PREFILL_TP:-1}}"
-export CONC="${CONC:-${EVAL_CONC}}"
-export EP_SIZE="${EP_SIZE:-1}"
-if [[ "${PREFILL_EP:-false}" == "true" ]]; then
-    EP_SIZE="${PREFILL_TP:-1}"
-fi
-export EP_SIZE
+export CONC="${CONC:-${EVAL_CONC:-${EVAL_CONCURRENT_REQUESTS:-1}}}"
+export EP_SIZE="${EP_SIZE:-${PREFILL_EP:-1}}"
+export PREFILL_TP="${PREFILL_TP:-${TP:-1}}"
+export PREFILL_EP="${PREFILL_EP:-${EP_SIZE:-1}}"
+export PREFILL_NUM_WORKERS="${PREFILL_NUM_WORKERS:-1}"
+export DECODE_TP="${DECODE_TP:-${TP:-1}}"
+export DECODE_EP="${DECODE_EP:-${EP_SIZE:-1}}"
+export DECODE_NUM_WORKERS="${DECODE_NUM_WORKERS:-1}"
 export DP_ATTENTION="${DP_ATTENTION:-${PREFILL_DP_ATTN:-false}}"
+export PREFILL_DP_ATTENTION="${PREFILL_DP_ATTENTION:-${PREFILL_DP_ATTN:-${DP_ATTENTION:-false}}}"
+export DECODE_DP_ATTENTION="${DECODE_DP_ATTENTION:-${DECODE_DP_ATTN:-${DP_ATTENTION:-false}}}"
 export ISL="${ISL:-}"
 export OSL="${OSL:-}"
 export FRAMEWORK="${FRAMEWORK:-}"
@@ -65,7 +71,7 @@ export RESULT_FILENAME="${RESULT_FILENAME:-}"
 
 # Generate the lm-eval summary
 echo "Generating lm-eval summary..."
-append_lm_eval_summary
+append_lm_eval_summary || true
 
 # Copy eval artifacts to /logs/eval_results/
 mkdir -p /logs/eval_results
@@ -74,4 +80,9 @@ cp -v meta_env.json /logs/eval_results/ 2>/dev/null || true
 cp -v results*.json /logs/eval_results/ 2>/dev/null || true
 cp -v sample*.jsonl /logs/eval_results/ 2>/dev/null || true
 
+if [[ "$eval_rc" -ne 0 ]]; then
+    echo "lm-eval evaluation failed with exit code ${eval_rc}"
+    exit "$eval_rc"
+fi
+
 echo "lm-eval evaluation complete"
diff --git a/src/srtctl/cli/do_sweep.py b/src/srtctl/cli/do_sweep.py
index abc4a862b..0cf429ecd 100644
--- a/src/srtctl/cli/do_sweep.py
+++ b/src/srtctl/cli/do_sweep.py
@@ -223,12 +223,34 @@ def _run_post_eval(self, stop_event: threading.Event) -> int:
         logger.info("Eval command: %s", " ".join(cmd))
         logger.info("Eval log: %s", eval_log)
 
-        # Pass through eval-related env vars
+        # Pass through eval-related env vars. InferenceX writes multi-node
+        # metadata from these variables in append_lm_eval_summary().
         env_to_set = {}
-        for var in ["RUN_EVAL", "FRAMEWORK", "PRECISION", "MODEL_PREFIX", "RUNNER_TYPE",
-                    "RESULT_FILENAME", "SPEC_DECODING", "ISL", "OSL",
-                    "PREFILL_TP", "PREFILL_EP", "PREFILL_DP_ATTN",
-                    "DECODE_TP", "DECODE_EP", "DECODE_DP_ATTN"]:
+        for var in [
+            "RUN_EVAL",
+            "EVAL_ONLY",
+            "IS_MULTINODE",
+            "FRAMEWORK",
+            "PRECISION",
+            "MODEL_PREFIX",
+            "RUNNER_TYPE",
+            "RESULT_FILENAME",
+            "SPEC_DECODING",
+            "ISL",
+            "OSL",
+            "MODEL",
+            "MODEL_PATH",
+            "MAX_MODEL_LEN",
+            "EVAL_MAX_MODEL_LEN",
+            "PREFILL_TP",
+            "PREFILL_EP",
+            "PREFILL_DP_ATTN",
+            "PREFILL_NUM_WORKERS",
+            "DECODE_TP",
+            "DECODE_EP",
+            "DECODE_DP_ATTN",
+            "DECODE_NUM_WORKERS",
+        ]:
             val = os.environ.get(var)
             if val:
                 env_to_set[var] = val
@@ -317,18 +339,27 @@ def run(self) -> int:
 
             self._print_connection_info()
 
-            # Stage 4: Benchmark (status reported AFTER health check passes)
-            exit_code = self.run_benchmark(registry, stop_event, reporter)
-
-            # Stage 5: Post-benchmark eval (optional, non-fatal)
-            if os.environ.get("RUN_EVAL", "false").lower() == "true" and exit_code == 0:
-                reporter.report(JobStatus.BENCHMARK, JobStage.BENCHMARK, "Running post-benchmark evaluation")
-                logger.info("RUN_EVAL=true: Running post-benchmark lm-eval evaluation...")
-                eval_exit = self._run_post_eval(stop_event)
-                if eval_exit != 0:
-                    logger.warning("Eval failed with exit code %d (benchmark result is still valid)", eval_exit)
+            if os.environ.get("EVAL_ONLY", "false").lower() == "true":
+                reporter.report(JobStatus.BENCHMARK, JobStage.BENCHMARK, "Running eval-only evaluation")
+                logger.info("EVAL_ONLY=true: Skipping benchmark stage and running lm-eval evaluation...")
+                exit_code = self._run_post_eval(stop_event)
+                if exit_code != 0:
+                    logger.error("Eval-only evaluation failed with exit code %d", exit_code)
                 else:
-                    logger.info("Post-benchmark eval completed successfully")
+                    logger.info("Eval-only evaluation completed successfully")
+            else:
+                # Stage 4: Benchmark (status reported AFTER health check passes)
+                exit_code = self.run_benchmark(registry, stop_event, reporter)
+
+                # Stage 5: Post-benchmark eval (optional, non-fatal)
+                if os.environ.get("RUN_EVAL", "false").lower() == "true" and exit_code == 0:
+                    reporter.report(JobStatus.BENCHMARK, JobStage.BENCHMARK, "Running post-benchmark evaluation")
+                    logger.info("RUN_EVAL=true: Running post-benchmark lm-eval evaluation...")
+                    eval_exit = self._run_post_eval(stop_event)
+                    if eval_exit != 0:
+                        logger.warning("Eval failed with exit code %d (benchmark result is still valid)", eval_exit)
+                    else:
+                        logger.info("Post-benchmark eval completed successfully")
 
         except Exception as e:
             logger.exception("Error during sweep: %s", e)

From 4fc6e27ffe7425db92a2079dc6863aa9b192dff4 Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Mon, 6 Apr 2026 18:02:38 -0700
Subject: [PATCH 5/5] Clean up

---
 src/srtctl/benchmarks/lm_eval.py              |  4 ----
 .../benchmarks/scripts/lm-eval/bench.sh       | 21 +++----------------
 2 files changed, 3 insertions(+), 22 deletions(-)

diff --git a/src/srtctl/benchmarks/lm_eval.py b/src/srtctl/benchmarks/lm_eval.py
index b6fae05f0..7667e97fd 100644
--- a/src/srtctl/benchmarks/lm_eval.py
+++ b/src/srtctl/benchmarks/lm_eval.py
@@ -1,11 +1,7 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
 """lm-eval benchmark runner for InferenceX evals."""
 
 from __future__ import annotations
 
-import os
 from typing import TYPE_CHECKING
 
 from srtctl.benchmarks.base import SCRIPTS_DIR, BenchmarkRunner, register_benchmark
diff --git a/src/srtctl/benchmarks/scripts/lm-eval/bench.sh b/src/srtctl/benchmarks/scripts/lm-eval/bench.sh
index 1cd47b8ad..8a03333b3 100755
--- a/src/srtctl/benchmarks/scripts/lm-eval/bench.sh
+++ b/src/srtctl/benchmarks/scripts/lm-eval/bench.sh
@@ -1,7 +1,4 @@
 #!/bin/bash
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
 # lm-eval accuracy evaluation using InferenceX benchmark_lib
 # Expects: endpoint [infmax_workspace]
 
@@ -46,28 +43,16 @@ echo "Running lm-eval with concurrent-requests=${EVAL_CONCURRENT_REQUESTS}..."
 eval_rc=0
 run_eval --framework lm-eval --port "$PORT" || eval_rc=$?
 
-# Set metadata env vars needed by append_lm_eval_summary
-# These are passed through from the InferenceX environment
+# Derive metadata env vars that append_lm_eval_summary needs but do_sweep.py
+# does not pass directly (it passes PREFILL_TP/EP/etc, not TP/EP_SIZE/CONC).
 export IS_MULTINODE="${IS_MULTINODE:-true}"
 export TP="${TP:-${PREFILL_TP:-1}}"
 export CONC="${CONC:-${EVAL_CONC:-${EVAL_CONCURRENT_REQUESTS:-1}}}"
 export EP_SIZE="${EP_SIZE:-${PREFILL_EP:-1}}"
-export PREFILL_TP="${PREFILL_TP:-${TP:-1}}"
-export PREFILL_EP="${PREFILL_EP:-${EP_SIZE:-1}}"
-export PREFILL_NUM_WORKERS="${PREFILL_NUM_WORKERS:-1}"
-export DECODE_TP="${DECODE_TP:-${TP:-1}}"
-export DECODE_EP="${DECODE_EP:-${EP_SIZE:-1}}"
-export DECODE_NUM_WORKERS="${DECODE_NUM_WORKERS:-1}"
 export DP_ATTENTION="${DP_ATTENTION:-${PREFILL_DP_ATTN:-false}}"
+# Remap srt-slurm's DP_ATTN names to InferenceX's DP_ATTENTION names
 export PREFILL_DP_ATTENTION="${PREFILL_DP_ATTENTION:-${PREFILL_DP_ATTN:-${DP_ATTENTION:-false}}}"
 export DECODE_DP_ATTENTION="${DECODE_DP_ATTENTION:-${DECODE_DP_ATTN:-${DP_ATTENTION:-false}}}"
-export ISL="${ISL:-}"
-export OSL="${OSL:-}"
-export FRAMEWORK="${FRAMEWORK:-}"
-export PRECISION="${PRECISION:-}"
-export MODEL_PREFIX="${MODEL_PREFIX:-}"
-export RUNNER_TYPE="${RUNNER_TYPE:-}"
-export RESULT_FILENAME="${RESULT_FILENAME:-}"
 
 # Generate the lm-eval summary
 echo "Generating lm-eval summary..."