SemiAnalysisAI · Oseltamivir · Mar 28, 2026 · Mar 7, 2026 · Mar 7, 2026 · Mar 7, 2026
diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml
@@ -54,6 +54,11 @@ on:
         type: boolean
         required: true
         default: false
+      eval-only:
+        description: "Run only evals (skip throughput benchmark)"
+        type: boolean
+        required: false
+        default: false
       random-range-ratio:
         required: false
         type: string
@@ -83,6 +88,7 @@ env:
   SPEC_DECODING: ${{ inputs.spec-decoding }}
   DISAGG: ${{ inputs.disagg }}
   RUN_EVAL: ${{ inputs.run-eval }}
+  EVAL_ONLY: ${{ inputs.eval-only }}
 
 permissions:
   contents: read
@@ -91,7 +97,7 @@ jobs:
   benchmark:
     runs-on: ${{ inputs.runner }}
     timeout-minutes: 300
-    name: "${{ inputs.exp-name }} ${{ inputs.precision }} ${{ inputs.runner }} ${{ inputs.framework }} | tp=${{ inputs.tp }} ep=${{ inputs.ep }} dpa=${{ inputs.dp-attn }} | disagg-${{ inputs.disagg }} spec-${{ inputs.spec-decoding }} conc-${{ inputs.conc }}${{ inputs.run-eval && ' | eval' || '' }}"
+    name: "${{ inputs.exp-name }} ${{ inputs.precision }} ${{ inputs.runner }} ${{ inputs.framework }} | tp=${{ inputs.tp }} ep=${{ inputs.ep }} dpa=${{ inputs.dp-attn }} | disagg-${{ inputs.disagg }} spec-${{ inputs.spec-decoding }} conc-${{ inputs.conc }}${{ inputs.eval-only && ' | eval-only' || (inputs.run-eval && ' | eval' || '') }}"
     steps:
       - name: Resource cleanup (pre-run)
         run: &resource-cleanup |
@@ -145,28 +151,35 @@ jobs:
           echo "RESULT_FILENAME=${RESULT_FILENAME}" >> $GITHUB_ENV
 
           bash ./runners/launch_${RUNNER_NAME%%_*}.sh
-          FOUND_RESULT_FILE=
-          for i in {1..10}; do
-            if [ -f "$RESULT_FILENAME.json" ]; then
-              FOUND_RESULT_FILE=true
-              break
-            fi
-            echo "Waiting for result file... (attempt $i)"
-            sleep 1
-          done
 
-          if [ -z "$FOUND_RESULT_FILE" ]; then
-            echo "Run failed: Benchmark result $RESULT_FILENAME.json not found." >&2
-            exit 1
+          if [ "${{ inputs.eval-only }}" = "true" ]; then
+            echo "Eval-only mode: skipping benchmark result file check"
+          else
+            FOUND_RESULT_FILE=
+            for i in {1..10}; do
+              if [ -f "$RESULT_FILENAME.json" ]; then
+                FOUND_RESULT_FILE=true
+                break
+              fi
+              echo "Waiting for result file... (attempt $i)"
+              sleep 1
+            done
+
+            if [ -z "$FOUND_RESULT_FILE" ]; then
+              echo "Run failed: Benchmark result $RESULT_FILENAME.json not found." >&2
+              exit 1
+            fi
           fi
 
       - name: Process result
+        if: ${{ !inputs.eval-only }}
         env:
           RUNNER_TYPE: ${{ inputs.runner }}
         run: |
           python3 utils/process_result.py
 
       - name: Upload result
+        if: ${{ !inputs.eval-only }}
         uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
         with:
           name: bmk_${{ env.RESULT_FILENAME }}
@@ -189,7 +202,7 @@ jobs:
           if-no-files-found: ignore
 
       - name: Upload eval results (if any)
-        if: ${{ env.RUN_EVAL == 'true' }}
+        if: ${{ env.RUN_EVAL == 'true' || inputs.eval-only }}
         uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
         with:
           name: eval_${{ env.EXP_NAME }}_${{ env.RESULT_FILENAME }}
@@ -200,7 +213,7 @@ jobs:
           if-no-files-found: ignore
 
       - name: Cleanup eval outputs (post-upload)
-        if: ${{ env.RUN_EVAL == 'true' }}
+        if: ${{ env.RUN_EVAL == 'true' || inputs.eval-only }}
         run: |
           rm -f meta_env.json || true
           # Remove any eval results JSONs that were moved into workspace

diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml
@@ -37,6 +37,7 @@ jobs:
         outputs:
             single-node-config: ${{ steps.get-jobs.outputs.single-node-config }}
             multi-node-config: ${{ steps.get-jobs.outputs.multi-node-config }}
+            eval-config: ${{ steps.get-jobs.outputs.eval-config }}
         steps:
             - name: Checkout code (ref)
               if: ${{ inputs.ref && inputs.ref != '' }}
@@ -53,10 +54,12 @@ jobs:
                   pip install pydantic
                   CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py \
                     ${{ inputs.generate-cli-command || github.event.inputs.generate-cli-command }})
-                  SINGLE=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x]))")
+                  SINGLE=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and not x.get('run-eval', False)]))")
                   MULTI=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' in x]))")
+                  EVALS=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and x.get('run-eval', False)]))")
                   echo "single-node-config=$SINGLE" >> $GITHUB_OUTPUT
                   echo "multi-node-config=$MULTI" >> $GITHUB_OUTPUT
+                  echo "eval-config=$EVALS" >> $GITHUB_OUTPUT
 
     test-sweep-multi-node:
         needs: get-jobs
@@ -123,7 +126,38 @@ jobs:
             conc: ${{ matrix.config.conc }}
             spec-decoding: ${{ matrix.config.spec-decoding }}
             disagg: ${{ matrix.config.disagg }}
-            run-eval: ${{ matrix.config.run-eval }}
+            run-eval: false
+            ref: ${{ inputs.ref }}
+
+    # Eval sweep: calls benchmark-tmpl.yml once per entry of the get-jobs
+    # `eval-config` output (entries without a `prefill` key that have `run-eval`
+    # set). The job is skipped when that output is the literal string '[]'.
+    test-sweep-evals:
+        needs: get-jobs
+        if: ${{ needs.get-jobs.outputs.eval-config != '[]' }}
+        uses: ./.github/workflows/benchmark-tmpl.yml
+        name: eval /
+        strategy:
+            fail-fast: false
+            matrix:
+                config: ${{ fromJson(needs.get-jobs.outputs.eval-config) }}
+        secrets: inherit
+        with:
+            exp-name: ${{ matrix.config.exp-name }}
+            isl: ${{ matrix.config.isl }}
+            osl: ${{ matrix.config.osl }}
+            max-model-len: ${{ matrix.config.max-model-len }}
+            runner: ${{ matrix.config.runner }}
+            image: ${{ matrix.config.image }}
+            model: ${{ matrix.config.model }}
+            model-prefix: ${{ matrix.config.model-prefix }}
+            framework: ${{ matrix.config.framework }}
+            precision: ${{ matrix.config.precision }}
+            tp: ${{ matrix.config.tp }}
+            ep: ${{ matrix.config.ep }}
+            dp-attn: ${{ matrix.config.dp-attn }}
+            conc: ${{ matrix.config.conc }}
+            spec-decoding: ${{ matrix.config.spec-decoding }}
+            disagg: ${{ matrix.config.disagg }}
+            # Both flags are hard-coded: every entry in this matrix is an eval run.
+            run-eval: true
+            # eval-only is the template input described as
+            # "Run only evals (skip throughput benchmark)".
+            eval-only: true
             ref: ${{ inputs.ref }}
 
     collect-results:
@@ -135,7 +169,7 @@ jobs:
             result-prefix: "bmk"
 
     collect-evals:
-        needs: [test-sweep-multi-node, test-sweep-single-node]
+        needs: [test-sweep-evals]
         if: ${{ always() }}
         uses: ./.github/workflows/collect-evals.yml
         secrets: inherit

diff --git a/.github/workflows/run-sweep.yml b/.github/workflows/run-sweep.yml
@@ -183,6 +183,36 @@ jobs:
         secrets: inherit
         with: *single-node-inputs
 
+    # Eval sweep: calls benchmark-tmpl.yml once per entry of the `evals` list in
+    # the setup job's search-space-config output. The job is skipped when that
+    # list serializes to '[]' or 'null'.
+    sweep-evals:
+        needs: setup
+        if: ${{ toJson(fromJson(needs.setup.outputs.search-space-config).evals) != '[]' && toJson(fromJson(needs.setup.outputs.search-space-config).evals) != 'null' }}
+        uses: ./.github/workflows/benchmark-tmpl.yml
+        name: eval /
+        strategy:
+            fail-fast: false
+            matrix:
+                config: ${{ fromJson(needs.setup.outputs.search-space-config).evals }}
+        secrets: inherit
+        with:
+            exp-name: ${{ matrix.config.exp-name }}
+            isl: ${{ matrix.config.isl }}
+            osl: ${{ matrix.config.osl }}
+            max-model-len: ${{ matrix.config.max-model-len }}
+            runner: ${{ matrix.config.runner }}
+            image: ${{ matrix.config.image }}
+            model: ${{ matrix.config.model }}
+            model-prefix: ${{ matrix.config.model-prefix }}
+            framework: ${{ matrix.config.framework }}
+            precision: ${{ matrix.config.precision }}
+            tp: ${{ matrix.config.tp }}
+            ep: ${{ matrix.config.ep }}
+            dp-attn: ${{ matrix.config.dp-attn }}
+            conc: ${{ matrix.config.conc }}
+            spec-decoding: ${{ matrix.config.spec-decoding }}
+            disagg: ${{ matrix.config.disagg }}
+            # Both flags are hard-coded: every entry in this matrix is an eval run.
+            run-eval: true
+            # eval-only is the template input described as
+            # "Run only evals (skip throughput benchmark)".
+            eval-only: true
+
+
     collect-results:
         needs:
             [
@@ -201,16 +231,7 @@ jobs:
             result-prefix: "bmk"
 
     collect-evals:
-        needs:
-            [
-                sweep-single-node-1k1k,
-                sweep-single-node-1k8k,
-                sweep-single-node-8k1k,
-                sweep-multi-node-1k1k,
-                sweep-multi-node-1k8k,
-                sweep-multi-node-8k1k,
-                setup,
-            ]
+        needs: [sweep-evals, setup]
         if: ${{ always() && needs.setup.result != 'skipped' }}
         uses: ./.github/workflows/collect-evals.yml
         secrets: inherit

diff --git a/AGENTS.md b/AGENTS.md
@@ -307,7 +307,7 @@ Evals are **off by default** (`RUN_EVAL=false`). When enabled, they run for two
 
 This selection logic is in `mark_eval_entries()` in `utils/matrix_logic/generate_sweep_configs.py`.
 
-**Note**: Evals only run on `1k8k` sequence length.
+**Note**: Evals only run on `8k1k` sequence length.
 
 ### Eval Framework: lm-eval
 

diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
@@ -174,6 +174,12 @@ wait_for_server_ready() {
 #   --trust-remote-code: Optional flag to trust remote code from HuggingFace
 #   --server-pid: Optional server process ID to monitor during benchmark
 run_benchmark_serving() {
+    # In eval-only mode, skip the throughput benchmark entirely.
+    if [ "${EVAL_ONLY}" = "true" ]; then
+        echo "EVAL_ONLY mode: skipping throughput benchmark"
+        return 0
+    fi
+
     set +x
     local model=""
     local port=""
@@ -486,6 +492,9 @@ move_profile_trace_for_relay() {
 # ------------------------------
 
 _install_lm_eval_deps() {
+    # Remove torchvision to avoid circular import issues in ATOM containers.
+    # lm_eval[api] uses local-chat-completions (API-based) and does not need it.
+    python3 -m pip uninstall -y torchvision 2>/dev/null || true
     python3 -m pip install -q --no-cache-dir --break-system-packages "lm-eval[api]" || true
     local lm_eval_ref="b315ef3b05176acc9732bb7fdec116abe1ecc476"
     if command -v git >/dev/null 2>&1; then
@@ -574,26 +583,56 @@ PY
     export PYTHONPATH="${patch_dir}:${PYTHONPATH:-}"
 }
 
+# Print the model's native maximum context length, read from its config via
+# transformers.AutoConfig.
+#
+# Arguments:
+#   $1  model name or path handed to AutoConfig.from_pretrained
+# Output: the value of the first attribute present on the config, checked in the
+#   order max_position_embeddings, max_sequence_length, seq_length, n_positions.
+#   Prints nothing when none of them exists.
+#
+# NOTE: the config is loaded with trust_remote_code=True.
+get_native_max_context_length() {
+    local model_path="$1"
+    # Hand the path over as argv instead of splicing it into the Python source,
+    # so a quote or backslash in it can neither break nor inject into the script.
+    python3 -c "
+import sys
+from transformers import AutoConfig
+config = AutoConfig.from_pretrained(sys.argv[1], trust_remote_code=True)
+for attr in ['max_position_embeddings', 'max_sequence_length', 'seq_length', 'n_positions']:
+    if hasattr(config, attr):
+        print(getattr(config, attr))
+        break
+" "$model_path"
+}
+
+# Compute the context length for eval-only mode.
+# Uses 5x the benchmark context, capped at the model's native max. When no
+# benchmark context is given (missing, empty, 0 or non-numeric), the native max
+# itself is used.
+#
+# Arguments:
+#   $1  model name/path, passed to get_native_max_context_length
+#   $2  benchmark context length in tokens (optional, default 0)
+# Output: echoes the computed value and exports it as EVAL_MAX_MODEL_LEN
+#   (read by run_lm_eval).
+# Returns: 1, with a message on stderr and nothing exported, when neither a
+#   benchmark context nor the native max is available.
+#
+# NOTE: inside $(...) the function runs in a subshell, so the export does not
+# reach the caller; call it directly (stdout redirected) when the exported
+# variable is what you need.
+#
+# Usage:
+#   ctx=$(compute_eval_context_length "$MODEL" "$current_ctx")      # capture value
+#   compute_eval_context_length "$MODEL" "$current_ctx" > /dev/null  # export only
+compute_eval_context_length() {
+    local model="$1"
+    local benchmark_ctx="${2:-0}"
+    local native_max
+    native_max=$(get_native_max_context_length "$model")
+
+    # Normalise both inputs to non-negative integers: the lookup may print
+    # nothing, and callers pass 0 to mean "no benchmark context".
+    [[ "$native_max" =~ ^[0-9]+$ ]] || native_max=0
+    [[ "$benchmark_ctx" =~ ^[0-9]+$ ]] || benchmark_ctx=0
+
+    local eval_ctx
+    if [ "$benchmark_ctx" -gt 0 ]; then
+        # 10# forces base 10 so a leading zero is not read as octal.
+        eval_ctx=$(( 10#$benchmark_ctx * 5 ))
+        # Cap only when the native max is known; otherwise keep the 5x value.
+        if [ "$native_max" -gt 0 ] && [ "$eval_ctx" -gt "$native_max" ]; then
+            eval_ctx="$native_max"
+        fi
+    elif [ "$native_max" -gt 0 ]; then
+        eval_ctx="$native_max"
+    else
+        echo "compute_eval_context_length: no benchmark context and no native max for '${model}'" >&2
+        return 1
+    fi
+    export EVAL_MAX_MODEL_LEN="$eval_ctx"
+    echo "$eval_ctx"
+}
+
 run_lm_eval() {
     local port="${PORT:-8888}"
-    local task="${EVAL_TASK:-gsm8k}"
-    local num_fewshot="${NUM_FEWSHOT:-2}"
+    local tasks_dir="${EVAL_TASKS_DIR:-utils/evals/gsm8k.yaml}" 
     local results_dir="${EVAL_RESULT_DIR:-$(mktemp -d /tmp/eval_out-XXXXXX)}"
-    local gen_max_tokens=16384
+    local gen_max_tokens="${EVAL_MAX_MODEL_LEN:-16384}"
     local temperature=0
     local top_p=1
-    local concurrent_requests=32
+    local concurrent_requests="${EVAL_CONCURRENT_REQUESTS:-64}"
 
     while [[ $# -gt 0 ]]; do
         case $1 in
             --port)           port="$2"; shift 2 ;;
-            --task)           task="$2"; shift 2 ;;
-            --num-fewshot)    num_fewshot="$2"; shift 2 ;;
+            --task)           tasks_dir="$2"; shift 2 ;;
             --results-dir)    results_dir="$2"; shift 2 ;;
             --gen-max-tokens) gen_max_tokens="$2"; shift 2 ;;
             --temperature)    temperature="$2"; shift 2 ;;
             --top-p)          top_p="$2"; shift 2 ;;
-            --concurrent-requests) concurrent_requests="$2"; shift 2 ;;
+            --concurrent-requests) shift 2; continue ;; # ignored; use EVAL_CONCURRENT_REQUESTS env var
             *)                echo "Unknown parameter: $1"; return 1 ;;
         esac
     done
@@ -606,16 +645,19 @@ run_lm_eval() {
     export OPENAI_API_KEY=${OPENAI_API_KEY:-EMPTY}
     MODEL_NAME=${MODEL_NAME:-$MODEL} # Prefer MODEL_NAME, else MODEL
 
+    # Cap generation tokens to avoid excessive KV cache reservation per request on TRT.
+    local max_gen_tokens=16384
+    echo "Eval context budget: max_length=${gen_max_tokens}, max_gen_tokens=${max_gen_tokens}"
+
     # Export for append_lm_eval_summary to pick up
     export EVAL_RESULT_DIR="$results_dir"
     set -x
     python3 -m lm_eval --model local-chat-completions --apply_chat_template \
-      --tasks "utils/evals/${task}.yaml" \
-      --num_fewshot "${num_fewshot}" \
+      --tasks "${tasks_dir}" \
       --output_path "${results_dir}" \
       --log_samples \
-      --model_args "model=${MODEL_NAME},base_url=${openai_chat_base},api_key=${OPENAI_API_KEY},eos_string=</s>,max_retries=5,num_concurrent=${concurrent_requests},timeout=600,tokenized_requests=False,max_length=${gen_max_tokens}" \
-      --gen_kwargs "max_tokens=8192,temperature=${temperature},top_p=${top_p}"
+      --model_args "model=${MODEL_NAME},base_url=${openai_chat_base},api_key=${OPENAI_API_KEY},eos_string=</s>,max_retries=5,num_concurrent=${concurrent_requests},timeout=1800,tokenized_requests=False,max_length=${gen_max_tokens}" \
+      --gen_kwargs "max_tokens=${max_gen_tokens},temperature=${temperature},top_p=${top_p}"
     local eval_exit=$?
     set +x
     return $eval_exit
@@ -706,8 +748,14 @@ run_eval() {
         esac
     done
 
+    # Compute EVAL_MAX_MODEL_LEN if not already set by the calling script
+    if [ -z "${EVAL_MAX_MODEL_LEN:-}" ]; then
+        compute_eval_context_length "$MODEL" "${MAX_MODEL_LEN:-0}" > /dev/null
+    fi
+
     case "$framework" in
         lm-eval|lm_eval) run_lm_eval "${forwarded[@]}" ;;
         *)               echo "Unknown framework '${framework}'"; return 1 ;;
     esac
+    return $?
 }
diff --git a/benchmarks/single_node/dsr1_fp4_b200.sh b/benchmarks/single_node/dsr1_fp4_b200.sh
@@ -31,6 +31,10 @@ else
 fi
 echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL"
 
+EVAL_CONTEXT_ARGS=""
+if [ "${EVAL_ONLY}" = "true" ]; then
+    EVAL_CONTEXT_ARGS="--context-length $(compute_eval_context_length "$MODEL" "$((ISL + OSL + 20))")"
+fi
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
@@ -40,7 +44,7 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0.
 --cuda-graph-max-bs 256 --max-running-requests 256 --mem-fraction-static 0.85 --kv-cache-dtype fp8_e4m3 \
 --chunked-prefill-size 16384 \
 --ep-size $EP_SIZE --quantization modelopt_fp4 --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \
---enable-symm-mem --disable-radix-cache --attention-backend trtllm_mla --moe-runner-backend flashinfer_trtllm --stream-interval 10 > $SERVER_LOG 2>&1 &
+--enable-symm-mem --disable-radix-cache --attention-backend trtllm_mla --moe-runner-backend flashinfer_trtllm --stream-interval 10 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
 
@@ -63,7 +67,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi