Skip to content
Merged
Show file tree
Hide file tree
Changes from 22 commits
Commits
Show all changes
58 commits
Select commit Hold shift + click to select a range
cfcaa84
feat: switch evals to 8k1k and restart server with native max context
Oseltamivir Mar 7, 2026
8d98b94
fix: cap gen_kwargs max_tokens to leave room for prompt
Oseltamivir Mar 7, 2026
4452868
fix: reserve 30% of context for prompt in eval gen_kwargs
Oseltamivir Mar 7, 2026
83cd7de
change eval calling
Oseltamivir Mar 9, 2026
161e7f6
fix: eval-only server wait and PEP 668 pip install
Oseltamivir Mar 9, 2026
73b6846
change gsm8k to 8-shot
Oseltamivir Mar 9, 2026
83e185d
refactor: decouple eval concurrency, cap gen tokens, fix eval config
Oseltamivir Mar 10, 2026
aad5336
decouple
Oseltamivir Mar 10, 2026
375f14e
fix: uninstall torchvision before lm_eval to fix ATOM container impor…
Oseltamivir Mar 11, 2026
1cdc72a
add gpqa
Oseltamivir Mar 11, 2026
49b0b90
fix: pass multiple eval tasks as separate args for older lm-eval compat
Oseltamivir Mar 11, 2026
cc59d50
fix: run eval tasks sequentially for cross-version lm-eval compat
Oseltamivir Mar 11, 2026
c8b5858
fix: use directory-based task discovery for multi-eval in single lm_e…
Oseltamivir Mar 11, 2026
ffee6c5
gsm8k only
Oseltamivir Mar 12, 2026
c577ca2
Merge branch 'main' into eval-8k1k-server-restart
Oseltamivir Mar 12, 2026
dc25ccd
fix: cap max_gen_tokens to server's max_model_len to avoid request re…
Oseltamivir Mar 12, 2026
285b662
fix: add eval context length override to remaining 18 scripts
Oseltamivir Mar 12, 2026
2d9d7ba
fix: default EVAL_TASKS_DIR to utils/evals directory, not single yaml…
Oseltamivir Mar 13, 2026
1ca2173
fix: reduce eval context multiplier to 5x and increase request timeou…
Oseltamivir Mar 13, 2026
7458eea
Merge branch 'main' into eval-8k1k-server-restart
Oseltamivir Mar 13, 2026
6999263
test other prompt
Oseltamivir Mar 15, 2026
ba45203
pr
Oseltamivir Mar 15, 2026
826035c
test evals
Oseltamivir Mar 15, 2026
34bc7c4
resolve claude issues
Oseltamivir Mar 15, 2026
4978aed
torchvision
Oseltamivir Mar 16, 2026
c038e1b
make stuff neater, ready for merge
Oseltamivir Mar 16, 2026
29d69fa
resolve issues, add --no-evals, change default to flag-less
Oseltamivir Mar 18, 2026
327fd6d
ctxt len
Oseltamivir Mar 18, 2026
6350b6b
resolve claude
Oseltamivir Mar 18, 2026
9ae1ae4
Merge branch 'main' into eval-8k1k-server-restart
Oseltamivir Mar 18, 2026
dec6f60
h200 change
Oseltamivir Mar 18, 2026
b08e063
final touches
Oseltamivir Mar 19, 2026
f04881d
test normal perf-changelog
Oseltamivir Mar 19, 2026
86764fa
Merge branch 'main' into eval-8k1k-server-restart
Oseltamivir Mar 19, 2026
c17619f
test normal perf-changelog
Oseltamivir Mar 20, 2026
d30f807
all evals
Oseltamivir Mar 20, 2026
766a742
remove pycache
Oseltamivir Mar 20, 2026
5d5dd7b
argmax error
Oseltamivir Mar 20, 2026
f54b09c
Merge branch 'main' into eval-8k1k-server-restart
Oseltamivir Mar 20, 2026
b9551f6
merge main
Oseltamivir Mar 21, 2026
f0fff18
blocking rm
Oseltamivir Mar 21, 2026
8bf41f0
Merge branch 'main' into eval-8k1k-server-restart
Oseltamivir Mar 21, 2026
38b80a8
standardize
Oseltamivir Mar 22, 2026
5beca55
reduce ctxt OOM
Oseltamivir Mar 22, 2026
4dcfc92
reduce ctxt OOM
Oseltamivir Mar 22, 2026
f74ad43
block size
Oseltamivir Mar 22, 2026
04c9e88
EACCES
Oseltamivir Mar 22, 2026
2f3a788
final
Oseltamivir Mar 23, 2026
99f5d21
remove sudo rm
Oseltamivir Mar 24, 2026
c23031b
Merge branch 'main' into eval-8k1k-server-restart
Oseltamivir Mar 24, 2026
ac655d3
Merge branch 'main' into eval-8k1k-server-restart
Oseltamivir Mar 26, 2026
bec4dba
Fix newline at end of file in perf-changelog.yaml
Oseltamivir Mar 26, 2026
f4c332b
Merge main
Oseltamivir Mar 27, 2026
87f711e
Merge branch 'main' into eval-8k1k-server-restart
Oseltamivir Mar 27, 2026
ed3b2fc
eliminate 1k8k
Oseltamivir Mar 28, 2026
194d52a
Merge branch 'main' into eval-8k1k-server-restart
Oseltamivir Mar 28, 2026
f5fd82c
changelog
Oseltamivir Mar 28, 2026
ecb4717
final
Oseltamivir Mar 28, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 28 additions & 15 deletions .github/workflows/benchmark-tmpl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,11 @@ on:
type: boolean
required: true
default: false
eval-only:
description: "Run only evals (skip throughput benchmark)"
type: boolean
required: false
default: false
random-range-ratio:
required: false
type: string
Expand Down Expand Up @@ -83,6 +88,7 @@ env:
SPEC_DECODING: ${{ inputs.spec-decoding }}
DISAGG: ${{ inputs.disagg }}
Comment thread
Oseltamivir marked this conversation as resolved.
RUN_EVAL: ${{ inputs.run-eval }}
EVAL_ONLY: ${{ inputs.eval-only }}

permissions:
contents: read
Expand All @@ -91,7 +97,7 @@ jobs:
benchmark:
runs-on: ${{ inputs.runner }}
timeout-minutes: 300
name: "${{ inputs.exp-name }} ${{ inputs.precision }} ${{ inputs.runner }} ${{ inputs.framework }} | tp=${{ inputs.tp }} ep=${{ inputs.ep }} dpa=${{ inputs.dp-attn }} | disagg-${{ inputs.disagg }} spec-${{ inputs.spec-decoding }} conc-${{ inputs.conc }}${{ inputs.run-eval && ' | eval' || '' }}"
name: "${{ inputs.exp-name }} ${{ inputs.precision }} ${{ inputs.runner }} ${{ inputs.framework }} | tp=${{ inputs.tp }} ep=${{ inputs.ep }} dpa=${{ inputs.dp-attn }} | disagg-${{ inputs.disagg }} spec-${{ inputs.spec-decoding }} conc-${{ inputs.conc }}${{ inputs.eval-only && ' | eval-only' || (inputs.run-eval && ' | eval' || '') }}"
steps:
- name: Resource cleanup (pre-run)
run: &resource-cleanup |
Expand Down Expand Up @@ -145,28 +151,35 @@ jobs:
echo "RESULT_FILENAME=${RESULT_FILENAME}" >> $GITHUB_ENV

bash ./runners/launch_${RUNNER_NAME%%_*}.sh
FOUND_RESULT_FILE=
for i in {1..10}; do
if [ -f "$RESULT_FILENAME.json" ]; then
FOUND_RESULT_FILE=true
break
fi
echo "Waiting for result file... (attempt $i)"
sleep 1
done

if [ -z "$FOUND_RESULT_FILE" ]; then
echo "Run failed: Benchmark result $RESULT_FILENAME.json not found." >&2
exit 1
if [ "${{ inputs.eval-only }}" = "true" ]; then
Comment thread
Oseltamivir marked this conversation as resolved.
echo "Eval-only mode: skipping benchmark result file check"
else
FOUND_RESULT_FILE=
for i in {1..10}; do
if [ -f "$RESULT_FILENAME.json" ]; then
FOUND_RESULT_FILE=true
break
fi
echo "Waiting for result file... (attempt $i)"
sleep 1
done

if [ -z "$FOUND_RESULT_FILE" ]; then
echo "Run failed: Benchmark result $RESULT_FILENAME.json not found." >&2
exit 1
fi
fi

- name: Process result
if: ${{ !inputs.eval-only }}
env:
RUNNER_TYPE: ${{ inputs.runner }}
run: |
python3 utils/process_result.py

- name: Upload result
if: ${{ !inputs.eval-only }}
uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
with:
name: bmk_${{ env.RESULT_FILENAME }}
Expand All @@ -189,7 +202,7 @@ jobs:
if-no-files-found: ignore

- name: Upload eval results (if any)
if: ${{ env.RUN_EVAL == 'true' }}
if: ${{ env.RUN_EVAL == 'true' || inputs.eval-only }}
uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
with:
name: eval_${{ env.EXP_NAME }}_${{ env.RESULT_FILENAME }}
Comment thread
Oseltamivir marked this conversation as resolved.
Expand All @@ -200,7 +213,7 @@ jobs:
if-no-files-found: ignore

- name: Cleanup eval outputs (post-upload)
if: ${{ env.RUN_EVAL == 'true' }}
if: ${{ env.RUN_EVAL == 'true' || inputs.eval-only }}
run: |
rm -f meta_env.json || true
Comment thread
Oseltamivir marked this conversation as resolved.
# Remove any eval results JSONs that were moved into workspace
Comment thread
Oseltamivir marked this conversation as resolved.
Expand Down
40 changes: 37 additions & 3 deletions .github/workflows/e2e-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ jobs:
outputs:
single-node-config: ${{ steps.get-jobs.outputs.single-node-config }}
multi-node-config: ${{ steps.get-jobs.outputs.multi-node-config }}
eval-config: ${{ steps.get-jobs.outputs.eval-config }}
steps:
- name: Checkout code (ref)
if: ${{ inputs.ref && inputs.ref != '' }}
Expand All @@ -53,10 +54,12 @@ jobs:
pip install pydantic
CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py \
${{ inputs.generate-cli-command || github.event.inputs.generate-cli-command }})
SINGLE=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x]))")
SINGLE=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and not x.get('run-eval', False)]))")
Comment thread
Oseltamivir marked this conversation as resolved.
MULTI=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' in x]))")
EVALS=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and x.get('run-eval', False)]))")
echo "single-node-config=$SINGLE" >> $GITHUB_OUTPUT
echo "multi-node-config=$MULTI" >> $GITHUB_OUTPUT
echo "eval-config=$EVALS" >> $GITHUB_OUTPUT

test-sweep-multi-node:
needs: get-jobs
Expand Down Expand Up @@ -123,7 +126,38 @@ jobs:
conc: ${{ matrix.config.conc }}
spec-decoding: ${{ matrix.config.spec-decoding }}
disagg: ${{ matrix.config.disagg }}
run-eval: ${{ matrix.config.run-eval }}
run-eval: false
ref: ${{ inputs.ref }}

# Eval-only sweep: calls the reusable benchmark template once per entry of the
# `eval-config` output produced by the `get-jobs` job.
test-sweep-evals:
needs: get-jobs
# Skip the whole job when get-jobs emitted an empty eval list.
if: ${{ needs.get-jobs.outputs.eval-config != '[]' }}
uses: ./.github/workflows/benchmark-tmpl.yml
name: eval /
strategy:
# Let the remaining matrix entries keep running when one of them fails.
fail-fast: false
matrix:
config: ${{ fromJson(needs.get-jobs.outputs.eval-config) }}
secrets: inherit
with:
# Every input below is forwarded unchanged from the matrix entry.
exp-name: ${{ matrix.config.exp-name }}
isl: ${{ matrix.config.isl }}
osl: ${{ matrix.config.osl }}
max-model-len: ${{ matrix.config.max-model-len }}
runner: ${{ matrix.config.runner }}
image: ${{ matrix.config.image }}
model: ${{ matrix.config.model }}
model-prefix: ${{ matrix.config.model-prefix }}
framework: ${{ matrix.config.framework }}
precision: ${{ matrix.config.precision }}
tp: ${{ matrix.config.tp }}
ep: ${{ matrix.config.ep }}
dp-attn: ${{ matrix.config.dp-attn }}
conc: ${{ matrix.config.conc }}
spec-decoding: ${{ matrix.config.spec-decoding }}
disagg: ${{ matrix.config.disagg }}
# Both flags are hard-coded (not read from the matrix entry): this job always
# requests evals and always sets the template's eval-only input.
run-eval: true
eval-only: true
ref: ${{ inputs.ref }}

collect-results:
Expand All @@ -135,7 +169,7 @@ jobs:
result-prefix: "bmk"

collect-evals:
needs: [test-sweep-multi-node, test-sweep-single-node]
needs: [test-sweep-evals]
if: ${{ always() }}
uses: ./.github/workflows/collect-evals.yml
secrets: inherit
Expand Down
41 changes: 31 additions & 10 deletions .github/workflows/run-sweep.yml
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,36 @@ jobs:
secrets: inherit
with: *single-node-inputs

# Eval-only sweep: calls the reusable benchmark template once per entry of the
# `evals` list inside the `search-space-config` output of the `setup` job.
sweep-evals:
needs: setup
# Run only when the serialized `evals` value is neither '[]' nor 'null'.
# NOTE(review): the 'null' comparison presumably covers a config without an
# `evals` key — confirm against the setup job's output.
if: ${{ toJson(fromJson(needs.setup.outputs.search-space-config).evals) != '[]' && toJson(fromJson(needs.setup.outputs.search-space-config).evals) != 'null' }}
uses: ./.github/workflows/benchmark-tmpl.yml
name: eval /
strategy:
# Let the remaining matrix entries keep running when one of them fails.
fail-fast: false
matrix:
config: ${{ fromJson(needs.setup.outputs.search-space-config).evals }}
secrets: inherit
with:
# Every input below is forwarded unchanged from the matrix entry.
exp-name: ${{ matrix.config.exp-name }}
isl: ${{ matrix.config.isl }}
osl: ${{ matrix.config.osl }}
max-model-len: ${{ matrix.config.max-model-len }}
runner: ${{ matrix.config.runner }}
image: ${{ matrix.config.image }}
model: ${{ matrix.config.model }}
model-prefix: ${{ matrix.config.model-prefix }}
framework: ${{ matrix.config.framework }}
precision: ${{ matrix.config.precision }}
tp: ${{ matrix.config.tp }}
ep: ${{ matrix.config.ep }}
dp-attn: ${{ matrix.config.dp-attn }}
conc: ${{ matrix.config.conc }}
spec-decoding: ${{ matrix.config.spec-decoding }}
disagg: ${{ matrix.config.disagg }}
# Both flags are hard-coded (not read from the matrix entry): this job always
# requests evals and always sets the template's eval-only input.
run-eval: true
eval-only: true

collect-results:
needs:
[
Expand All @@ -201,16 +231,7 @@ jobs:
result-prefix: "bmk"

collect-evals:
needs:
[
sweep-single-node-1k1k,
sweep-single-node-1k8k,
sweep-single-node-8k1k,
sweep-multi-node-1k1k,
sweep-multi-node-1k8k,
sweep-multi-node-8k1k,
setup,
]
needs: [sweep-evals, setup]
if: ${{ always() && needs.setup.result != 'skipped' }}
uses: ./.github/workflows/collect-evals.yml
secrets: inherit
Comment thread
Oseltamivir marked this conversation as resolved.
Expand Down
2 changes: 1 addition & 1 deletion AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,7 @@ Evals are **off by default** (`RUN_EVAL=false`). When enabled, they run for two

This selection logic is in `mark_eval_entries()` in `utils/matrix_logic/generate_sweep_configs.py`.

**Note**: Evals only run on `1k8k` sequence length.
**Note**: Evals only run on `8k1k` sequence length.

### Eval Framework: lm-eval

Expand Down
70 changes: 59 additions & 11 deletions benchmarks/benchmark_lib.sh
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,12 @@ wait_for_server_ready() {
# --trust-remote-code: Optional flag to trust remote code from HuggingFace
# --server-pid: Optional server process ID to monitor during benchmark
run_benchmark_serving() {
# In eval-only mode, skip the throughput benchmark entirely.
if [ "${EVAL_ONLY}" = "true" ]; then
echo "EVAL_ONLY mode: skipping throughput benchmark"
return 0
fi

set +x
local model=""
local port=""
Expand Down Expand Up @@ -486,6 +492,9 @@ move_profile_trace_for_relay() {
# ------------------------------

_install_lm_eval_deps() {
# Remove torchvision to avoid circular import issues in ATOM containers.
# lm_eval[api] uses local-chat-completions (API-based) and does not need it.
python3 -m pip uninstall -y torchvision 2>/dev/null || true
python3 -m pip install -q --no-cache-dir --break-system-packages "lm-eval[api]" || true
local lm_eval_ref="b315ef3b05176acc9732bb7fdec116abe1ecc476"
if command -v git >/dev/null 2>&1; then
Expand Down Expand Up @@ -574,26 +583,56 @@ PY
export PYTHONPATH="${patch_dir}:${PYTHONPATH:-}"
}

# Print the model's native maximum context length, as declared in its
# HuggingFace config, to stdout.
#
# Arguments: $1 - model name or local path accepted by AutoConfig.from_pretrained
# Outputs:   the first positive integer found among max_position_embeddings,
#            max_sequence_length, seq_length and n_positions (in that order);
#            nothing if none of them holds a positive integer
# Returns:   exit status of the python3 process
get_native_max_context_length() {
  local model_path="$1"
  # The model path is handed over as sys.argv[1] rather than spliced into the
  # Python source, so quotes or backslashes in the path cannot break (or
  # inject into) the program text.
  python3 -c '
import sys
from transformers import AutoConfig

config = AutoConfig.from_pretrained(sys.argv[1], trust_remote_code=True)
for attr in ("max_position_embeddings", "max_sequence_length", "seq_length", "n_positions"):
    value = getattr(config, attr, None)
    # Skip attributes that exist but are None/non-integer; printing them would
    # hand the caller a value it cannot do arithmetic on.
    if isinstance(value, int) and value > 0:
        print(value)
        break
' "$model_path"
}

# Compute the context length for eval-only mode.
# Uses 5x the benchmark context, capped at the model's native max.
# A missing, zero or non-numeric benchmark context falls back to the native max.
# If the native max cannot be determined, the 5x value is used uncapped (with a
# warning on stderr).
#
# Globals:   EVAL_MAX_MODEL_LEN (exported; needed by run_lm_eval)
# Arguments: $1 - model name/path, forwarded to get_native_max_context_length
#            $2 - benchmark context length in tokens (optional, default 0)
# Outputs:   echoes the computed value for scripts to capture
# Returns:   0 on success; 1 (nothing echoed or exported) when neither the
#            benchmark context nor the native max is a positive integer
#
# Usage: local ctx=$(compute_eval_context_length "$MODEL" "${current_ctx}")
compute_eval_context_length() {
  local model="$1"
  local benchmark_ctx="${2:-0}"
  local native_max eval_ctx

  # Declaration and assignment are split so the helper's failure is not masked
  # by `local`; a failed lookup is handled below instead of aborting here.
  native_max=$(get_native_max_context_length "$model") || true
  # Keep only the last output line in case the lookup printed extra text first.
  native_max="${native_max##*$'\n'}"

  # Normalise both inputs to base-10 non-negative integers (0 means "unknown").
  if [[ "$native_max" =~ ^[0-9]+$ ]]; then
    native_max=$(( 10#$native_max ))
  else
    native_max=0
  fi
  if [[ "$benchmark_ctx" =~ ^[0-9]+$ ]]; then
    benchmark_ctx=$(( 10#$benchmark_ctx ))
  else
    benchmark_ctx=0
  fi

  if (( benchmark_ctx > 0 )); then
    eval_ctx=$(( benchmark_ctx * 5 ))
    if (( native_max == 0 )); then
      echo "WARNING: native max context length unknown for '${model}'; using uncapped eval context ${eval_ctx}" >&2
    elif (( eval_ctx > native_max )); then
      eval_ctx="$native_max"
    fi
  elif (( native_max > 0 )); then
    # No usable benchmark context: 5x of "unknown" is meaningless, so use the
    # model's native maximum directly instead of exporting 0.
    eval_ctx="$native_max"
  else
    echo "ERROR: cannot compute eval context length for '${model}': benchmark context and native max are both unknown" >&2
    return 1
  fi

  export EVAL_MAX_MODEL_LEN="$eval_ctx"
  echo "$eval_ctx"
}
Comment thread
Oseltamivir marked this conversation as resolved.

run_lm_eval() {
local port="${PORT:-8888}"
local task="${EVAL_TASK:-gsm8k}"
local num_fewshot="${NUM_FEWSHOT:-2}"
local tasks_dir="${EVAL_TASKS_DIR:-utils/evals/gsm8k.yaml}"
local results_dir="${EVAL_RESULT_DIR:-$(mktemp -d /tmp/eval_out-XXXXXX)}"
local gen_max_tokens=16384
local gen_max_tokens="${EVAL_MAX_MODEL_LEN:-16384}"
local temperature=0
local top_p=1
local concurrent_requests=32
local concurrent_requests="${EVAL_CONCURRENT_REQUESTS:-64}"

while [[ $# -gt 0 ]]; do
case $1 in
--port) port="$2"; shift 2 ;;
--task) task="$2"; shift 2 ;;
--num-fewshot) num_fewshot="$2"; shift 2 ;;
--task) tasks_dir="$2"; shift 2 ;;
--results-dir) results_dir="$2"; shift 2 ;;
--gen-max-tokens) gen_max_tokens="$2"; shift 2 ;;
--temperature) temperature="$2"; shift 2 ;;
--top-p) top_p="$2"; shift 2 ;;
--concurrent-requests) concurrent_requests="$2"; shift 2 ;;
--concurrent-requests) shift 2; continue ;; # ignored; use EVAL_CONCURRENT_REQUESTS env var
*) echo "Unknown parameter: $1"; return 1 ;;
Comment thread
Oseltamivir marked this conversation as resolved.
esac
done
Expand All @@ -606,16 +645,19 @@ run_lm_eval() {
export OPENAI_API_KEY=${OPENAI_API_KEY:-EMPTY}
MODEL_NAME=${MODEL_NAME:-$MODEL} # Prefer MODEL_NAME, else MODEL

# Cap generation tokens to avoid excessive KV cache reservation per request on TRT.
local max_gen_tokens=16384
echo "Eval context budget: max_length=${gen_max_tokens}, max_gen_tokens=${max_gen_tokens}"

# Export for append_lm_eval_summary to pick up
export EVAL_RESULT_DIR="$results_dir"
set -x
python3 -m lm_eval --model local-chat-completions --apply_chat_template \
--tasks "utils/evals/${task}.yaml" \
--num_fewshot "${num_fewshot}" \
--tasks "${tasks_dir}" \
--output_path "${results_dir}" \
--log_samples \
--model_args "model=${MODEL_NAME},base_url=${openai_chat_base},api_key=${OPENAI_API_KEY},eos_string=</s>,max_retries=5,num_concurrent=${concurrent_requests},timeout=600,tokenized_requests=False,max_length=${gen_max_tokens}" \
--gen_kwargs "max_tokens=8192,temperature=${temperature},top_p=${top_p}"
--model_args "model=${MODEL_NAME},base_url=${openai_chat_base},api_key=${OPENAI_API_KEY},eos_string=</s>,max_retries=5,num_concurrent=${concurrent_requests},timeout=1800,tokenized_requests=False,max_length=${gen_max_tokens}" \
--gen_kwargs "max_tokens=${max_gen_tokens},temperature=${temperature},top_p=${top_p}"
local eval_exit=$?
set +x
return $eval_exit
Expand Down Expand Up @@ -706,8 +748,14 @@ run_eval() {
esac
done

# Compute EVAL_MAX_MODEL_LEN if not already set by the calling script
if [ -z "${EVAL_MAX_MODEL_LEN:-}" ]; then
compute_eval_context_length "$MODEL" "${MAX_MODEL_LEN:-0}" > /dev/null
fi

case "$framework" in
lm-eval|lm_eval) run_lm_eval "${forwarded[@]}" ;;
*) echo "Unknown framework '${framework}'"; return 1 ;;
esac
return $?
}
8 changes: 6 additions & 2 deletions benchmarks/single_node/dsr1_fp4_b200.sh
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,10 @@ else
fi
echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL"

EVAL_CONTEXT_ARGS=""
if [ "${EVAL_ONLY}" = "true" ]; then
EVAL_CONTEXT_ARGS="--context-length $(compute_eval_context_length "$MODEL" "$((ISL + OSL + 20))")"
fi
Comment thread
Oseltamivir marked this conversation as resolved.
# Start GPU monitoring (power, temperature, clocks every second)
start_gpu_monitor

Expand All @@ -40,7 +44,7 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0.
--cuda-graph-max-bs 256 --max-running-requests 256 --mem-fraction-static 0.85 --kv-cache-dtype fp8_e4m3 \
--chunked-prefill-size 16384 \
--ep-size $EP_SIZE --quantization modelopt_fp4 --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \
--enable-symm-mem --disable-radix-cache --attention-backend trtllm_mla --moe-runner-backend flashinfer_trtllm --stream-interval 10 > $SERVER_LOG 2>&1 &
--enable-symm-mem --disable-radix-cache --attention-backend trtllm_mla --moe-runner-backend flashinfer_trtllm --stream-interval 10 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &

SERVER_PID=$!

Expand All @@ -63,7 +67,7 @@ run_benchmark_serving \

# After throughput, run evaluation only if RUN_EVAL is true
if [ "${RUN_EVAL}" = "true" ]; then
run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
run_eval --framework lm-eval --port "$PORT"
append_lm_eval_summary
fi

Expand Down
Loading