Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions .github/workflows/benchmark-multinode-tmpl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,16 @@ on:
description: "Git ref (branch/sha) to checkout"
required: false
type: string
srt-slurm-repo:
description: "Override srt-slurm clone URL (leave empty to use launcher default)"
required: false
type: string
default: ""
srt-slurm-ref:
description: "Override srt-slurm git ref (branch/sha; leave empty to use launcher default)"
required: false
type: string
default: ""

env:
RANDOM_RANGE_RATIO: 0.8
Expand Down Expand Up @@ -126,6 +136,11 @@ env:
DECODE_EP: ${{ inputs.decode-ep }}
DECODE_DP_ATTN: ${{ inputs.decode-dp-attn }}

# Optional override for which srt-slurm repo/ref the launcher clones.
# Leave empty to use the launcher's built-in defaults per framework.
SRT_SLURM_REPO: ${{ inputs.srt-slurm-repo }}
SRT_SLURM_REF: ${{ inputs.srt-slurm-ref }}

permissions:
contents: read

Expand Down
24 changes: 24 additions & 0 deletions .github/workflows/e2e-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,16 @@ on:
description: "Ref (branch/sha) to checkout for generating configs"
required: false
type: string
srt-slurm-repo:
description: "Override srt-slurm clone URL (leave empty to use launcher default)"
required: false
type: string
default: ""
srt-slurm-ref:
description: "Override srt-slurm git ref (branch/sha; leave empty to use launcher default)"
required: false
type: string
default: ""
workflow_call:
inputs:
generate-cli-command:
Expand All @@ -30,6 +40,16 @@ on:
description: "Ref (branch/sha) to checkout for generating configs"
required: false
type: string
srt-slurm-repo:
description: "Override srt-slurm clone URL (leave empty to use launcher default)"
required: false
type: string
default: ""
srt-slurm-ref:
description: "Override srt-slurm git ref (branch/sha; leave empty to use launcher default)"
required: false
type: string
default: ""

jobs:
get-jobs:
Expand Down Expand Up @@ -102,6 +122,8 @@ jobs:
decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }}
run-eval: false
ref: ${{ inputs.ref }}
srt-slurm-repo: ${{ inputs.srt-slurm-repo }}
srt-slurm-ref: ${{ inputs.srt-slurm-ref }}

test-sweep-multi-node-evals:
needs: get-jobs
Expand Down Expand Up @@ -143,6 +165,8 @@ jobs:
eval-only: true
eval-conc: ${{ matrix.config.eval-conc }}
ref: ${{ inputs.ref }}
srt-slurm-repo: ${{ inputs.srt-slurm-repo }}
srt-slurm-ref: ${{ inputs.srt-slurm-ref }}

test-sweep-single-node:
needs: get-jobs
Expand Down
30 changes: 20 additions & 10 deletions runners/launch_gb200-nv.sh
Original file line number Diff line number Diff line change
Expand Up @@ -134,20 +134,27 @@ if [ -d "$SRT_REPO_DIR" ]; then
rm -rf "$SRT_REPO_DIR"
fi

# Pick the srt-slurm clone source.
#
# Each framework/model combination has a built-in default repo and ref.
# SRT_SLURM_REPO / SRT_SLURM_REF override those defaults when they are set to
# a non-empty value (useful for testing WIP branches like the generalized
# lm-eval-main).
#
# The branches below only choose defaults; the clone itself happens exactly
# once further down, so the override applies to every framework and the
# target directory is never cloned into twice.
if [[ $FRAMEWORK == "dynamo-vllm" ]]; then
    DEFAULT_SRT_REPO="https://github.com/NVIDIA/srt-slurm.git"
    DEFAULT_SRT_REF="sa-submission-q2-2026"
elif [[ $FRAMEWORK == "dynamo-trt" && $MODEL_PREFIX == "kimik2.5" ]]; then
    DEFAULT_SRT_REPO="https://github.com/NVIDIA/srt-slurm.git"
    DEFAULT_SRT_REF="sa-submission-q2-2026"
else
    DEFAULT_SRT_REPO="https://github.com/ishandhanani/srt-slurm.git"
    DEFAULT_SRT_REF="sa-submission-q1-2026"
fi

# ${VAR:-default} falls back when VAR is unset OR empty, so an override that
# is present but blank still resolves to the launcher default.
SRT_SLURM_REPO="${SRT_SLURM_REPO:-$DEFAULT_SRT_REPO}"
SRT_SLURM_REF="${SRT_SLURM_REF:-$DEFAULT_SRT_REF}"

echo "Cloning ${SRT_SLURM_REPO} @ ${SRT_SLURM_REF}"
git clone "$SRT_SLURM_REPO" "$SRT_REPO_DIR"
cd "$SRT_REPO_DIR"
git checkout "$SRT_SLURM_REF"


echo "Installing srtctl..."
# Fetch and run the uv installer script from astral.sh.
curl -LsSf https://astral.sh/uv/install.sh | sh
# Load the env file under ~/.local/bin into this shell (presumably written by
# the uv installer to put uv on PATH -- confirm). Quoted so a $HOME containing
# whitespace or glob characters is not word-split.
source "$HOME/.local/bin/env"
Expand Down Expand Up @@ -197,7 +204,10 @@ cat srtslurm.yaml
echo "Running make setup..."
make setup ARCH=aarch64

# Export eval-related env vars for srt-slurm post-benchmark eval.
# LM_EVAL_WORKSPACE is what the generalized srt-slurm reads; INFMAX_WORKSPACE
# is kept for compatibility with older srt-slurm branches (sa-submission-*).
export LM_EVAL_WORKSPACE="$GITHUB_WORKSPACE"
export INFMAX_WORKSPACE="$GITHUB_WORKSPACE"

echo "Submitting job with srtctl..."
Expand Down
Loading