jenkins/BuildDockerImage.groovy: 2 changes (1 addition, 1 deletion)
@@ -684,7 +684,7 @@ pipeline {
}
cmd += imageKeyToTag.values().join(" ")
withCredentials([usernamePassword(credentialsId: "NSPECT_CLIENT-${nspect_env}", usernameVariable: 'NSPECT_CLIENT_ID', passwordVariable: 'NSPECT_CLIENT_SECRET')]) {
sh cmd
trtllm_utils.llmExecStepWithRetry(this, script: cmd, numRetries: 3, shortCommondRunTimeMax: 7200)
}
}
}
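The BuildDockerImage change above wraps the push command in `trtllm_utils.llmExecStepWithRetry` instead of a bare `sh cmd`, so a transient registry or network hiccup no longer fails the stage outright. The helper's implementation is not part of this diff; the following is only a sketch of the retry-with-timeout pattern the call site suggests, with a hypothetical name and assumed internals.

```groovy
// Hypothetical stand-in for trtllm_utils.llmExecStepWithRetry; the real helper
// lives in the repo's shared library and is not shown here, so the internals
// below are assumed. Groovy collects the named parameters of the call into a
// leading Map, hence the (Map, pipeline) signature.
def llmExecStepWithRetrySketch(Map args, pipeline) {
    String script  = args.script
    int numRetries = (args.numRetries ?: 3) as int
    int runTimeMax = (args.shortCommondRunTimeMax ?: 7200) as int  // seconds per attempt (assumed)

    Exception lastError = null
    for (int attempt = 1; attempt <= numRetries; attempt++) {
        try {
            pipeline.timeout(time: runTimeMax, unit: 'SECONDS') {
                pipeline.sh script
            }
            return                  // success: stop retrying
        } catch (Exception e) {
            lastError = e
            pipeline.echo "Attempt ${attempt}/${numRetries} failed: ${e.message}"
        }
    }
    throw lastError                 // every attempt failed: surface the last error
}

// Call shape matching the diff:
// llmExecStepWithRetrySketch(this, script: cmd, numRetries: 3, shortCommondRunTimeMax: 7200)
```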
jenkins/L0_Test.groovy: 15 changes (10 additions, 5 deletions)
@@ -163,7 +163,7 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo

Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID}")

Utils.exec(pipeline, script: "echo Sleeping to allow slurm job termination; sleep 30")
Utils.exec(pipeline, script: "echo Sleeping to allow Slurm job completion; sleep 30")

Utils.exec(
pipeline,
@@ -173,6 +173,8 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
)
)

Utils.exec(pipeline, script: "echo Sleeping to allow Slurm job termination; sleep 30")

Utils.exec(
pipeline,
script: Utils.sshUserCmd(
@@ -222,6 +224,8 @@ def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName, St
)
)

Utils.exec(pipeline, script: "echo Sleeping to allow Slurm job termination; sleep 30")

Utils.exec(
pipeline,
script: Utils.sshUserCmd(
@@ -348,7 +352,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
}

if (CloudManager.isNodeOnline(nodeName)) {
def dockerGpuOption = ""
def dockerGPUOption = ""

node(nodeName) {
sh """
@@ -367,6 +371,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p

// Dynamically set GPU arguments based on environment variables
// https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/docker-specialized.html
// It's intentional to check NV_GPU first.
dockerGPUOption = sh(script: """
if [ -n "\$NV_GPU" ]; then
echo "--gpus '\\"device=\$NV_GPU\\"'"
@@ -386,7 +391,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
"-v /home/scratch.trt_llm_data:/scratch.trt_llm_data:ro " +
"-v /tmp/ccache:${CCACHE_DIR}:rw " +
"-v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw " +
"--cap-add syslog"
"--cap-add=SYSLOG"

echo "Final dockerArgs: ${dockerArgs}"

@@ -516,9 +521,9 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
].join(" ")

def srunCmd = SlurmConfig.generateMultiNodeCommand(partition, taskArgs, scriptRunNode)
scriptLaunchDestPath = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
// TODO: check if the tee always returns 0
def scriptLaunchDestPath = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
def scriptContent = """#!/bin/bash
set -o pipefail
export jobWorkspace=$jobWorkspace
export tarName=$tarName
export llmTarfile=$llmTarfile
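The final hunk adds `set -o pipefail` to the generated `slurm_launch.sh` and drops the `// TODO: check if the tee always returns 0` comment, which that option addresses. By default a shell reports a pipeline's status as that of its last command, so piping a failing test run through `tee` would still exit 0; with `pipefail`, the failure propagates to the Jenkins `sh` step. A minimal sketch of the difference, using placeholder command and log names:

```groovy
// "./run_tests.sh" and "test.log" are placeholders, not files from this repo.
node {
    // Default behavior: the pipeline's status is tee's status, so a failing
    // test command is masked and this step still succeeds.
    sh '''#!/bin/bash
        ./run_tests.sh | tee test.log
    '''

    // With pipefail, a non-zero status anywhere in the pipeline propagates,
    // so this step fails and Jenkins surfaces the error.
    sh '''#!/bin/bash
        set -o pipefail
        ./run_tests.sh | tee test.log
    '''
}
```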