@@ -163,7 +163,7 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
163163
164164 Utils . exec(pipeline, script : " echo Slurm job ID: ${ slurmJobID} " )
165165
166- Utils . exec(pipeline, script : " echo Sleeping to allow slurm job termination ; sleep 30" )
166+ Utils . exec(pipeline, script : " echo Sleeping to allow Slurm job completion ; sleep 30" )
167167
168168 Utils . exec(
169169 pipeline,
@@ -173,6 +173,8 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
173173 )
174174 )
175175
176+ Utils . exec(pipeline, script : " echo Sleeping to allow Slurm job termination; sleep 30" )
177+
176178 Utils . exec(
177179 pipeline,
178180 script : Utils . sshUserCmd(
@@ -222,6 +224,8 @@ def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName, St
222224 )
223225 )
224226
227+ Utils . exec(pipeline, script : " echo Sleeping to allow Slurm job termination; sleep 30" )
228+
225229 Utils . exec(
226230 pipeline,
227231 script : Utils . sshUserCmd(
@@ -348,7 +352,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
348352 }
349353
350354 if (CloudManager . isNodeOnline(nodeName)) {
351- def dockerGpuOption = " "
355+ def dockerGPUOption = " "
352356
353357 node(nodeName) {
354358 sh """
@@ -367,6 +371,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
367371
368372 // Dynamically set GPU arguments based on environment variables
369373 // https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/docker-specialized.html
374+ // Intentionally check NV_GPU first, before falling back to other GPU env vars.
370375 dockerGPUOption = sh(script : """
371376 if [ -n "\$ NV_GPU" ]; then
372377 echo "--gpus '\\ "device=\$ NV_GPU\\ "'"
@@ -386,7 +391,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
386391 " -v /home/scratch.trt_llm_data:/scratch.trt_llm_data:ro " +
387392 " -v /tmp/ccache:${ CCACHE_DIR} :rw " +
388393 " -v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw " +
389- " --cap-add syslog "
394+ " --cap-add=SYSLOG "
390395
391396 echo " Final dockerArgs: ${ dockerArgs} "
392397
@@ -516,9 +521,9 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
516521 ]. join(" " )
517522
518523 def srunCmd = SlurmConfig . generateMultiNodeCommand(partition, taskArgs, scriptRunNode)
519- scriptLaunchDestPath = Utils . createTempLocation(pipeline, " ./slurm_launch.sh" )
520- // TODO: check if the tee always returns 0
524+ def scriptLaunchDestPath = Utils . createTempLocation(pipeline, " ./slurm_launch.sh" )
521525 def scriptContent = """ #!/bin/bash
526+ set -o pipefail
522527 export jobWorkspace=$jobWorkspace
523528 export tarName=$tarName
524529 export llmTarfile=$llmTarfile