@@ -169,7 +169,7 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
169169
170170 Utils . exec(pipeline, script : " echo Slurm job ID: ${ slurmJobID} " )
171171
172- Utils . exec(pipeline, script : " echo Sleeping to allow slurm job termination ; sleep 30" )
172+ Utils . exec(pipeline, script : " echo Sleeping to allow Slurm job completion ; sleep 30" )
173173
174174 Utils . exec(
175175 pipeline,
@@ -179,6 +179,8 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
179179 )
180180 )
181181
182+ Utils . exec(pipeline, script : " echo Sleeping to allow Slurm job termination; sleep 30" )
183+
182184 Utils . exec(
183185 pipeline,
184186 script : Utils . sshUserCmd(
@@ -228,6 +230,8 @@ def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName, St
228230 )
229231 )
230232
233+ Utils . exec(pipeline, script : " echo Sleeping to allow Slurm job termination; sleep 30" )
234+
231235 Utils . exec(
232236 pipeline,
233237 script : Utils . sshUserCmd(
@@ -354,7 +358,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
354358 }
355359
356360 if (CloudManager . isNodeOnline(nodeName)) {
357- def dockerGpuOption = " "
361+ def dockerGPUOption = " "
358362
359363 node(nodeName) {
360364 sh """
@@ -373,6 +377,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
373377
374378 // Dynamically set GPU arguments based on environment variables
375379 // https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/docker-specialized.html
380+ // It's intentional to check NV_GPU first.
376381 dockerGPUOption = sh(script : """
377382 if [ -n "\$ NV_GPU" ]; then
378383 echo "--gpus '\\ "device=\$ NV_GPU\\ "'"
@@ -392,7 +397,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
392397 " -v /home/scratch.trt_llm_data:/scratch.trt_llm_data:ro " +
393398 " -v /tmp/ccache:${ CCACHE_DIR} :rw " +
394399 " -v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw " +
395- " --cap-add syslog "
400+ " --cap-add=SYSLOG "
396401
397402 echo " Final dockerArgs: ${ dockerArgs} "
398403
@@ -522,9 +527,9 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
522527 ]. join(" " )
523528
524529 def srunCmd = SlurmConfig . generateMultiNodeCommand(partition, taskArgs, scriptRunNode)
525- scriptLaunchDestPath = Utils . createTempLocation(pipeline, " ./slurm_launch.sh" )
526- // TODO: check if the tee always returns 0
530+ def scriptLaunchDestPath = Utils . createTempLocation(pipeline, " ./slurm_launch.sh" )
527531 def scriptContent = """ #!/bin/bash
532+ set -o pipefail
528533 export jobWorkspace=$jobWorkspace
529534 export tarName=$tarName
530535 export llmTarfile=$llmTarfile
0 commit comments