@@ -348,7 +348,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
348348            }
349349
350350            if  (CloudManager . isNodeOnline(nodeName)) {
351-                 def  dockerGpuOption  =  " " 
351+                 def  dockerGPUOption  =  " " 
352352
353353                node(nodeName) {
354354                    sh """ 
@@ -367,6 +367,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
367367
368368                    //  Dynamically set GPU arguments based on environment variables
369369                    //  https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/docker-specialized.html
370+                     //  It's intentional to check NV_GPU first.
370371                    dockerGPUOption =  sh(script : """ 
371372                        if [ -n "\$ NV_GPU" ]; then 
372373                            echo "--gpus '\\ "device=\$ NV_GPU\\ "'" 
@@ -386,7 +387,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
386387                    " -v /home/scratch.trt_llm_data:/scratch.trt_llm_data:ro " + 
387388                    " -v /tmp/ccache:${ CCACHE_DIR} " + 
388389                    " -v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw " + 
389-                     " --cap-add syslog " 
390+                     " --cap-add SYSLOG " 
390391
391392                echo " Final dockerArgs: ${ dockerArgs} " 
392393
@@ -516,9 +517,9 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
516517                ]. join("  " 
517518
518519                def  srunCmd =  SlurmConfig . generateMultiNodeCommand(partition, taskArgs, scriptRunNode)
519-                 scriptLaunchDestPath =  Utils . createTempLocation(pipeline, " ./slurm_launch.sh" 
520-                 //  TODO: check if the tee always returns 0
520+                 def  scriptLaunchDestPath =  Utils . createTempLocation(pipeline, " ./slurm_launch.sh" 
521521                def  scriptContent =  """ #!/bin/bash
522+                     set -o pipefail 
522523                    export jobWorkspace=$jobWorkspace   
523524                    export tarName=$tarName   
524525                    export llmTarfile=$llmTarfile   
@@ -536,6 +537,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
536537                    export NVIDIA_IMEX_CHANNELS=0 
537538                    chmod +x ${ scriptRunNode}  
538539                    ${ srunCmd} ${ slurmOutputFile}  
540+                     exit \${PIPESTATUS[0]}  # dollar sign escaped so bash, not Groovy, expands PIPESTATUS
539541                """  . stripIndent()
540542                pipeline. writeFile(file : scriptLaunchDestPath, text : scriptContent)
541543                Utils . exec(pipeline, script : " chmod +x ${ scriptLaunchDestPath} " returnStdout : true )
0 commit comments