diff --git a/jenkins/BuildDockerImage.groovy b/jenkins/BuildDockerImage.groovy
index 64e03de476a..96f0bf7fbc4 100644
--- a/jenkins/BuildDockerImage.groovy
+++ b/jenkins/BuildDockerImage.groovy
@@ -684,7 +684,7 @@ pipeline {
                     }
                     cmd += imageKeyToTag.values().join(" ")
                     withCredentials([usernamePassword(credentialsId: "NSPECT_CLIENT-${nspect_env}", usernameVariable: 'NSPECT_CLIENT_ID', passwordVariable: 'NSPECT_CLIENT_SECRET')]) {
-                        sh cmd
+                        trtllm_utils.llmExecStepWithRetry(this, script: cmd, numRetries: 3, shortCommondRunTimeMax: 7200)
                     }
                 }
             }
diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy
index 197ad8d28eb..5feccc076d1 100644
--- a/jenkins/L0_Test.groovy
+++ b/jenkins/L0_Test.groovy
@@ -163,7 +163,7 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
 
         Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID}")
 
-        Utils.exec(pipeline, script: "echo Sleeping to allow slurm job termination; sleep 30")
+        Utils.exec(pipeline, script: "echo Sleeping to allow Slurm job completion; sleep 30")
 
         Utils.exec(
             pipeline,
@@ -173,6 +173,8 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
             )
         )
 
+        Utils.exec(pipeline, script: "echo Sleeping to allow Slurm job termination; sleep 30")
+
         Utils.exec(
             pipeline,
             script: Utils.sshUserCmd(
@@ -222,6 +224,8 @@ def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName, St
             )
         )
 
+        Utils.exec(pipeline, script: "echo Sleeping to allow Slurm job termination; sleep 30")
+
         Utils.exec(
             pipeline,
             script: Utils.sshUserCmd(
@@ -348,7 +352,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
     }
 
     if (CloudManager.isNodeOnline(nodeName)) {
-        def dockerGpuOption = ""
+        def dockerGPUOption = ""
 
         node(nodeName) {
             sh """
@@ -367,6 +371,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
 
             // Dynamically set GPU arguments based on environment variables
             // https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/docker-specialized.html
+            // It's intentional to check NV_GPU first.
             dockerGPUOption = sh(script: """
                 if [ -n "\$NV_GPU" ]; then
                     echo "--gpus '\\"device=\$NV_GPU\\"'"
@@ -386,7 +391,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
                 "-v /home/scratch.trt_llm_data:/scratch.trt_llm_data:ro " +
                 "-v /tmp/ccache:${CCACHE_DIR}:rw " +
                 "-v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw " +
-                "--cap-add syslog"
+                "--cap-add=SYSLOG"
 
             echo "Final dockerArgs: ${dockerArgs}"
 
@@ -516,9 +521,9 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
             ].join(" ")
 
             def srunCmd = SlurmConfig.generateMultiNodeCommand(partition, taskArgs, scriptRunNode)
-            scriptLaunchDestPath = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
-            // TODO: check if the tee always returns 0
+            def scriptLaunchDestPath = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
             def scriptContent = """#!/bin/bash
+                set -o pipefail
                 export jobWorkspace=$jobWorkspace
                 export tarName=$tarName
                 export llmTarfile=$llmTarfile