Skip to content

Commit e895139

Browse files
committed
[None][fix] Fix a typo in the Slurm CI codes
Signed-off-by: Yanchao Lu <[email protected]>
1 parent aae5d22 commit e895139

File tree

2 files changed

+7
-5
lines changed

2 files changed

+7
-5
lines changed

jenkins/BuildDockerImage.groovy

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -684,7 +684,7 @@ pipeline {
684684
}
685685
cmd += imageKeyToTag.values().join(" ")
686686
withCredentials([usernamePassword(credentialsId: "NSPECT_CLIENT-${nspect_env}", usernameVariable: 'NSPECT_CLIENT_ID', passwordVariable: 'NSPECT_CLIENT_SECRET')]) {
687-
sh cmd
687+
trtllm_utils.llmExecStepWithRetry(this, script: cmd, numRetries: 3, shortCommondRunTimeMax: 7200)
688688
}
689689
}
690690
}

jenkins/L0_Test.groovy

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -348,7 +348,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
348348
}
349349

350350
if (CloudManager.isNodeOnline(nodeName)) {
351-
def dockerGpuOption = ""
351+
def dockerGPUOption = ""
352352

353353
node(nodeName) {
354354
sh """
@@ -367,6 +367,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
367367

368368
// Dynamically set GPU arguments based on environment variables
369369
// https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/docker-specialized.html
370+
// It's intentional to check NV_GPU first.
370371
dockerGPUOption = sh(script: """
371372
if [ -n "\$NV_GPU" ]; then
372373
echo "--gpus '\\"device=\$NV_GPU\\"'"
@@ -386,7 +387,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
386387
"-v /home/scratch.trt_llm_data:/scratch.trt_llm_data:ro " +
387388
"-v /tmp/ccache:${CCACHE_DIR}:rw " +
388389
"-v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw " +
389-
"--cap-add syslog"
390+
"--cap-add SYSLOG"
390391

391392
echo "Final dockerArgs: ${dockerArgs}"
392393

@@ -516,9 +517,9 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
516517
].join(" ")
517518

518519
def srunCmd = SlurmConfig.generateMultiNodeCommand(partition, taskArgs, scriptRunNode)
519-
scriptLaunchDestPath = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
520-
// TODO: check if the tee always returns 0
520+
def scriptLaunchDestPath = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
521521
def scriptContent = """#!/bin/bash
522+
set -o pipefail
522523
export jobWorkspace=$jobWorkspace
523524
export tarName=$tarName
524525
export llmTarfile=$llmTarfile
@@ -536,6 +537,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
536537
export NVIDIA_IMEX_CHANNELS=0
537538
chmod +x ${scriptRunNode}
538539
${srunCmd} 2>&1 | tee ${slurmOutputFile}
540+
exit ${PIPESTATUS[0]}
539541
""".stripIndent()
540542
pipeline.writeFile(file: scriptLaunchDestPath, text: scriptContent)
541543
Utils.exec(pipeline, script: "chmod +x ${scriptLaunchDestPath}", returnStdout: true)

0 commit comments

Comments (0)