Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion jenkins/BuildDockerImage.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -684,7 +684,7 @@ pipeline {
}
cmd += imageKeyToTag.values().join(" ")
withCredentials([usernamePassword(credentialsId: "NSPECT_CLIENT-${nspect_env}", usernameVariable: 'NSPECT_CLIENT_ID', passwordVariable: 'NSPECT_CLIENT_SECRET')]) {
sh cmd
trtllm_utils.llmExecStepWithRetry(this, script: cmd, numRetries: 3, shortCommondRunTimeMax: 7200))
}
}
}
Expand Down
10 changes: 6 additions & 4 deletions jenkins/L0_Test.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -348,7 +348,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
}

if (CloudManager.isNodeOnline(nodeName)) {
def dockerGpuOption = ""
def dockerGPUOption = ""

node(nodeName) {
sh """
Expand All @@ -367,6 +367,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p

// Dynamically set GPU arguments based on environment variables
// https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/docker-specialized.html
// It's intentional to check NV_GPU first.
dockerGPUOption = sh(script: """
if [ -n "\$NV_GPU" ]; then
echo "--gpus '\\"device=\$NV_GPU\\"'"
Expand All @@ -386,7 +387,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
"-v /home/scratch.trt_llm_data:/scratch.trt_llm_data:ro " +
"-v /tmp/ccache:${CCACHE_DIR}:rw " +
"-v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw " +
"--cap-add syslog"
"--cap-add SYSLOG"

echo "Final dockerArgs: ${dockerArgs}"

Expand Down Expand Up @@ -516,9 +517,9 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
].join(" ")

def srunCmd = SlurmConfig.generateMultiNodeCommand(partition, taskArgs, scriptRunNode)
scriptLaunchDestPath = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
// TODO: check if the tee always returns 0
def scriptLaunchDestPath = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
def scriptContent = """#!/bin/bash
set -o pipefail
export jobWorkspace=$jobWorkspace
export tarName=$tarName
export llmTarfile=$llmTarfile
Expand All @@ -536,6 +537,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
export NVIDIA_IMEX_CHANNELS=0
chmod +x ${scriptRunNode}
${srunCmd} 2>&1 | tee ${slurmOutputFile}
exit ${PIPESTATUS[0]}
""".stripIndent()
pipeline.writeFile(file: scriptLaunchDestPath, text: scriptContent)
Utils.exec(pipeline, script: "chmod +x ${scriptLaunchDestPath}", returnStdout: true)
Expand Down
Loading