Commit ce2a373

[None][ci] Correct docker args for GPU devices and remove some stale CI codes (NVIDIA#7417)
Signed-off-by: Yanchao Lu <[email protected]>
1 parent d381773 commit ce2a373

2 files changed: +61 −50 lines

jenkins/L0_Test.groovy

Lines changed: 60 additions & 49 deletions
@@ -154,6 +154,8 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
         "-e 's/.*Submitted batch job \\([0-9]\\+\\).*/\\1/p' " +
         "-e 's/.*srun: job \\([0-9]\\+\\) queued.*/\\1/p' " +
         "-e 's/.*srun: job \\([0-9]\\+\\) has been allocated.*/\\1/p' " +
+        "-e 's/.*SLURM_JOB_ID=\\([0-9]\\+\\).*/\\1/p' " +
+        "-e 's/.*SLURM_JOBID=\\([0-9]\\+\\).*/\\1/p' " +
         "${slurmOutputFile} | tail -n1 || true\""
     ),
     returnStdout: true
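
For reference, these sed expressions are meant to emit nothing but a bare job ID, and the new SLURM_JOB_ID/SLURM_JOBID patterns cover logs that expose the ID through environment variables rather than sbatch/srun messages. A minimal standalone sketch of that behavior, assuming the enclosing command is `sed -n` (the `-n` sits just above this hunk); the sample file name and its contents are illustrative, not from a real run:

    # Build a throwaway file with typical Slurm output lines.
    printf '%s\n' \
        'srun: job 4242 queued and waiting for resources' \
        'srun: job 4242 has been allocated resources' \
        'SLURM_JOB_ID=4242' > /tmp/sample-slurm.out
    # Each -e expression rewrites a matching line to just the captured job ID
    # and prints it; tail -n1 keeps the most recent match.
    sed -n \
        -e 's/.*Submitted batch job \([0-9]\+\).*/\1/p' \
        -e 's/.*srun: job \([0-9]\+\) queued.*/\1/p' \
        -e 's/.*srun: job \([0-9]\+\) has been allocated.*/\1/p' \
        -e 's/.*SLURM_JOB_ID=\([0-9]\+\).*/\1/p' \
        -e 's/.*SLURM_JOBID=\([0-9]\+\).*/\1/p' \
        /tmp/sample-slurm.out | tail -n1    # prints: 4242
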
@@ -183,6 +185,7 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
         Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"cat ${slurmOutputFile} || true\""))
         echo "Slurm job did not submit successfully. No job ID found."
     } else {
+        // The original Slurm output file name is like "slurm-%j-*.out", we need to replace the %j with the real job ID.
         def newSlurmOutputFile = slurmOutputFile.replace("%j", slurmJobID)
         Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"mv ${slurmOutputFile} ${newSlurmOutputFile} || true\""))
     }
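
A quick bash illustration of the rename the new comment describes; the pipeline does the same substitution in Groovy with String.replace and then moves the file over SSH. The file name and job ID below are hypothetical:

    slurmOutputFile="slurm-%j-agent.out"     # hypothetical name containing the %j placeholder
    slurmJobID="4242"                        # hypothetical job ID recovered from the log
    touch "$slurmOutputFile"
    newSlurmOutputFile="${slurmOutputFile//%j/$slurmJobID}"
    mv "$slurmOutputFile" "$newSlurmOutputFile"    # -> slurm-4242-agent.out
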
@@ -317,6 +320,10 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
     if (m1) ids << m1[0][1] // Extract the first captured group
     def m2 = (line =~ /srun: job (\d+) (queued|has been allocated)/)
     if (m2) ids << m2[0][1] // Extract the first captured group
+    def m3 = (line =~ /SLURM_JOB_ID=(\d+)/)
+    if (m3) ids << m3[0][1] // Extract the first captured group
+    def m4 = (line =~ /SLURM_JOBID=(\d+)/)
+    if (m4) ids << m4[0][1] // Extract the first captured group
     return ids
 }
 
@@ -341,16 +348,37 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
     }
 
     if (CloudManager.isNodeOnline(nodeName)) {
+        def dockerGpuOption = ""
+
         node(nodeName) {
             sh """
                 env | sort
                 pwd && ls -alh
                 ls -alh ${env.WORKSPACE}
                 ls -alh ${env.WORKSPACE_TMP}
             """
+
+            sh "nproc && free -g && hostname"
+            echoNodeAndGpuInfo(pipeline, stageName)
+            sh "nvidia-smi && nvidia-smi -q && nvidia-smi topo -m"
+            // Use single quotes to avoid Jenkins variable expansion
+            sh 'echo "CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES"'
+            sh 'echo "NV_GPU: $NV_GPU"'
+
+            // Dynamically set GPU arguments based on environment variables
+            // https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/docker-specialized.html
+            dockerGPUOption = sh(script: """
+                if [ -n "\$NV_GPU" ]; then
+                    echo "--gpus '\\"device=\$NV_GPU\\"'"
+                elif [ -n "\$CUDA_VISIBLE_DEVICES" ]; then
+                    echo "--gpus '\\"device=\$CUDA_VISIBLE_DEVICES\\"'"
+                else
+                    echo "--gpus ${gpuCount}"
+                fi
+            """, returnStdout: true).trim()
         }
 
-        def dockerArgs = "--gpus ${gpuCount} " +
+        def dockerArgs = "${dockerGPUOption} " +
             "--cap-add=SYS_ADMIN " +
             "--ipc=host " +
             "--security-opt seccomp=unconfined " +
@@ -360,6 +388,8 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
             "-v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw " +
             "--cap-add syslog"
 
+        echo "Final dockerArgs: ${dockerArgs}"
+
         if (partition.clusterName == "dlcluster") {
             dockerArgs += " -e NVIDIA_IMEX_CHANNELS=0"
         }
@@ -370,12 +400,6 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
         error "The Slurm node does not come online in the waiting period. Terminating the job."
     }
 }
-} catch (Exception e) {
-    if (e.getMessage()?.contains("Failed to kill container")) {
-        echo "Known benign error ignored: ${e.getMessage()}"
-    } else {
-        throw e // Re-throw if it's a different IOException
-    }
 } finally {
     stage("Clean up SLURM Resources") {
         // Workaround to handle the interruption during clean up SLURM resources
@@ -939,7 +963,14 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
 
 def echoNodeAndGpuInfo(pipeline, stageName)
 {
-    String hostNodeName = sh(script: 'echo $HOST_NODE_NAME', returnStdout: true)
+    String hostNodeName = sh(script: '''
+        if [ -n "$HOST_NODE_NAME" ]; then
+            echo "$HOST_NODE_NAME"
+        else
+            hostname -f || hostname
+        fi
+    ''', returnStdout: true).trim()
+
     String gpuUuids = pipeline.sh(script: "nvidia-smi -q | grep \"GPU UUID\" | awk '{print \$4}' | tr '\n' ',' || true", returnStdout: true)
     pipeline.echo "HOST_NODE_NAME = ${hostNodeName} ; GPU_UUIDS = ${gpuUuids} ; STAGE_NAME = ${stageName}"
 }
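
The old one-liner printed an empty string whenever HOST_NODE_NAME was not set; the new script prefers the Jenkins-provided value and otherwise falls back to the node's own hostname. A standalone bash sketch of the fallback branch (the unset here is only to exercise the else path):

    unset HOST_NODE_NAME
    if [ -n "$HOST_NODE_NAME" ]; then
        echo "$HOST_NODE_NAME"
    else
        hostname -f || hostname    # fully qualified name, or the short name if -f is unsupported
    fi
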
@@ -1013,7 +1044,7 @@ def launchTestListCheck(pipeline)
     trtllm_utils.llmExecStepWithRetry(pipeline, script: """apt-get update && apt-get install \
         libffi-dev \
         -y""")
-    sh "nvidia-smi -q"
+    sh "nvidia-smi && nvidia-smi -q && nvidia-smi topo -m"
     // download TRT-LLM tarfile
     def tarName = BUILD_CONFIGS[VANILLA_CONFIG][TARNAME]
     def llmTarfile = "https://urm.nvidia.com/artifactory/${ARTIFACT_PATH}/${tarName}"
@@ -1421,8 +1452,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
     sh "nproc && free -g && hostname"
     echoNodeAndGpuInfo(pipeline, stageName)
     sh "cat ${MODEL_CACHE_DIR}/README"
-    sh "nvidia-smi -q"
-    sh "nvidia-smi topo -m"
+    sh "nvidia-smi && nvidia-smi -q && nvidia-smi topo -m"
     sh "df -h"
 
     // setup HF_HOME to cache model and datasets
@@ -1798,7 +1828,7 @@ def runPackageSanityCheck(pipeline, wheel_path, reinstall_dependencies=false, cp
     sh "nproc && free -g && hostname"
     sh "bash -c 'pip3 show tensorrt || true'"
     sh "cat ${MODEL_CACHE_DIR}/README"
-    sh "nvidia-smi -q"
+    sh "nvidia-smi && nvidia-smi -q && nvidia-smi topo -m"
 
     sh "pwd && ls -alh"
     trtllm_utils.llmExecStepWithRetry(pipeline, script: "wget -nv ${whlUrl}")
@@ -1849,33 +1879,26 @@ def checkStageName(stageNames) {
     }
 }
 
-// TODO: Update existing functions to use runInDockerOnNodeMultiStage and get rid of runInDockerOnNode
 def runInDockerOnNodeMultiStage(image, label, dockerArgs, needToDeleteDir=true)
 {
     return {
         runner -> node(label) {
-            if (needToDeleteDir) {
-                deleteDir()
-            }
-            stage('Pull Docker Image') {
-                docker.image(image).pull()
-            }
-            docker.image(image).inside(dockerArgs) {
-                runner()
-            }
-        }
-    }
-}
-
-def runInDockerOnNode(image, label, dockerArgs)
-{
-    return {
-        stageName, runner -> stage(stageName) {
-            node(label) {
-                deleteDir()
+            try {
+                if (needToDeleteDir) {
+                    deleteDir()
+                }
+                stage('Pull Docker Image') {
+                    docker.image(image).pull()
+                }
                 docker.image(image).inside(dockerArgs) {
                     runner()
                 }
+            } catch (Exception e) {
+                if (e.getMessage()?.contains("Failed to kill container")) {
+                    echo "Known benign error ignored: ${e.getMessage()}"
+                } else {
+                    throw e // Re-throw if it's a different IOException
+                }
             }
         }
     }
@@ -1893,10 +1916,8 @@ def runInKubernetes(pipeline, podSpec, containerName)
     }
 }
 
-def launchTestJobs(pipeline, testFilter, dockerNode=null)
+def launchTestJobs(pipeline, testFilter)
 {
-    def dockerArgs = "-v /mnt/scratch.trt_llm_data:/scratch.trt_llm_data:ro -v /tmp/ccache:${CCACHE_DIR}:rw -v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw --cap-add syslog"
-
     // IMPORTANT: Stage Configuration Syntax Requirement
     //
     // The test_to_stage_mapping.py script expects stage definitions in the following format:
@@ -2198,12 +2219,9 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
     def buildRunner = runInKubernetes(pipeline, buildSpec, "trt-llm")
     def sanityRunner = null
 
-    if (dockerNode) {
-        sanityRunner = runInDockerOnNode(values[0], dockerNode, dockerArgs)
-    } else {
-        def sanitySpec = createKubernetesPodConfig(values[0], gpu_type, k8s_arch)
-        sanityRunner = runInKubernetes(pipeline, sanitySpec, "trt-llm")
-    }
+
+    def sanitySpec = createKubernetesPodConfig(values[0], gpu_type, k8s_arch)
+    sanityRunner = runInKubernetes(pipeline, sanitySpec, "trt-llm")
 
     def wheelPath = "${values[4]}"
     def wheelName = ""
@@ -2447,17 +2465,10 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
     stage("Skip - reused") {
         echo "Skip - Passed in the last pipeline."
     }
-} else if (values instanceof List && dockerNode == null) {
+} else if (values instanceof List) {
     trtllm_utils.launchKubernetesPod(pipeline, values[0], "trt-llm", {
         values[1]()
     })
-} else if (values instanceof List && dockerNode != null) {
-    node(dockerNode) {
-        deleteDir()
-        docker.image(LLM_DOCKER_IMAGE).inside(dockerArgs) {
-            values[1]()
-        }
-    }
 } else {
     values()
 }

jenkins/scripts/slurm_run.sh

Lines changed: 1 addition & 1 deletion
@@ -22,7 +22,7 @@ if [ $SLURM_LOCALID -eq 0 ]; then
     which python3
     python3 --version
     apt-get install -y libffi-dev
-    nvidia-smi
+    nvidia-smi && nvidia-smi -q && nvidia-smi topo -m
     cd $llmSrcNode && pip3 install --retries 1 -r requirements-dev.txt
     cd $resourcePathNode && pip3 install --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl
     git config --global --add safe.directory "*"
