@@ -154,6 +154,8 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
     " -e 's/.*Submitted batch job \\([0-9]\\+\\).*/\\1/p' " +
     " -e 's/.*srun: job \\([0-9]\\+\\) queued.*/\\1/p' " +
     " -e 's/.*srun: job \\([0-9]\\+\\) has been allocated.*/\\1/p' " +
+    " -e 's/.*SLURM_JOB_ID=\\([0-9]\\+\\).*/\\1/p' " +
+    " -e 's/.*SLURM_JOBID=\\([0-9]\\+\\).*/\\1/p' " +
     " ${slurmOutputFile} | tail -n1 || true\""
 ),
 returnStdout: true
@@ -183,6 +185,7 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
         Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"cat ${slurmOutputFile} || true\""))
         echo "Slurm job did not submit successfully. No job ID found."
     } else {
+        // The original Slurm output file name is like "slurm-%j-*.out", so we need to replace %j with the real job ID.
         def newSlurmOutputFile = slurmOutputFile.replace("%j", slurmJobID)
         Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"mv ${slurmOutputFile} ${newSlurmOutputFile} || true\""))
     }
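
For illustration only (not part of the commit): the %j substitution above turns the Slurm output file pattern into a concrete file name once the job ID is known. The file name and job ID below are hypothetical.

    // Hedged sketch, not part of the diff; file name and job ID are made up.
    def slurmOutputFile = "slurm-%j-run.out"
    def slurmJobID = "123456"
    assert slurmOutputFile.replace("%j", slurmJobID) == "slurm-123456-run.out"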
@@ -317,6 +320,10 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
         if (m1) ids << m1[0][1] // Extract the first captured group
         def m2 = (line =~ /srun: job (\d+) (queued|has been allocated)/)
         if (m2) ids << m2[0][1] // Extract the first captured group
+        def m3 = (line =~ /SLURM_JOB_ID=(\d+)/)
+        if (m3) ids << m3[0][1] // Extract the first captured group
+        def m4 = (line =~ /SLURM_JOBID=(\d+)/)
+        if (m4) ids << m4[0][1] // Extract the first captured group
         return ids
     }
 
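
To illustrate the regex additions above, a minimal Groovy sketch; the helper name and sample log lines are hypothetical, and only two of the four patterns are shown.

    // Hedged sketch, not part of the diff: how the added SLURM_JOB_ID pattern behaves.
    def extractIds = { String line ->
        def ids = []
        def m1 = (line =~ /Submitted batch job (\d+)/)
        if (m1) ids << m1[0][1]
        def m3 = (line =~ /SLURM_JOB_ID=(\d+)/)
        if (m3) ids << m3[0][1]
        return ids
    }
    assert extractIds("Submitted batch job 4242") == ["4242"]
    assert extractIds("export SLURM_JOB_ID=4242") == ["4242"]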
@@ -341,16 +348,37 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
     }
 
     if (CloudManager.isNodeOnline(nodeName)) {
+        def dockerGPUOption = ""
+
         node(nodeName) {
             sh """
                 env | sort
                 pwd && ls -alh
                 ls -alh ${env.WORKSPACE}
                 ls -alh ${env.WORKSPACE_TMP}
             """
+
+            sh "nproc && free -g && hostname"
+            echoNodeAndGpuInfo(pipeline, stageName)
+            sh "nvidia-smi && nvidia-smi -q && nvidia-smi topo -m"
+            // Use single quotes to avoid Jenkins variable expansion
+            sh 'echo "CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES"'
+            sh 'echo "NV_GPU: $NV_GPU"'
+
+            // Dynamically set GPU arguments based on environment variables
+            // https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/docker-specialized.html
+            dockerGPUOption = sh(script: """
+                if [ -n "\$NV_GPU" ]; then
+                    echo "--gpus '\\"device=\$NV_GPU\\"'"
+                elif [ -n "\$CUDA_VISIBLE_DEVICES" ]; then
+                    echo "--gpus '\\"device=\$CUDA_VISIBLE_DEVICES\\"'"
+                else
+                    echo "--gpus ${gpuCount}"
+                fi
+            """, returnStdout: true).trim()
         }
 
-        def dockerArgs = "--gpus ${gpuCount} " +
+        def dockerArgs = "${dockerGPUOption} " +
             " --cap-add=SYS_ADMIN " +
             " --ipc=host " +
             " --security-opt seccomp=unconfined " +
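
The shell branch above prefers NV_GPU, then CUDA_VISIBLE_DEVICES, and falls back to a plain GPU count. A minimal Groovy sketch of the same precedence, with hypothetical values (not part of the commit):

    // Hedged sketch, not part of the diff: the GPU-option precedence in plain Groovy.
    def pickGpuOption = { String nvGpu, String cudaVisible, int gpuCount ->
        if (nvGpu) return "--gpus '\"device=${nvGpu}\"'"            // e.g. --gpus '"device=2,3"'
        if (cudaVisible) return "--gpus '\"device=${cudaVisible}\"'"
        return "--gpus ${gpuCount}"
    }
    assert pickGpuOption("2,3", null, 8) == '--gpus \'"device=2,3"\''
    assert pickGpuOption(null, "0", 8) == '--gpus \'"device=0"\''
    assert pickGpuOption(null, null, 8) == "--gpus 8"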
@@ -360,6 +388,8 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
         " -v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw " +
         " --cap-add syslog"
 
+    echo "Final dockerArgs: ${dockerArgs}"
+
     if (partition.clusterName == "dlcluster") {
         dockerArgs += " -e NVIDIA_IMEX_CHANNELS=0"
     }
@@ -370,12 +400,6 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
             error "The Slurm node does not come online in the waiting period. Terminating the job."
         }
     }
-} catch (Exception e) {
-    if (e.getMessage()?.contains("Failed to kill container")) {
-        echo "Known benign error ignored: ${e.getMessage()}"
-    } else {
-        throw e // Re-throw if it's a different IOException
-    }
 } finally {
     stage("Clean up SLURM Resources") {
         // Workaround to handle the interruption during clean up SLURM resources
@@ -939,7 +963,14 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
 
 def echoNodeAndGpuInfo(pipeline, stageName)
 {
-    String hostNodeName = sh(script: 'echo $HOST_NODE_NAME', returnStdout: true)
+    String hostNodeName = sh(script: '''
+        if [ -n "$HOST_NODE_NAME" ]; then
+            echo "$HOST_NODE_NAME"
+        else
+            hostname -f
+        fi
+    ''', returnStdout: true).trim()
+
     String gpuUuids = pipeline.sh(script: "nvidia-smi -q | grep \"GPU UUID\" | awk '{print \$4}' | tr '\n' ',' || true", returnStdout: true)
     pipeline.echo "HOST_NODE_NAME = ${hostNodeName}; GPU_UUIDS = ${gpuUuids}; STAGE_NAME = ${stageName}"
 }
@@ -1013,7 +1044,7 @@ def launchTestListCheck(pipeline)
         trtllm_utils.llmExecStepWithRetry(pipeline, script: """apt-get update && apt-get install \
             libffi-dev \
             -y""")
-        sh "nvidia-smi -q"
+        sh "nvidia-smi && nvidia-smi -q && nvidia-smi topo -m"
         // download TRT-LLM tarfile
         def tarName = BUILD_CONFIGS[VANILLA_CONFIG][TARNAME]
         def llmTarfile = "https://urm.nvidia.com/artifactory/${ARTIFACT_PATH}/${tarName}"
@@ -1421,8 +1452,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
         sh "nproc && free -g && hostname"
         echoNodeAndGpuInfo(pipeline, stageName)
         sh "cat ${MODEL_CACHE_DIR}/README"
-        sh "nvidia-smi -q"
-        sh "nvidia-smi topo -m"
+        sh "nvidia-smi && nvidia-smi -q && nvidia-smi topo -m"
         sh "df -h"
 
         // setup HF_HOME to cache model and datasets
@@ -1798,7 +1828,7 @@ def runPackageSanityCheck(pipeline, wheel_path, reinstall_dependencies=false, cp
         sh "nproc && free -g && hostname"
         sh "bash -c 'pip3 show tensorrt || true'"
         sh "cat ${MODEL_CACHE_DIR}/README"
-        sh "nvidia-smi -q"
+        sh "nvidia-smi && nvidia-smi -q && nvidia-smi topo -m"
 
         sh "pwd && ls -alh"
         trtllm_utils.llmExecStepWithRetry(pipeline, script: "wget -nv ${whlUrl}")
@@ -1849,33 +1879,26 @@ def checkStageName(stageNames) {
     }
 }
 
-// TODO: Update existing functions to use runInDockerOnNodeMultiStage and get rid of runInDockerOnNode
 def runInDockerOnNodeMultiStage(image, label, dockerArgs, needToDeleteDir=true)
 {
     return {
         runner -> node(label) {
-            if (needToDeleteDir) {
-                deleteDir()
-            }
-            stage('Pull Docker Image') {
-                docker.image(image).pull()
-            }
-            docker.image(image).inside(dockerArgs) {
-                runner()
-            }
-        }
-    }
-}
-
-def runInDockerOnNode(image, label, dockerArgs)
-{
-    return {
-        stageName, runner -> stage(stageName) {
-            node(label) {
-                deleteDir()
+            try {
+                if (needToDeleteDir) {
+                    deleteDir()
+                }
+                stage('Pull Docker Image') {
+                    docker.image(image).pull()
+                }
                 docker.image(image).inside(dockerArgs) {
                     runner()
                 }
+            } catch (Exception e) {
+                if (e.getMessage()?.contains("Failed to kill container")) {
+                    echo "Known benign error ignored: ${e.getMessage()}"
+                } else {
+                    throw e // Re-throw if it's a different IOException
+                }
             }
         }
     }
@@ -1893,10 +1916,8 @@ def runInKubernetes(pipeline, podSpec, containerName)
     }
 }
 
-def launchTestJobs(pipeline, testFilter, dockerNode=null)
+def launchTestJobs(pipeline, testFilter)
 {
-    def dockerArgs = "-v /mnt/scratch.trt_llm_data:/scratch.trt_llm_data:ro -v /tmp/ccache:${CCACHE_DIR}:rw -v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw --cap-add syslog"
-
     // IMPORTANT: Stage Configuration Syntax Requirement
     //
     // The test_to_stage_mapping.py script expects stage definitions in the following format:
@@ -2044,8 +2065,7 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
     fullSet += SBSATestConfigs.keySet()
 
     SBSASlurmTestConfigs = [
-        // Disable GB200-PyTorch-1 due to OOM (https://nvbugspro.nvidia.com/bug/5490507)
-        // "GB200-PyTorch-1": ["gb200-single", "l0_gb200", 1, 1],
+        "GB200-PyTorch-1": ["gb200-single", "l0_gb200", 1, 1],
         "GB200-4_GPUs-PyTorch-1": ["gb200-x4", "l0_gb200_multi_gpus", 1, 1, 4],
         "GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4", "l0_gb200_multi_gpus", 1, 1, 4],
     ]
@@ -2199,12 +2219,9 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
         def buildRunner = runInKubernetes(pipeline, buildSpec, "trt-llm")
         def sanityRunner = null
 
-        if (dockerNode) {
-            sanityRunner = runInDockerOnNode(values[0], dockerNode, dockerArgs)
-        } else {
-            def sanitySpec = createKubernetesPodConfig(values[0], gpu_type, k8s_arch)
-            sanityRunner = runInKubernetes(pipeline, sanitySpec, "trt-llm")
-        }
+
+        def sanitySpec = createKubernetesPodConfig(values[0], gpu_type, k8s_arch)
+        sanityRunner = runInKubernetes(pipeline, sanitySpec, "trt-llm")
 
         def wheelPath = "${values[4]}"
         def wheelName = ""
@@ -2448,17 +2465,10 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
                 stage("Skip - reused") {
                     echo "Skip - Passed in the last pipeline."
                 }
-            } else if (values instanceof List && dockerNode == null) {
+            } else if (values instanceof List) {
                 trtllm_utils.launchKubernetesPod(pipeline, values[0], "trt-llm", {
                     values[1]()
                 })
-            } else if (values instanceof List && dockerNode != null) {
-                node(dockerNode) {
-                    deleteDir()
-                    docker.image(LLM_DOCKER_IMAGE).inside(dockerArgs) {
-                        values[1]()
-                    }
-                }
             } else {
                 values()
             }