@@ -154,6 +154,8 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
154154 " -e 's/.*Submitted batch job \\ ([0-9]\\ +\\ ).*/\\ 1/p' " +
155155 " -e 's/.*srun: job \\ ([0-9]\\ +\\ ) queued.*/\\ 1/p' " +
156156 " -e 's/.*srun: job \\ ([0-9]\\ +\\ ) has been allocated.*/\\ 1/p' " +
157+ " -e 's/.*SLURM_JOB_ID=\\ ([0-9]\\ +\\ ).*/\\ 1/p' " +
158+ " -e 's/.*SLURM_JOBID=\\ ([0-9]\\ +\\ ).*/\\ 1/p' " +
157159 " ${ slurmOutputFile} | tail -n1 || true\" "
158160 ),
159161 returnStdout : true
@@ -183,6 +185,7 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
         Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"cat ${slurmOutputFile} || true\""))
         echo "Slurm job did not submit successfully. No job ID found."
     } else {
+        // The original Slurm output file name is like "slurm-%j-*.out", so we replace the %j with the real job ID.
         def newSlurmOutputFile = slurmOutputFile.replace("%j", slurmJobID)
         Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"mv ${slurmOutputFile} ${newSlurmOutputFile} || true\""))
     }
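
For illustration, a minimal Groovy sketch of the rename step in this hunk, using a hypothetical output file path and job ID:

// Hypothetical values; in the pipeline the real path and ID come from the surrounding code.
def slurmOutputFile = "/tmp/slurm-%j-rank0.out"
def slurmJobID = "123456"
def newSlurmOutputFile = slurmOutputFile.replace("%j", slurmJobID)
assert newSlurmOutputFile == "/tmp/slurm-123456-rank0.out"
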
@@ -317,6 +320,10 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
         if (m1) ids << m1[0][1] // Extract the first captured group
         def m2 = (line =~ /srun: job (\d+) (queued|has been allocated)/)
         if (m2) ids << m2[0][1] // Extract the first captured group
+        def m3 = (line =~ /SLURM_JOB_ID=(\d+)/)
+        if (m3) ids << m3[0][1] // Extract the first captured group
+        def m4 = (line =~ /SLURM_JOBID=(\d+)/)
+        if (m4) ids << m4[0][1] // Extract the first captured group
         return ids
     }

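
As a sanity check, here is a minimal Groovy sketch of how the four patterns in this hunk pick up a job ID. The sample log lines are illustrative only, and the pattern behind m1 sits above this hunk, so the version used here is an assumption:

def sampleLines = [
    "sbatch: Submitted batch job 123456",                  // illustrative line for the m1-style pattern
    "srun: job 123456 queued and waiting for resources",
    "srun: job 123456 has been allocated resources",
    "SLURM_JOB_ID=123456",
    "SLURM_JOBID=123456",
]
sampleLines.each { line ->
    def ids = []
    def m1 = (line =~ /Submitted batch job (\d+)/)         // assumed m1 pattern
    if (m1) ids << m1[0][1]
    def m2 = (line =~ /srun: job (\d+) (queued|has been allocated)/)
    if (m2) ids << m2[0][1]
    def m3 = (line =~ /SLURM_JOB_ID=(\d+)/)
    if (m3) ids << m3[0][1]
    def m4 = (line =~ /SLURM_JOBID=(\d+)/)
    if (m4) ids << m4[0][1]
    assert ids == ["123456"]                               // exactly one pattern matches each sample line
}
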
@@ -370,12 +377,6 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
                 error "The Slurm node does not come online in the waiting period. Terminating the job."
             }
         }
-    } catch (Exception e) {
-        if (e.getMessage()?.contains("Failed to kill container")) {
-            echo "Known benign error ignored: ${e.getMessage()}"
-        } else {
-            throw e // Re-throw if it's a different IOException
-        }
     } finally {
         stage("Clean up SLURM Resources") {
             // Workaround to handle the interruption during clean up SLURM resources
@@ -1013,7 +1014,7 @@ def launchTestListCheck(pipeline)
     trtllm_utils.llmExecStepWithRetry(pipeline, script: """apt-get update && apt-get install \
         libffi-dev \
         -y""")
-    sh "nvidia-smi -q"
+    sh "nvidia-smi && nvidia-smi -q && nvidia-smi topo -m"
     // download TRT-LLM tarfile
     def tarName = BUILD_CONFIGS[VANILLA_CONFIG][TARNAME]
     def llmTarfile = "https://urm.nvidia.com/artifactory/${ARTIFACT_PATH}/${tarName}"
@@ -1421,8 +1422,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
         sh "nproc && free -g && hostname"
         echoNodeAndGpuInfo(pipeline, stageName)
         sh "cat ${MODEL_CACHE_DIR}/README"
-        sh "nvidia-smi -q"
-        sh "nvidia-smi topo -m"
+        sh "nvidia-smi && nvidia-smi -q && nvidia-smi topo -m"
         sh "df -h"

         // setup HF_HOME to cache model and datasets
@@ -1798,7 +1798,7 @@ def runPackageSanityCheck(pipeline, wheel_path, reinstall_dependencies=false, cp
     sh "nproc && free -g && hostname"
     sh "bash -c 'pip3 show tensorrt || true'"
     sh "cat ${MODEL_CACHE_DIR}/README"
-    sh "nvidia-smi -q"
+    sh "nvidia-smi && nvidia-smi -q && nvidia-smi topo -m"

     sh "pwd && ls -alh"
     trtllm_utils.llmExecStepWithRetry(pipeline, script: "wget -nv ${whlUrl}")
@@ -1849,7 +1849,6 @@ def checkStageName(stageNames) {
     }
 }

-// TODO: Update existing functions to use runInDockerOnNodeMultiStage and get rid of runInDockerOnNode
 def runInDockerOnNodeMultiStage(image, label, dockerArgs, needToDeleteDir=true)
 {
     return {
@@ -1863,18 +1862,11 @@ def runInDockerOnNodeMultiStage(image, label, dockerArgs, needToDeleteDir=true)
                 docker.image(image).inside(dockerArgs) {
                     runner()
                 }
-            }
-        }
-    }
-
-def runInDockerOnNode(image, label, dockerArgs)
-{
-    return {
-        stageName, runner -> stage(stageName) {
-            node(label) {
-                deleteDir()
-                docker.image(image).inside(dockerArgs) {
-                    runner()
+                catch (Exception e) {
+                    if (e.getMessage()?.contains("Failed to kill container")) {
+                        echo "Known benign error ignored: ${e.getMessage()}"
+                    } else {
+                        throw e // Re-throw if it's a different IOException
                     }
                 }
             }
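
A minimal usage sketch of the wrapper touched in this hunk, assuming a hypothetical image, node label, and docker arguments; per the (stageName, runner) signature above, the returned closure takes a stage name plus a body to run inside the container:

// Hypothetical image, label, and docker args for illustration only.
def sanity = runInDockerOnNodeMultiStage("urm.example.com/tensorrt-llm:latest", "gpu-node", "--gpus all")
sanity("Sanity check") {
    sh "nvidia-smi"
}
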
@@ -1893,10 +1885,8 @@ def runInKubernetes(pipeline, podSpec, containerName)
     }
 }

-def launchTestJobs(pipeline, testFilter, dockerNode=null)
+def launchTestJobs(pipeline, testFilter)
 {
-    def dockerArgs = "-v /mnt/scratch.trt_llm_data:/scratch.trt_llm_data:ro -v /tmp/ccache:${CCACHE_DIR}:rw -v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw --cap-add syslog"
-
     // IMPORTANT: Stage Configuration Syntax Requirement
     //
     // The test_to_stage_mapping.py script expects stage definitions in the following format:
@@ -2044,7 +2034,9 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
     fullSet += SBSATestConfigs.keySet()

     SBSASlurmTestConfigs = [
-        "GB200-PyTorch-1": ["gb200-single", "l0_gb200", 1, 1],
+        // Should use "gb200-single" instead of "gb200-x4" for single GPU testing
+        "GB200-PyTorch-1": ["gb200-single", "l0_gb200", 1, 2],
+        "GB200-PyTorch-2": ["gb200-x4", "l0_gb200", 2, 2],
         "GB200-4_GPUs-PyTorch-1": ["gb200-x4", "l0_gb200_multi_gpus", 1, 1, 4],
         "GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4", "l0_gb200_multi_gpus", 1, 1, 4],
     ]
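
For readability, a sketch of how one of these entries appears to decompose. The field meanings are inferred from the surrounding entries (node pool, test list, shard index, shard count, and an optional trailing GPU count) and are not confirmed by this hunk:

// Hypothetical decomposition of an entry like "GB200-PyTorch-2"; field names are assumptions.
def (pool, testList, shardIndex, shardCount) = ["gb200-x4", "l0_gb200", 2, 2]
assert shardIndex <= shardCount
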
@@ -2198,12 +2190,9 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
         def buildRunner = runInKubernetes(pipeline, buildSpec, "trt-llm")
         def sanityRunner = null

-        if (dockerNode) {
-            sanityRunner = runInDockerOnNode(values[0], dockerNode, dockerArgs)
-        } else {
-            def sanitySpec = createKubernetesPodConfig(values[0], gpu_type, k8s_arch)
-            sanityRunner = runInKubernetes(pipeline, sanitySpec, "trt-llm")
-        }
+
+        def sanitySpec = createKubernetesPodConfig(values[0], gpu_type, k8s_arch)
+        sanityRunner = runInKubernetes(pipeline, sanitySpec, "trt-llm")

         def wheelPath = "${values[4]}"
         def wheelName = ""
@@ -2447,17 +2436,10 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
                 stage("Skip - reused") {
                     echo "Skip - Passed in the last pipeline."
                 }
-            } else if (values instanceof List && dockerNode == null) {
+            } else if (values instanceof List) {
                 trtllm_utils.launchKubernetesPod(pipeline, values[0], "trt-llm", {
                     values[1]()
                 })
-            } else if (values instanceof List && dockerNode != null) {
-                node(dockerNode) {
-                    deleteDir()
-                    docker.image(LLM_DOCKER_IMAGE).inside(dockerArgs) {
-                        values[1]()
-                    }
-                }
             } else {
                 values()
             }