1- @Library ([' bloom-jenkins-shared-lib@dev-yanchaol-slurm-output ' , ' trtllm-jenkins-shared-lib@main' ]) _
1+ @Library ([' bloom-jenkins-shared-lib@main ' , ' trtllm-jenkins-shared-lib@main' ]) _
22
33import java.lang.InterruptedException
44import groovy.transform.Field
@@ -44,8 +44,9 @@ DLFW_IMAGE = "urm.nvidia.com/docker/nvidia/pytorch:25.06-py3"
4444UBUNTU_22_04_IMAGE = " urm.nvidia.com/docker/ubuntu:22.04"
4545UBUNTU_24_04_IMAGE = " urm.nvidia.com/docker/ubuntu:24.04"
4646
47- POD_TIMEOUT_SECONDS = env. podTimeoutSeconds ? env. podTimeoutSeconds : " 21600"
48- POD_TIMEOUT_SECONDS_TMP = env. podTimeoutSeconds ? env. podTimeoutSeconds : " 43200"
47+ POD_TIMEOUT_SECONDS_TEST = env. podTimeoutSeconds ? env. podTimeoutSeconds : " 21600"
48+ POD_TIMEOUT_SECONDS_BUILD = env. podTimeoutSeconds ? env. podTimeoutSeconds : " 43200"
49+ POD_TIMEOUT_SECONDS_SLURM = env. podTimeoutSeconds ? env. podTimeoutSeconds : " 79200" // Use 22 hours to leave a 2-hour buffer.
4950
5051// Literals for easier access.
5152@Field
@@ -133,7 +134,7 @@ def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String st
133134}
134135
135136// TODO: consolidate slurm related code for both multi nodes and single nodes
136- def cleanUpNodeResourcesMultiNodes (def pipeline , SlurmCluster cluster , String jobUID ) {
137+ def cleanUpNodeResourcesMultiNodes (def pipeline , SlurmCluster cluster , String jobUID , String slurmOutputFile ) {
137138 withCredentials([usernamePassword(credentialsId : ' svc_tensorrt' , usernameVariable : ' USERNAME' , passwordVariable : ' PASSWORD' )]) {
138139 def remote = [
139140 ip : cluster. ip,
@@ -144,20 +145,50 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
144145 ]
145146
146147 Utils . exec(pipeline, script : " apt-get update && apt-get install -y sshpass openssh-client" )
147- pipeline. stage(' Clean up SLURM Agent Resources' ) {
148- Utils . exec(
149- pipeline,
150- timeout : false ,
151- script : Utils . sshUserCmd(
152- remote,
153- " rm -rf /home/svc_tensorrt/bloom/scripts/${ jobUID} "
154- )
155- )
148+
149+ def slurmJobID = Utils . exec(
150+ pipeline,
151+ script : Utils . sshUserCmd(
152+ remote,
153+ " \" sed -n " +
154+ " -e 's/.*Submitted batch job \\ ([0-9]\\ +\\ ).*/\\ 1/p' " +
155+ " -e 's/.*srun: job \\ ([0-9]\\ +\\ ) queued.*/\\ 1/p' " +
156+ " -e 's/.*srun: job \\ ([0-9]\\ +\\ ) has been allocated.*/\\ 1/p' " +
157+ " ${ slurmOutputFile} | tail -n1\" "
158+ ),
159+ returnStdout : true
160+ ). trim()
161+
162+ if (! slurmJobID || ! slurmJobID. isNumber()) {
163+ Utils . exec(pipeline, script : Utils . sshUserCmd(remote, " \" cat ${ slurmOutputFile} \" " ))
164+ error(" Slurm job did not submit successfully. No job ID found." )
156165 }
166+
167+ Utils . exec(pipeline, script : " echo Slurm job ID: ${ slurmJobID} " )
168+
169+ Utils . exec(pipeline, script : " echo Sleeping to allow slurm job termination; sleep 30" )
170+
171+ Utils . exec(
172+ pipeline,
173+ script : Utils . sshUserCmd(
174+ remote,
175+ " \" scancel ${ slurmJobID} || true; sacct -j ${ slurmJobID} --format=JobID,JobName%100,Partition%15,Account%15,State,ExitCode,NodeList%30 || true; scontrol show job ${ slurmJobID} || true\" "
176+ )
177+ )
178+
179+ Utils . exec(
180+ pipeline,
181+ script : Utils . sshUserCmd(
182+ remote,
183+ " rm -rf /home/svc_tensorrt/bloom/scripts/${ jobUID} "
184+ )
185+ )
186+
187+ Utils . exec(pipeline, script : " echo Slurm job ID: ${ slurmJobID} cleaned up" )
157188 }
158189}
159190
160- def cleanUpNodeResources (def pipeline , SlurmCluster cluster , String nodeName ) {
191+ def cleanUpNodeResources (def pipeline , SlurmCluster cluster , String nodeName , String slurmJobID ) {
161192 withCredentials([usernamePassword(credentialsId : ' svc_tensorrt' , usernameVariable : ' USERNAME' , passwordVariable : ' PASSWORD' )]) {
162193 def remote = [
163194 ip : cluster. ip,
@@ -168,17 +199,26 @@ def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName){
168199 ]
169200
170201 Utils . exec(pipeline, script : " apt-get update && apt-get install -y sshpass openssh-client" )
171- pipeline . stage( ' Clean up SLURM Agent Resources ' ) {
172- Utils . exec(
173- pipeline,
174- timeout : false ,
175- script : Utils . sshUserCmd(
176- remote,
177- " rm -rf /home/svc_tensorrt/bloom/scripts/agent- ${ nodeName } .jar /home/svc_tensorrt/bloom/scripts/ ${ nodeName } -slurm_jenkins_agent_setup.sh "
178- )
202+
203+ Utils . exec(pipeline, script : " echo Slurm job ID: ${ slurmJobID } " )
204+
205+ Utils . exec(
206+ pipeline,
207+ script : Utils . sshUserCmd(
208+ remote,
209+ " \" scancel ${ slurmJobID } || true; sacct -j ${ slurmJobID } --format=JobID,JobName%100,Partition%15,Account%15,State,ExitCode,NodeList%30 || true; scontrol show job ${ slurmJobID } || true \" "
179210 )
180- Utils . exec(pipeline, script : " echo done" )
181- }
211+ )
212+
213+ Utils . exec(
214+ pipeline,
215+ script : Utils . sshUserCmd(
216+ remote,
217+ " rm -rf /home/svc_tensorrt/bloom/scripts/agent-${ nodeName} .jar /home/svc_tensorrt/bloom/scripts/${ nodeName} -slurm_jenkins_agent_setup.sh"
218+ )
219+ )
220+
221+ Utils . exec(pipeline, script : " echo Slurm job ID: ${ slurmJobID} cleaned up" )
182222 }
183223}
184224
@@ -224,6 +264,8 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
224264 def customWorkspace = " /tmp/${ nodeName} "
225265 def nodeSecret = CloudManager . createNode(nodeName, customWorkspace)
226266
267+ def slurmJobID = null
268+
227269 try {
228270 // Run ssh command to start node in desired cluster via SLURM
229271 withCredentials([usernamePassword(credentialsId : ' svc_tensorrt' , usernameVariable : ' USERNAME' , passwordVariable : ' PASSWORD' )]) {
@@ -245,24 +287,47 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
245287
246288 Utils . exec(pipeline, script : " sshpass -p '${ remote.passwd} ' scp -r -p ${ COMMON_SSH_OPTIONS} ${ jenkinsSetupPath} ${ remote.user} @${ remote.host} :~/bloom/scripts/${ nodeName} -slurm_jenkins_agent_setup.sh" , numRetries : 3 ,)
247289
248- sh( label : " Print slurm_jenkins_agent_setup.sh script " , script : " cat ${ jenkinsSetupPath} " )
290+ Utils . exec(pipeline , script : " cat ${ jenkinsSetupPath} " )
249291
250- Utils . exec(
292+ def slurmSubmitOutput = Utils . exec(
251293 pipeline,
252294 timeout : false ,
253295 script : Utils . sshUserCmd(
254- remote,
255- """ ${ SlurmConfig.generateCommand(cluster, partition, nodeSecret, nodeName, Jenkins.instance.rootUrl)} """
256- )
296+ remote,
297+ " \" ${ SlurmConfig.generateCommand(cluster, partition, nodeSecret, nodeName, Jenkins.instance.rootUrl)} \" "
298+ ),
299+ returnStdout : true
257300 )
301+
302+ def jobIDs = slurmSubmitOutput
303+ .readLines()
304+ .collect { it. trim() }
305+ .collectMany { line ->
306+ def ids = []
307+ def m1 = (line =~ / Submitted batch job (\d +)/ )
308+ if (m1) ids << m1[0 ][1 ] // Extract the first captured group
309+ def m2 = (line =~ / srun: job (\d +) (queued|has been allocated)/ )
310+ if (m2) ids << m2[0 ][1 ] // Extract the first captured group
311+ return ids
312+ }
313+
314+ slurmJobID = jobIDs ? jobIDs[-1 ] : null
315+
316+ if (! slurmJobID || ! slurmJobID. isNumber()) {
317+ error(" Slurm job did not submit successfully. No job ID found.\n Submission output:\n ${ slurmSubmitOutput} " )
318+ }
319+ Utils . exec(pipeline, script : " echo Slurm job ID: ${ slurmJobID} " )
258320 Utils . exec(pipeline, script : " echo Sleeping to allow agent initialization; sleep 30" )
259321 }
260322 }
261323
262324 stage(' Checking if the Node is Online' ) {
263325 def counter = 0
264- while (! CloudManager . isNodeOnline(nodeName) && counter < 12 ) {
265- sleep(time : 10 , unit : ' MINUTES' ) // Wait 10 minutes to check status of the node again
326+ // The Slurm job is submitted with a 5-hour timeout, and the K8S pod is evicted after 22 hours.
327+ // Wait up to 15 hours (90 checks x 10 minutes) for the node to come online, which leaves a 2-hour buffer.
328+ while (! CloudManager . isNodeOnline(nodeName) && counter < 90 ) {
329+ // Wait 10 minutes to check status of the node again
330+ sleep(time : 10 , unit : ' MINUTES' )
266331 counter++
267332 }
268333
@@ -293,12 +358,16 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
293358 slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE , nodeName, dockerArgs, true )
294359 executeLLMTestOnSlurm(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, skipInstallWheel, cpver, slurmRunner)
295360 } else {
296- echo " The node does not come online in 2 hours, terminating the job"
361+ error " The Slurm node does not come online in the waiting period. Terminating the job. "
297362 }
298363 }
299364 } finally {
300- cleanUpNodeResources(pipeline, cluster, nodeName)
301- CloudManager . destroyNode(nodeName)
365+ stage(' Clean up SLURM Resources' ) {
366+ Utils . exec(pipeline, script : " echo Sleeping to allow docker stop; sleep 30" )
367+ CloudManager . destroyNode(nodeName)
368+ Utils . exec(pipeline, script : " echo Sleeping to allow node destruction; sleep 30" )
369+ cleanUpNodeResources(pipeline, cluster, nodeName, slurmJobID)
370+ }
302371 }
303372}
304373
@@ -321,13 +390,9 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
321390 String customSuffix = " ${ env.BUILD_TAG} -${ UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)} " . toLowerCase()
322391 def jobUID = " ${ cluster.host} -multi_node_test-${ customSuffix} "
323392
324- sh(
325- label : " Print env for debugging" ,
326- script : """
327- env | sort
328- pwd && ls -alh
329- """
330- )
393+ Utils . exec(pipeline, script : " env | sort && pwd && ls -alh" )
394+
395+ def slurmOutputFile = null
331396
332397 try {
333398 // Run ssh command to start node in desired cluster via SLURM
@@ -353,7 +418,9 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
353418 def resourcePathNode = " /tmp"
354419 def llmSrcNode = " ${ resourcePathNode} /TensorRT-LLM/src"
355420 def llmSrcLocal = " ${ llmPath} /TensorRT-LLM/src"
356- def scriptRunNode = " ${ jobWorkspace} /slurm_run.sh"
421+ def scriptRunNode = " ${ jobWorkspace} /${ jobUID} -slurm_run.sh"
422+ def scriptLaunch = " ${ jobWorkspace} /${ jobUID} -slurm_launch.sh"
423+ slurmOutputFile = " ${ jobWorkspace} /${ jobUID} -slurm_output.log"
357424 def testListPathNode = " ${ jobWorkspace} /${ testList} .txt"
358425 def waivesListPathNode = " ${ jobWorkspace} /waives.txt"
359426 def isAarch64 = config. contains(" aarch64" )
@@ -370,8 +437,10 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
370437 // Upload slurm_run_sh to Frontend node
371438 def scriptRunLocalPath = " ${ llmSrcLocal} /jenkins/scripts/slurm_run.sh"
372439 Utils . exec(pipeline, script : " chmod +x ${ scriptRunLocalPath} " , returnStdout : true )
440+
373441 Utils . exec(pipeline, script : " sshpass -p '${ remote.passwd} ' scp -r -p ${ COMMON_SSH_OPTIONS} ${ scriptRunLocalPath} ${ remote.user} @${ remote.host} :${ scriptRunNode} " , numRetries : 3 ,)
374- sh(label : " Print slurm_run.sh script" , script : " cat ${ scriptRunLocalPath} " )
442+ Utils . exec(pipeline, script : " cat ${ scriptRunLocalPath} " )
443+
375444 // Upload waives.txt to Frontend node
376445 def waivesListLocalPath = " ${ llmSrcLocal} /tests/integration/test_lists/waives.txt"
377446 Utils . exec(pipeline, script : " sshpass -p '${ remote.passwd} ' scp -r -p ${ COMMON_SSH_OPTIONS} ${ waivesListLocalPath} ${ remote.user} @${ remote.host} :${ waivesListPathNode} " , numRetries : 3 ,)
@@ -403,7 +472,6 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
403472 " --container-env=NVIDIA_IMEX_CHANNELS"
404473 ]. join(" " )
405474
406- def scriptLaunch = " /home/svc_tensorrt/bloom/scripts/${ jobUID} /slurm_launch.sh"
407475 def srunCmd = SlurmConfig . generateMultiNodeCommand(partition, taskArgs, scriptRunNode)
408476 scriptLaunchDestPath = Utils . createTempLocation(pipeline, " ./slurm_launch.sh" )
409477 def scriptContent = """ #!/bin/bash
@@ -423,28 +491,33 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
423491 export MODEL_CACHE_DIR=$MODEL_CACHE_DIR
424492 export NVIDIA_IMEX_CHANNELS=0
425493 chmod +x ${ scriptRunNode}
426- ${ srunCmd}
494+ ${ srunCmd} 2>&1 | tee ${ slurmOutputFile }
427495 """ . stripIndent()
428496 pipeline. writeFile(file : scriptLaunchDestPath, text : scriptContent)
429497 Utils . exec(pipeline, script : " chmod +x ${ scriptLaunchDestPath} " , returnStdout : true )
430498 Utils . exec(pipeline, script : " sshpass -p '${ remote.passwd} ' scp -r -p ${ COMMON_SSH_OPTIONS} ${ scriptLaunchDestPath} ${ remote.user} @${ remote.host} :${ scriptLaunch} " , numRetries : 3 ,)
431- sh( label : " Print slurm_launch.sh script " , script : " cat ${ scriptLaunchDestPath} " )
499+ Utils . exec(pipeline , script : " cat ${ scriptLaunchDestPath} " )
432500 }
501+
433502 stage(' Run Test' ) {
434- def scriptLaunch = " ${ jobWorkspace} /slurm_launch.sh"
435503 Utils . exec(
436504 pipeline,
437505 timeout : false ,
438506 script : Utils . sshUserCmd(
439507 remote,
440- """ bash ${ scriptLaunch} " ""
508+ " \" bash ${ scriptLaunch} \ ""
441509 )
442510 )
443511 }
512+
513+ echo " Finished test stage execution."
444514 }
445515 } finally {
446516 uploadResults(pipeline, cluster, jobUID, stageName)
447- cleanUpNodeResourcesMultiNodes(pipeline, cluster, jobUID)
517+
518+ stage(' Clean up SLURM Resources' ) {
519+ cleanUpNodeResourcesMultiNodes(pipeline, cluster, jobUID, slurmOutputFile)
520+ }
448521 }
449522}
450523
@@ -573,6 +646,14 @@ def cacheErrorAndUploadResult(stageName, taskRunner, finallyRunner, noResultIfSu
573646 } else {
574647 sh ' if [ "$(id -u)" -eq 0 ]; then dmesg; fi'
575648 if (noResultIfSuccess && ! stageIsFailed) {
649+ // Clean up the workspace
650+ sh """
651+ env | sort
652+ pwd && ls -alh
653+ rm -rf ./*
654+ """
655+
656+ echo " Finished test stage execution."
576657 return
577658 }
578659 echo " noResultIfSuccess: ${ noResultIfSuccess} , stageIsFailed: ${ stageIsFailed} "
@@ -593,14 +674,16 @@ def cacheErrorAndUploadResult(stageName, taskRunner, finallyRunner, noResultIfSu
593674 " ${ UPLOAD_PATH} /test-results/"
594675 )
595676 junit(testResults : " ${ stageName} /results*.xml" )
596-
597- // Clean up the workspace
598- sh """
599- env | sort
600- pwd && ls -alh
601- rm -rf ./*
602- """
603677 }
678+
679+ // Clean up the workspace
680+ sh """
681+ env | sort
682+ pwd && ls -alh
683+ rm -rf ./*
684+ """
685+
686+ echo " Finished test stage execution."
604687 }
605688}
606689
@@ -643,7 +726,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
643726 containerConfig = """
644727 - name: trt-llm
645728 image: ${ image}
646- command: ['sleep', ${ POD_TIMEOUT_SECONDS } ]
729+ command: ['sleep', ${ POD_TIMEOUT_SECONDS_SLURM } ]
647730 tty: true
648731 resources:
649732 requests:
@@ -661,7 +744,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
661744 containerConfig = """
662745 - name: trt-llm
663746 image: ${ image}
664- command: ['sleep', ${ POD_TIMEOUT_SECONDS_TMP } ]
747+ command: ['sleep', ${ POD_TIMEOUT_SECONDS_BUILD } ]
665748 volumeMounts:
666749 - name: sw-tensorrt-pvc
667750 mountPath: "/mnt/sw-tensorrt-pvc"
@@ -727,7 +810,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
727810 containerConfig = """
728811 - name: trt-llm
729812 image: ${ image}
730- command: ['sleep', ${ POD_TIMEOUT_SECONDS } ]
813+ command: ['sleep', ${ POD_TIMEOUT_SECONDS_TEST } ]
731814 tty: true
732815 resources:
733816 requests:
@@ -2167,10 +2250,13 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
21672250 }
21682251 echo " ###### Check pip install Start ######"
21692252 withEnv(libEnv) {
2253+ // Retry 2 times if timeout occurs.
21702254 sh " env | sort"
2171- timeout(time : 30 , unit : ' MINUTES' ) {
2172- checkPipInstall(pipeline, " ${ cpu_arch} /${ wheelPath} " )
2173- }
2255+ trtllm_utils. llmRetry(1 , " checkPipInstall" , {
2256+ timeout(time : 30 , unit : ' MINUTES' ) {
2257+ checkPipInstall(pipeline, " ${ cpu_arch} /${ wheelPath} " )
2258+ }
2259+ })
21742260 }
21752261 echo " ###### Run LLMAPI tests Start ######"
21762262 def config = VANILLA_CONFIG
0 commit comments