Commit 460a34c

[None][chore] Some improvements for CI stability (#7199)
Signed-off-by: Yanchao Lu <[email protected]>
1 parent a419b77 commit 460a34c

File tree

2 files changed: +154 -55 lines changed


jenkins/Build.groovy

Lines changed: 2 additions & 3 deletions
@@ -19,8 +19,7 @@ LLM_DOCKER_IMAGE = env.dockerImage
 // Always use x86_64 image for agent
 AGENT_IMAGE = env.dockerImage.replace("aarch64", "x86_64")
 
-POD_TIMEOUT_SECONDS = env.podTimeoutSeconds ? env.podTimeoutSeconds : "21600"
-POD_TIMEOUT_SECONDS_TMP = env.podTimeoutSeconds ? env.podTimeoutSeconds : "43200"
+POD_TIMEOUT_SECONDS_BUILD = env.podTimeoutSeconds ? env.podTimeoutSeconds : "43200"
 
 // Literals for easier access.
 @Field
@@ -169,7 +168,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64")
         containerConfig = """
                 - name: trt-llm
                   image: ${image}
-                  command: ['sleep', ${POD_TIMEOUT_SECONDS_TMP}]
+                  command: ['sleep', ${POD_TIMEOUT_SECONDS_BUILD}]
                   volumeMounts:
                   - name: sw-tensorrt-pvc
                     mountPath: "/mnt/sw-tensorrt-pvc"

jenkins/L0_Test.groovy

Lines changed: 152 additions & 52 deletions
@@ -44,8 +44,9 @@ DLFW_IMAGE = "urm.nvidia.com/docker/nvidia/pytorch:25.06-py3"
 UBUNTU_22_04_IMAGE = "urm.nvidia.com/docker/ubuntu:22.04"
 UBUNTU_24_04_IMAGE = "urm.nvidia.com/docker/ubuntu:24.04"
 
-POD_TIMEOUT_SECONDS = env.podTimeoutSeconds ? env.podTimeoutSeconds : "21600"
-POD_TIMEOUT_SECONDS_TMP = env.podTimeoutSeconds ? env.podTimeoutSeconds : "43200"
+POD_TIMEOUT_SECONDS_TEST = env.podTimeoutSeconds ? env.podTimeoutSeconds : "21600"
+POD_TIMEOUT_SECONDS_BUILD = env.podTimeoutSeconds ? env.podTimeoutSeconds : "43200"
+POD_TIMEOUT_SECONDS_SLURM = env.podTimeoutSeconds ? env.podTimeoutSeconds : "79200" // Use 22 hours to allow for 2 hour of buffer.
 
 // Literals for easier access.
 @Field
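
Note (not part of the commit): the three constants above only fall back to the fixed defaults when env.podTimeoutSeconds is unset. A minimal plain-Groovy sketch of how those defaults map to hours; the env map here is a hypothetical stand-in for the Jenkins environment.

// Illustrative only: resolve the fallback values and print them in hours.
def env = [podTimeoutSeconds: null]                                           // assume no override is provided
def testTimeout  = env.podTimeoutSeconds ? env.podTimeoutSeconds : "21600"    // 6 hours
def buildTimeout = env.podTimeoutSeconds ? env.podTimeoutSeconds : "43200"    // 12 hours
def slurmTimeout = env.podTimeoutSeconds ? env.podTimeoutSeconds : "79200"    // 22 hours
[testTimeout, buildTimeout, slurmTimeout].each { s ->
    println "${s} seconds = ${(s as int).intdiv(3600)} hours"
}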
@@ -133,7 +134,7 @@ def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String st
 }
 
 //TODO: consolidate slurm related code for both multi nodes and single nodes
-def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jobUID){
+def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jobUID, String slurmOutputFile) {
     withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
         def remote = [
             ip : cluster.ip,
@@ -144,20 +145,50 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
         ]
 
         Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client")
-        pipeline.stage('Clean up SLURM Agent Resources') {
-            Utils.exec(
-                pipeline,
-                timeout: false,
-                script: Utils.sshUserCmd(
-                    remote,
-                    "rm -rf /home/svc_tensorrt/bloom/scripts/${jobUID}"
-                )
-            )
+
+        def slurmJobID = Utils.exec(
+            pipeline,
+            script: Utils.sshUserCmd(
+                remote,
+                "\"sed -n " +
+                "-e 's/.*Submitted batch job \\([0-9]\\+\\).*/\\1/p' " +
+                "-e 's/.*srun: job \\([0-9]\\+\\) queued.*/\\1/p' " +
+                "-e 's/.*srun: job \\([0-9]\\+\\) has been allocated.*/\\1/p' " +
+                "${slurmOutputFile} | tail -n1\""
+            ),
+            returnStdout: true
+        ).trim()
+
+        if (!slurmJobID || !slurmJobID.isNumber()) {
+            Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"cat ${slurmOutputFile}\""))
+            error("Slurm job did not submit successfully. No job ID found.")
         }
+
+        Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID}")
+
+        Utils.exec(pipeline, script: "echo Sleeping to allow slurm job termination; sleep 30")
+
+        Utils.exec(
+            pipeline,
+            script: Utils.sshUserCmd(
+                remote,
+                "\"scancel ${slurmJobID} || true; sacct -j ${slurmJobID} --format=JobID,JobName%100,Partition%15,Account%15,State,ExitCode,NodeList%30 || true; scontrol show job ${slurmJobID} || true\""
+            )
+        )
+
+        Utils.exec(
+            pipeline,
+            script: Utils.sshUserCmd(
+                remote,
+                "rm -rf /home/svc_tensorrt/bloom/scripts/${jobUID}"
+            )
+        )
+
+        Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID} cleaned up")
     }
 }
 
-def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName){
+def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName, String slurmJobID) {
     withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
         def remote = [
             ip : cluster.ip,
@@ -168,17 +199,26 @@ def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName){
         ]
 
         Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client")
-        pipeline.stage('Clean up SLURM Agent Resources') {
-            Utils.exec(
-                pipeline,
-                timeout: false,
-                script: Utils.sshUserCmd(
-                    remote,
-                    "rm -rf /home/svc_tensorrt/bloom/scripts/agent-${nodeName}.jar /home/svc_tensorrt/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh"
-                )
+
+        Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID}")
+
+        Utils.exec(
+            pipeline,
+            script: Utils.sshUserCmd(
+                remote,
+                "\"scancel ${slurmJobID} || true; sacct -j ${slurmJobID} --format=JobID,JobName%100,Partition%15,Account%15,State,ExitCode,NodeList%30 || true; scontrol show job ${slurmJobID} || true\""
             )
-            Utils.exec(pipeline, script: "echo done")
-        }
+        )
+
+        Utils.exec(
+            pipeline,
+            script: Utils.sshUserCmd(
+                remote,
+                "rm -rf /home/svc_tensorrt/bloom/scripts/agent-${nodeName}.jar /home/svc_tensorrt/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh"
+            )
+        )
+
+        Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID} cleaned up")
     }
 }
 
@@ -224,6 +264,8 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
     def customWorkspace = "/tmp/${nodeName}"
     def nodeSecret = CloudManager.createNode(nodeName, customWorkspace)
 
+    def slurmJobID = null
+
     try {
         // Run ssh command to start node in desired cluster via SLURM
         withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
@@ -245,22 +287,47 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
 
                 Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${jenkinsSetupPath} ${remote.user}@${remote.host}:~/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh", numRetries: 3,)
 
-                Utils.exec(
+                Utils.exec(pipeline, script: "cat ${jenkinsSetupPath}")
+
+                def slurmSubmitOutput = Utils.exec(
                     pipeline,
                     timeout: false,
                     script: Utils.sshUserCmd(
-                        remote,
-                        """${SlurmConfig.generateCommand(cluster, partition, nodeSecret, nodeName, Jenkins.instance.rootUrl)}"""
-                    )
+                        remote,
+                        "\"${SlurmConfig.generateCommand(cluster, partition, nodeSecret, nodeName, Jenkins.instance.rootUrl)}\""
+                    ),
+                    returnStdout: true
                 )
+
+                def jobIDs = slurmSubmitOutput
+                    .readLines()
+                    .collect { it.trim() }
+                    .collectMany { line ->
+                        def ids = []
+                        def m1 = (line =~ /Submitted batch job (\d+)/)
+                        if (m1) ids << m1[0][1] // Extract the first captured group
+                        def m2 = (line =~ /srun: job (\d+) (queued|has been allocated)/)
+                        if (m2) ids << m2[0][1] // Extract the first captured group
+                        return ids
+                    }
+
+                slurmJobID = jobIDs ? jobIDs[-1] : null
+
+                if (!slurmJobID || !slurmJobID.isNumber()) {
+                    error("Slurm job did not submit successfully. No job ID found.\nSubmission output:\n${slurmSubmitOutput}")
+                }
+                Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID}")
                 Utils.exec(pipeline, script: "echo Sleeping to allow agent initialization; sleep 30")
             }
         }
 
         stage('Checking if the Node is Online') {
             def counter = 0
-            while (!CloudManager.isNodeOnline(nodeName) && counter < 12) {
-                sleep(time: 10, unit: 'MINUTES') // Wait 10 minutes to check status of the node again
+            // We submit the Slurm job with 5 hours timeout, and the K8S pod will be evicted after 22 hours.
+            // Let's use 15 hours to check if the node is online, and with 2 hours buffer.
+            while (!CloudManager.isNodeOnline(nodeName) && counter < 90) {
+                // Wait 10 minutes to check status of the node again
+                sleep(time: 10, unit: 'MINUTES')
                 counter++
             }
 
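Note (not part of the commit): the job-ID recovery in this hunk is plain Groovy string and regex work, so it can be exercised outside Jenkins. A minimal standalone sketch of the same collectMany-based parsing; the sample srun lines below are made up for illustration, and real submission output varies.

// Standalone sketch of the parsing logic added above, run on sample output.
def slurmSubmitOutput = [
    'srun: job 123456 queued and waiting for resources',
    'srun: job 123456 has been allocated resources'
].join('\n')

def jobIDs = slurmSubmitOutput
    .readLines()
    .collect { it.trim() }
    .collectMany { line ->
        def ids = []
        def m1 = (line =~ /Submitted batch job (\d+)/)
        if (m1) ids << m1[0][1]   // first captured group is the job ID
        def m2 = (line =~ /srun: job (\d+) (queued|has been allocated)/)
        if (m2) ids << m2[0][1]
        return ids
    }

def slurmJobID = jobIDs ? jobIDs[-1] : null   // keep the last ID seen
assert slurmJobID == '123456'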
@@ -291,12 +358,16 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
                 slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, true)
                 executeLLMTestOnSlurm(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, skipInstallWheel, cpver, slurmRunner)
             } else {
-                echo "The node does not come online in 2 hours, terminating the job"
+                error "The Slurm node does not come online in the waiting period. Terminating the job."
             }
         }
     } finally {
-        cleanUpNodeResources(pipeline, cluster, nodeName)
-        CloudManager.destroyNode(nodeName)
+        stage('Clean up SLURM Resources') {
+            Utils.exec(pipeline, script: "echo Sleeping to allow docker stop; sleep 30")
+            CloudManager.destroyNode(nodeName)
+            Utils.exec(pipeline, script: "echo Sleeping to allow node destruction; sleep 30")
+            cleanUpNodeResources(pipeline, cluster, nodeName, slurmJobID)
+        }
     }
 }
 
@@ -315,7 +386,13 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
     SlurmPartition partition = SlurmConfig.partitionConfig[platform] as SlurmPartition
     SlurmCluster cluster = SlurmConfig.clusterConfig[partition.clusterName]
 
-    def jobUID = "${cluster.host}-multi_node_test-${UUID.randomUUID().toString()}"
+    // Create a unique suffix for the job name
+    String customSuffix = "${env.BUILD_TAG}-${UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)}".toLowerCase()
+    def jobUID = "${cluster.host}-multi_node_test-${customSuffix}"
+
+    Utils.exec(pipeline, script: "env | sort && pwd && ls -alh")
+
+    def slurmOutputFile = null
 
     try {
         // Run ssh command to start node in desired cluster via SLURM
@@ -341,7 +418,9 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
             def resourcePathNode = "/tmp"
             def llmSrcNode = "${resourcePathNode}/TensorRT-LLM/src"
             def llmSrcLocal = "${llmPath}/TensorRT-LLM/src"
-            def scriptRunNode = "${jobWorkspace}/slurm_run.sh"
+            def scriptRunNode = "${jobWorkspace}/${jobUID}-slurm_run.sh"
+            def scriptLaunch = "${jobWorkspace}/${jobUID}-slurm_launch.sh"
+            slurmOutputFile = "${jobWorkspace}/${jobUID}-slurm_output.log"
             def testListPathNode = "${jobWorkspace}/${testList}.txt"
             def waivesListPathNode = "${jobWorkspace}/waives.txt"
             def isAarch64 = config.contains("aarch64")
@@ -358,7 +437,10 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
             // Upload slurm_run_sh to Frontend node
             def scriptRunLocalPath = "${llmSrcLocal}/jenkins/scripts/slurm_run.sh"
             Utils.exec(pipeline, script: "chmod +x ${scriptRunLocalPath}", returnStdout: true)
+
             Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${scriptRunLocalPath} ${remote.user}@${remote.host}:${scriptRunNode}", numRetries: 3,)
+            Utils.exec(pipeline, script: "cat ${scriptRunLocalPath}")
+
             // Upload waives.txt to Frontend node
             def waivesListLocalPath = "${llmSrcLocal}/tests/integration/test_lists/waives.txt"
             Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${waivesListLocalPath} ${remote.user}@${remote.host}:${waivesListPathNode}", numRetries: 3,)
@@ -390,7 +472,6 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
                 "--container-env=NVIDIA_IMEX_CHANNELS"
             ].join(" ")
 
-            def scriptLaunch = "/home/svc_tensorrt/bloom/scripts/${jobUID}/slurm_launch.sh"
             def srunCmd = SlurmConfig.generateMultiNodeCommand(partition, taskArgs, scriptRunNode)
             scriptLaunchDestPath = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
             def scriptContent = """#!/bin/bash
@@ -410,27 +491,33 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
                 export MODEL_CACHE_DIR=$MODEL_CACHE_DIR
                 export NVIDIA_IMEX_CHANNELS=0
                 chmod +x ${scriptRunNode}
-                ${srunCmd}
+                ${srunCmd} 2>&1 | tee ${slurmOutputFile}
             """.stripIndent()
             pipeline.writeFile(file: scriptLaunchDestPath, text: scriptContent)
             Utils.exec(pipeline, script: "chmod +x ${scriptLaunchDestPath}", returnStdout: true)
             Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${scriptLaunchDestPath} ${remote.user}@${remote.host}:${scriptLaunch}", numRetries: 3,)
+            Utils.exec(pipeline, script: "cat ${scriptLaunchDestPath}")
         }
+
         stage('Run Test') {
-            def scriptLaunch = "${jobWorkspace}/slurm_launch.sh"
             Utils.exec(
                 pipeline,
                 timeout: false,
                 script: Utils.sshUserCmd(
                     remote,
-                    """bash ${scriptLaunch}"""
+                    "\"bash ${scriptLaunch}\""
                 )
             )
         }
+
+        echo "Finished test stage execution."
     }
 } finally {
     uploadResults(pipeline, cluster, jobUID, stageName)
-    cleanUpNodeResourcesMultiNodes(pipeline, cluster, jobUID)
+
+    stage('Clean up SLURM Resources') {
+        cleanUpNodeResourcesMultiNodes(pipeline, cluster, jobUID, slurmOutputFile)
+    }
 }
 
@@ -559,6 +646,14 @@ def cacheErrorAndUploadResult(stageName, taskRunner, finallyRunner, noResultIfSu
         } else {
             sh 'if [ "$(id -u)" -eq 0 ]; then dmesg; fi'
             if (noResultIfSuccess && !stageIsFailed) {
+                // Clean up the workspace
+                sh """
+                    env | sort
+                    pwd && ls -alh
+                    rm -rf ./*
+                """
+
+                echo "Finished test stage execution."
                 return
             }
             echo "noResultIfSuccess: ${noResultIfSuccess}, stageIsFailed: ${stageIsFailed}"
@@ -579,14 +674,16 @@ def cacheErrorAndUploadResult(stageName, taskRunner, finallyRunner, noResultIfSu
                 "${UPLOAD_PATH}/test-results/"
             )
             junit(testResults: "${stageName}/results*.xml")
-
-            // Clean up the workspace
-            sh """
-                env | sort
-                pwd && ls -alh
-                rm -rf ./*
-            """
         }
+
+        // Clean up the workspace
+        sh """
+            env | sort
+            pwd && ls -alh
+            rm -rf ./*
+        """
+
+        echo "Finished test stage execution."
     }
 }
 
@@ -629,7 +726,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
         containerConfig = """
                 - name: trt-llm
                   image: ${image}
-                  command: ['sleep', ${POD_TIMEOUT_SECONDS}]
+                  command: ['sleep', ${POD_TIMEOUT_SECONDS_SLURM}]
                   tty: true
                   resources:
                     requests:
@@ -647,7 +744,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
         containerConfig = """
                 - name: trt-llm
                   image: ${image}
-                  command: ['sleep', ${POD_TIMEOUT_SECONDS_TMP}]
+                  command: ['sleep', ${POD_TIMEOUT_SECONDS_BUILD}]
                   volumeMounts:
                   - name: sw-tensorrt-pvc
                     mountPath: "/mnt/sw-tensorrt-pvc"
@@ -713,7 +810,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
         containerConfig = """
                 - name: trt-llm
                   image: ${image}
-                  command: ['sleep', ${POD_TIMEOUT_SECONDS}]
+                  command: ['sleep', ${POD_TIMEOUT_SECONDS_TEST}]
                   tty: true
                   resources:
                     requests:
@@ -2153,10 +2250,13 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
             }
             echo "###### Check pip install Start ######"
             withEnv(libEnv) {
+                // Retry 2 times if timeout occurs.
                 sh "env | sort"
-                timeout(time: 30, unit: 'MINUTES') {
-                    checkPipInstall(pipeline, "${cpu_arch}/${wheelPath}")
-                }
+                trtllm_utils.llmRetry(1, "checkPipInstall", {
+                    timeout(time: 30, unit: 'MINUTES') {
+                        checkPipInstall(pipeline, "${cpu_arch}/${wheelPath}")
+                    }
+                })
             }
             echo "###### Run LLMAPI tests Start ######"
             def config = VANILLA_CONFIG
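
Note (not part of the commit): trtllm_utils.llmRetry is a project helper whose implementation is not shown in this diff; only the call shape llmRetry(1, "checkPipInstall", { ... }) appears above. A hypothetical standalone sketch of the intended semantics, where one retry means at most two attempts of the wrapped closure. The helper body below is assumed, not the project's actual code.

// Hypothetical retry helper with semantics similar to llmRetry(1, name, closure).
def llmRetrySketch(int retryTimes, String name, Closure body) {
    int attempt = 0
    while (true) {
        try {
            attempt++
            body()                                      // run the wrapped step
            return
        } catch (Exception e) {
            if (attempt > retryTimes) {                 // out of retries: rethrow
                println "${name} failed after ${attempt} attempts"
                throw e
            }
            println "${name} attempt ${attempt} failed (${e.message}), retrying"
        }
    }
}

// Usage mirroring the diff: one retry around a time-bounded pip install check.
llmRetrySketch(1, "checkPipInstall") {
    // In the pipeline this wraps: timeout(time: 30, unit: 'MINUTES') { checkPipInstall(...) }
    println "running pip install check"
}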
