1- @Library ([' bloom-jenkins-shared-lib@dev-yanchaol-slurm-output '  , ' trtllm-jenkins-shared-lib@main'  ]) _
1+ @Library ([' bloom-jenkins-shared-lib@main '  , ' trtllm-jenkins-shared-lib@main'  ]) _
22
33import  java.lang.InterruptedException 
44import  groovy.transform.Field 
@@ -245,7 +245,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
245245
246246                Utils . exec(pipeline, script : " sshpass -p '${ remote.passwd}  ' scp -r -p ${ COMMON_SSH_OPTIONS}   ${ jenkinsSetupPath}   ${ remote.user}  @${ remote.host}  :~/bloom/scripts/${ nodeName}  -slurm_jenkins_agent_setup.sh"  ,)
247247
248-                 sh( label :  " Print slurm_jenkins_agent_setup.sh script "  , script : " cat ${ jenkinsSetupPath} "  )
248+                 Utils . exec(pipeline , script : " cat ${ jenkinsSetupPath} "  )
249249
250250                Utils . exec(
251251                    pipeline,
@@ -297,8 +297,10 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
297297            }
298298        }
299299    } finally  {
300-         cleanUpNodeResources (pipeline, cluster, nodeName )
300+         Utils . exec (pipeline, script :  " echo Sleeping to allow docker stop; sleep 30 "  )
301301        CloudManager . destroyNode(nodeName)
302+         Utils . exec(pipeline, script : " echo Sleeping to allow node destruction; sleep 30"  )
303+         cleanUpNodeResources(pipeline, cluster, nodeName)
302304    }
303305}
304306
@@ -321,13 +323,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
321323    String  customSuffix =  " ${ env.BUILD_TAG}  -${ UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)} " . toLowerCase()
322324    def  jobUID =  " ${ cluster.host}  -multi_node_test-${ customSuffix} " 
323325
324-     sh(
325-         label : " Print env for debugging"  ,
326-         script : """ 
327-             env | sort 
328-             pwd && ls -alh 
329-         """  
330-     )
326+     Utils . exec(pipeline, script : " env | sort && pwd && ls -alh"  )
331327
332328    try  {
333329        //  Run ssh command to start node in desired cluster via SLURM
@@ -371,7 +367,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
371367                def  scriptRunLocalPath =  " ${ llmSrcLocal}  /jenkins/scripts/slurm_run.sh" 
372368                Utils . exec(pipeline, script : " chmod +x ${ scriptRunLocalPath} "  , returnStdout : true )
373369                Utils . exec(pipeline, script : " sshpass -p '${ remote.passwd}  ' scp -r -p ${ COMMON_SSH_OPTIONS}   ${ scriptRunLocalPath}   ${ remote.user}  @${ remote.host}  :${ scriptRunNode} "  ,)
374-                 sh( label :  " Print slurm_run.sh script "  , script : " cat ${ scriptRunLocalPath} "  )
370+                 Utils . exec(pipeline , script : " cat ${ scriptRunLocalPath} "  )
375371
376372                //  Upload waives.txt to Frontend node
377373                def  waivesListLocalPath =  " ${ llmSrcLocal}  /tests/integration/test_lists/waives.txt" 
@@ -429,7 +425,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
429425                pipeline. writeFile(file : scriptLaunchDestPath, text : scriptContent)
430426                Utils . exec(pipeline, script : " chmod +x ${ scriptLaunchDestPath} "  , returnStdout : true )
431427                Utils . exec(pipeline, script : " sshpass -p '${ remote.passwd}  ' scp -r -p ${ COMMON_SSH_OPTIONS}   ${ scriptLaunchDestPath}   ${ remote.user}  @${ remote.host}  :${ scriptLaunch} "  ,)
432-                 sh( label :  " Print slurm_launch.sh script "  , script : " cat ${ scriptLaunchDestPath} "  )
428+                 Utils . exec(pipeline , script : " cat ${ scriptLaunchDestPath} "  )
433429            }
434430            stage(' Run Test'  ) {
435431                def  scriptLaunch =  " ${ jobWorkspace}  /slurm_launch.sh" 
@@ -2169,10 +2165,19 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
21692165                            libEnv + =  [" LD_LIBRARY_PATH+nvrtc=/usr/local/lib/python${ pyver}  /dist-packages/nvidia/cuda_nvrtc/lib"  ]
21702166                        }
21712167                        echo " ###### Check pip install Start ######" 
2168+                         def  sleepTime =  20 
21722169                        withEnv(libEnv) {
2173-                             sh " env | sort" 
2174-                             timeout(time : 30 , unit : ' MINUTES'  ) {
2175-                                 checkPipInstall(pipeline, " ${ cpu_arch}  /${ wheelPath} "  )
2170+                             retry(1 ) {
2171+                                 try  {
2172+                                     sh " env | sort" 
2173+                                     timeout(time : 30 , unit : ' MINUTES'  ) {
2174+                                         sleep(sleepTime *  60 )
2175+                                         sleepTime =  1 
2176+                                         checkPipInstall(pipeline, " ${ cpu_arch}  /${ wheelPath} "  )
2177+                                     }
2178+                                 } catch  (org.jenkinsci.plugins.workflow.steps.TimeoutStepExecution.ExceededTimeout  e) {
2179+                                     error " Timeout occurred, retrying..." 
2180+                                 }
21762181                            }
21772182                        }
21782183                        echo " ###### Run LLMAPI tests Start ######" 
0 commit comments