@@ -7,7 +7,6 @@ import groovy.json.JsonOutput
77import  com.nvidia.bloom.KubernetesManager 
88import  com.nvidia.bloom.Constants 
99import  com.nvidia.bloom.CloudManager 
10- import  com.nvidia.bloom.KubernetesManager 
1110import  com.nvidia.bloom.SlurmConfig 
1211import  com.nvidia.bloom.SlurmCluster 
1312import  com.nvidia.bloom.SlurmPartition 
@@ -211,6 +210,13 @@ def executeLLMTestOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
211210            sh " cp ${ llmSrc} ${ stageName} " 
212211            sh " ls ${ stageName} " 
213212        })
213+ 
214+         //  Clean up the workspace
215+         sh """ 
216+             env | sort 
217+             pwd && ls -alh 
218+             rm -rf ./* 
219+         """  
214220    }
215221}
216222
@@ -219,8 +225,11 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
219225    SlurmPartition  partition =  SlurmConfig . partitionConfig[platform] as  SlurmPartition 
220226    SlurmCluster  cluster =  SlurmConfig . clusterConfig[partition. clusterName]
221227
222-     def  nodeName =  " ${ cluster.host} ${ UUID.randomUUID().toString()} " 
223-     def  nodeSecret =  CloudManager . createNode(nodeName)
228+     //  Create a unique suffix for the node name and workspace
229+     String  customSuffix =  " ${ env.BUILD_TAG} ${ UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)} " . toLowerCase()
230+     def  nodeName =  " ${ cluster.host} ${ customSuffix} " 
231+     def  customWorkspace =  " /tmp/${ nodeName} " 
232+     def  nodeSecret =  CloudManager . createNode(nodeName, customWorkspace)
224233
225234    try  {
226235        //  Run ssh command to start node in desired cluster via SLURM
@@ -263,12 +272,30 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
263272            }
264273
265274            if  (CloudManager . isNodeOnline(nodeName)) {
266-                 def  dockerArgs =  " --gpus ${ gpuCount} ${ CCACHE_DIR} " 
275+                 node(nodeName) {
276+                     sh """ 
277+                         env | sort 
278+                         pwd && ls -alh 
279+                         ls -alh ${ env.WORKSPACE}  
280+                         ls -alh ${ env.WORKSPACE_TMP}  
281+                     """  
282+                 }
283+ 
284+                 def  dockerArgs =  " --gpus ${ gpuCount} " + 
285+                     " --cap-add=SYS_ADMIN " + 
286+                     " --ipc=host " + 
287+                     " --security-opt seccomp=unconfined " + 
288+                     " -u root:root " + 
289+                     " -v /home/scratch.trt_llm_data:/scratch.trt_llm_data:ro " + 
290+                     " -v /tmp/ccache:${ CCACHE_DIR} " + 
291+                     " -v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw " + 
292+                     " --cap-add syslog" 
267293
268294                if  (partition. clusterName ==  " dlcluster" 
269295                    dockerArgs + =  "  -e NVIDIA_IMEX_CHANNELS=0" 
270296                }
271-                 slurmRunner =  runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE , nodeName, dockerArgs, false )
297+ 
298+                 slurmRunner =  runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE , nodeName, dockerArgs, true )
272299                executeLLMTestOnSlurm(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, skipInstallWheel, cpver, slurmRunner)
273300            } else  {
274301                echo " The node does not come online in 2 hours, terminating the job" 
@@ -560,6 +587,13 @@ def cacheErrorAndUploadResult(stageName, taskRunner, finallyRunner, noResultIfSu
560587                " ${ UPLOAD_PATH} " 
561588            )
562589            junit(testResults : " ${ stageName} " 
590+ 
591+             //  Clean up the workspace
592+             sh """ 
593+                 env | sort 
594+                 pwd && ls -alh 
595+                 rm -rf ./* 
596+             """  
563597        }
564598    }
565599}
@@ -796,7 +830,7 @@ def echoNodeAndGpuInfo(pipeline, stageName)
796830
797831def  runLLMDocBuild (pipeline , config )
798832{
799-     //  Step 1: cloning tekit  source code
833+     //  Step 1: cloning source code
800834    sh " pwd && ls -alh" 
801835    sh " env | sort" 
802836    //  allow to checkout from forked repo, svc_tensorrt needs to have access to the repo, otherwise clone will fail
@@ -1241,13 +1275,16 @@ def rerunFailedTests(stageName, llmSrc, testCmdLine) {
12411275
12421276def  runLLMTestlistOnPlatformImpl (pipeline , platform , testList , config = VANILLA_CONFIG , perfMode = false , stageName = " Undefined" splitId = 1 , splits = 1 , skipInstallWheel = false , cpver = " cp312" 
12431277{
1244-     //  Step 1: create LLM_ROOT dir
1245-     sh " pwd && ls -alh" 
1246-     //  TODO: proper way to clean workspace, maybe save in a folder named with BUILD_ID.
1247-     //  So that it can work with multiple job running in same node
1248-     sh " rm -rf ./*" 
1278+     //  Step 1: create LLM_ROOT dir and clean up the workspace
12491279    def  llmRootConfig =  " ${ LLM_ROOT}${ config} " 
1250-     sh " mkdir ${ llmRootConfig} " 
1280+     sh """ 
1281+         env | sort 
1282+         pwd && ls -alh 
1283+         rm -rf ./* 
1284+         mkdir ${ llmRootConfig}  
1285+         ls -alh ${ env.WORKSPACE}  
1286+         ls -alh ${ env.WORKSPACE_TMP}  
1287+     """  
12511288
12521289    def  llmPath =  sh (script : " realpath ${ llmRootConfig} " returnStdout : true ). trim()
12531290    def  llmSrc =  " ${ llmPath} " 
@@ -1562,6 +1599,13 @@ def runLLMTestlistOnPlatform(pipeline, platform, testList, config=VANILLA_CONFIG
15621599        sh " cp ${ llmSrc} ${ stageName} " 
15631600        sh " ls ${ stageName} " 
15641601    })
1602+ 
1603+     //  Clean up the workspace
1604+     sh """ 
1605+         env | sort 
1606+         pwd && ls -alh 
1607+         rm -rf ./* 
1608+     """  
15651609}
15661610
15671611
@@ -1890,12 +1934,8 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
18901934    fullSet + =  SBSATestConfigs . keySet()
18911935
18921936    SBSASlurmTestConfigs  =  [
1893-         " GB200-PyTorch-1" " gb200-unrestricted" " l0_gb200" 1 , 3 ],
1894-         " GB200-PyTorch-2" " gb200-unrestricted" " l0_gb200" 2 , 3 ],
1895-         " GB200-PyTorch-3" " gb200-unrestricted" " l0_gb200" 3 , 3 ],
1896-         " GB200-TensorRT-1" " gb200-unrestricted" " l0_gb200" 1 , 2 ],
1897-         " GB200-TensorRT-2" " gb200-unrestricted" " l0_gb200" 2 , 2 ],
1898-         " GB200-Triton-Post-Merge-1" " gb200-unrestricted" " l0_gb200" 1 , 1 ],
1937+         //  Not used in the pipeline now
1938+         //  "GB200-PyTorch-1": ["gb200-single", "l0_gb200", 1, 3],
18991939        " GB200-4_GPUs-PyTorch-1" " gb200-x4" " l0_gb200_multi_gpus" 1 , 1 , 4 ],
19001940        " GB200-4_GPUs-PyTorch-Post-Merge-1" " gb200-x4" " l0_gb200_multi_gpus" 1 , 1 , 4 ],
19011941    ]
@@ -1909,7 +1949,6 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
19091949        " GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-4" " gb200-multi-node" " l0_gb200_multi_nodes" 4 , 7 , 8 , 2 ],
19101950        " GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-5" " gb200-multi-node" " l0_gb200_multi_nodes" 5 , 7 , 8 , 2 ],
19111951        " GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-6" " gb200-multi-node" " l0_gb200_multi_nodes" 6 , 7 , 8 , 2 ],
1912-         " GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-7" " gb200-multi-node" " l0_gb200_multi_nodes" 7 , 7 , 8 , 2 ],
19131952    ]
19141953    fullSet + =  multiNodesSBSAConfigs. keySet()
19151954
@@ -2129,7 +2168,9 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
21292168                        echo " ###### Check pip install Start ######" 
21302169                        withEnv(libEnv) {
21312170                            sh " env | sort" 
2132-                             checkPipInstall(pipeline, " ${ cpu_arch} ${ wheelPath} " 
2171+                             timeout(time : 1 , unit : ' HOURS' 
2172+                                 checkPipInstall(pipeline, " ${ cpu_arch} ${ wheelPath} " 
2173+                             }
21332174                        }
21342175                        echo " ###### Run LLMAPI tests Start ######" 
21352176                        def  config =  VANILLA_CONFIG 
@@ -2464,7 +2505,7 @@ pipeline {
24642505
24652506                    def  testPhase2StageName =  env. testPhase2StageName
24662507                    if  (testPhase2StageName) {
2467-                         def  dgxSigns =  [" DGX_H100 " " DGX_H200 " ,  " GB200- 4_GPUs" " GB200- 8_GPUs" ,  " DGX_B200 " ,  " RTXPro6000-4_GPUs " 
2508+                         def  dgxSigns =  [" 2_GPUs " " 4_GPUs" " 8_GPUs" 
24682509                        singleGpuJobs =  parallelJobs. findAll{!dgxSigns .any {sign  ->  it. key. contains(sign)}}
24692510                        dgxJobs =  parallelJobs. findAll{dgxSigns .any {sign  ->  it. key. contains(sign)}}
24702511                    }
0 commit comments