@@ -7,7 +7,6 @@ import groovy.json.JsonOutput
 import com.nvidia.bloom.KubernetesManager
 import com.nvidia.bloom.Constants
 import com.nvidia.bloom.CloudManager
-import com.nvidia.bloom.KubernetesManager
 import com.nvidia.bloom.SlurmConfig
 import com.nvidia.bloom.SlurmCluster
 import com.nvidia.bloom.SlurmPartition
@@ -219,8 +218,11 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
     SlurmPartition partition = SlurmConfig.partitionConfig[platform] as SlurmPartition
     SlurmCluster cluster = SlurmConfig.clusterConfig[partition.clusterName]

-    def nodeName = "${cluster.host}-test-${UUID.randomUUID().toString()}"
-    def nodeSecret = CloudManager.createNode(nodeName)
+    // Create a unique suffix for the node name and workspace
+    String customSuffix = "${env.BUILD_TAG}-${UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)}".toLowerCase()
+    def nodeName = "${cluster.host}-test-${customSuffix}"
+    def customWorkspace = "/tmp/${nodeName}"
+    def nodeSecret = CloudManager.createNode(nodeName, customWorkspace)

     try {
         // Run ssh command to start node in desired cluster via SLURM
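As a side note, the node-name scheme introduced above can be sketched standalone; `buildTag` and the `cluster-a` host below are stand-ins for `env.BUILD_TAG` and `cluster.host`:

```groovy
// Sketch of the unique node-name/workspace scheme above.
// "jenkins-LLM-123" and "cluster-a" are stand-ins for env.BUILD_TAG and cluster.host.
def buildTag = "jenkins-LLM-123"
String customSuffix = "${buildTag}-${UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)}".toLowerCase()
def nodeName = "cluster-a-test-${customSuffix}"
def customWorkspace = "/tmp/${nodeName}"
assert customSuffix ==~ /jenkins-llm-123-[0-9a-f]{6}/   // six hex chars from the UUID
println "${nodeName} -> ${customWorkspace}"
```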
@@ -263,12 +265,30 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
     }

     if (CloudManager.isNodeOnline(nodeName)) {
-        def dockerArgs = "--gpus ${gpuCount} --cap-add=SYS_ADMIN --ipc=host --security-opt seccomp=unconfined -u root:root -v /home/scratch.trt_llm_data:/scratch.trt_llm_data:ro -v /tmp/ccache:${CCACHE_DIR}:rw -v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw --cap-add syslog"
+        node(nodeName) {
+            sh """
+                env | sort
+                pwd && ls -alh
+                ls -alh ${env.WORKSPACE}
+                ls -alh ${env.WORKSPACE_TMP}
+            """
+        }
+
+        def dockerArgs = "--gpus ${gpuCount} " +
+            "--cap-add=SYS_ADMIN " +
+            "--ipc=host " +
+            "--security-opt seccomp=unconfined " +
+            "-u root:root " +
+            "-v /home/scratch.trt_llm_data:/scratch.trt_llm_data:ro " +
+            "-v /tmp/ccache:${CCACHE_DIR}:rw " +
+            "-v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw " +
+            "--cap-add syslog"

         if (partition.clusterName == "dlcluster") {
             dockerArgs += " -e NVIDIA_IMEX_CHANNELS=0"
         }
-        slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, false)
+
+        slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, true)
         executeLLMTestOnSlurm(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, skipInstallWheel, cpver, slurmRunner)
     } else {
         echo "The node does not come online in 2 hours, terminating the job"
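The multi-line concatenation introduced above is purely cosmetic; it still collapses to the single flat argument string the removed line contained. A quick standalone check, with stand-in values for `gpuCount` and `CCACHE_DIR`:

```groovy
// Quick check that the concatenated form yields one flat argument string.
def gpuCount = 4                   // stand-in value
def CCACHE_DIR = "/root/.ccache"   // stand-in for the pipeline constant
def dockerArgs = "--gpus ${gpuCount} " +
    "--cap-add=SYS_ADMIN " +
    "--ipc=host " +
    "--security-opt seccomp=unconfined " +
    "-u root:root " +
    "-v /home/scratch.trt_llm_data:/scratch.trt_llm_data:ro " +
    "-v /tmp/ccache:${CCACHE_DIR}:rw " +
    "-v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw " +
    "--cap-add syslog"
assert dockerArgs.startsWith("--gpus 4 --cap-add=SYS_ADMIN --ipc=host")
assert !dockerArgs.contains("\n")
```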
@@ -560,6 +580,13 @@ def cacheErrorAndUploadResult(stageName, taskRunner, finallyRunner, noResultIfSu
                 "${UPLOAD_PATH}/test-results/"
             )
             junit(testResults: "${stageName}/results*.xml")
+
+            // Clean up the workspace
+            sh """
+                env | sort
+                pwd && ls -alh
+                rm -rf ./*
+            """
         }
     }
 }
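One caveat on the cleanup block above: the `./*` glob does not match hidden entries, so dotfiles such as `.git` survive the `rm`. A hypothetical variant (not part of this change) that also clears them:

```groovy
// Hypothetical cleanup variant: the ./.[!.]* glob additionally matches
// hidden entries (except "." and ".."), which plain ./* skips.
sh """
    pwd && ls -alh
    rm -rf ./* ./.[!.]* || true
"""
```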
@@ -796,7 +823,7 @@ def echoNodeAndGpuInfo(pipeline, stageName)

 def runLLMDocBuild(pipeline, config)
 {
-    // Step 1: cloning tekit source code
+    // Step 1: cloning source code
     sh "pwd && ls -alh"
     sh "env | sort"
     // allow to checkout from forked repo, svc_tensorrt needs to have access to the repo, otherwise clone will fail
@@ -1241,13 +1268,16 @@ def rerunFailedTests(stageName, llmSrc, testCmdLine) {

 def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CONFIG, perfMode=false, stageName="Undefined", splitId=1, splits=1, skipInstallWheel=false, cpver="cp312")
 {
-    // Step 1: create LLM_ROOT dir
-    sh "pwd && ls -alh"
-    // TODO: proper way to clean workspace, maybe save in a folder named with BUILD_ID.
-    // So that it can work with multiple job running in same node
-    sh "rm -rf ./*"
+    // Step 1: create LLM_ROOT dir and clean up the workspace
     def llmRootConfig = "${LLM_ROOT}${config}"
-    sh "mkdir ${llmRootConfig}"
+    sh """
+        env | sort
+        pwd && ls -alh
+        rm -rf ./*
+        mkdir ${llmRootConfig}
+        ls -alh ${env.WORKSPACE}
+        ls -alh ${env.WORKSPACE_TMP}
+    """

     def llmPath = sh(script: "realpath ${llmRootConfig}", returnStdout: true).trim()
     def llmSrc = "${llmPath}/TensorRT-LLM/src"
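Worth noting on the consolidated block above: Jenkins runs `sh` step scripts with `-xe` by default, so the combined heredoc still fails fast at the first failing command, matching the separate `sh` calls it replaces. A minimal sketch, with `my-root` as a stand-in directory:

```groovy
// Minimal sketch: a multi-line sh step aborts at the first failing command,
// because Jenkins invokes the script with sh -xe by default.
sh """
    pwd && ls -alh
    rm -rf ./*
    mkdir my-root      # if mkdir fails, the whole step fails here
    ls -alh my-root
"""
```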
@@ -1890,12 +1920,8 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
     fullSet += SBSATestConfigs.keySet()

     SBSASlurmTestConfigs = [
-        "GB200-PyTorch-1": ["gb200-unrestricted", "l0_gb200", 1, 3],
-        "GB200-PyTorch-2": ["gb200-unrestricted", "l0_gb200", 2, 3],
-        "GB200-PyTorch-3": ["gb200-unrestricted", "l0_gb200", 3, 3],
-        "GB200-TensorRT-1": ["gb200-unrestricted", "l0_gb200", 1, 2],
-        "GB200-TensorRT-2": ["gb200-unrestricted", "l0_gb200", 2, 2],
-        "GB200-Triton-Post-Merge-1": ["gb200-unrestricted", "l0_gb200", 1, 1],
+        // Not used in the pipeline now
+        // "GB200-PyTorch-1": ["gb200-single", "l0_gb200", 1, 3],
         "GB200-4_GPUs-PyTorch-1": ["gb200-x4", "l0_gb200_multi_gpus", 1, 1, 4],
         "GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4", "l0_gb200_multi_gpus", 1, 1, 4],
     ]
@@ -1909,7 +1935,6 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
         "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-4": ["gb200-multi-node", "l0_gb200_multi_nodes", 4, 7, 8, 2],
         "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-5": ["gb200-multi-node", "l0_gb200_multi_nodes", 5, 7, 8, 2],
         "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-6": ["gb200-multi-node", "l0_gb200_multi_nodes", 6, 7, 8, 2],
-        "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-7": ["gb200-multi-node", "l0_gb200_multi_nodes", 7, 7, 8, 2],
     ]
     fullSet += multiNodesSBSAConfigs.keySet()
@@ -2129,7 +2154,9 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
                 echo "###### Check pip install Start ######"
                 withEnv(libEnv) {
                     sh "env | sort"
-                    checkPipInstall(pipeline, "${cpu_arch}/${wheelPath}")
+                    timeout(time: 1, unit: 'HOURS') {
+                        checkPipInstall(pipeline, "${cpu_arch}/${wheelPath}")
+                    }
                 }
                 echo "###### Run LLMAPI tests Start ######"
                 def config = VANILLA_CONFIG
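For context, `timeout` here is the standard Jenkins Pipeline step: it aborts the enclosed body once the limit elapses. A minimal sketch of the pattern, with a stand-in body:

```groovy
// Minimal sketch of the timeout pattern above; the body is aborted
// (FlowInterruptedException) if it runs past the limit.
timeout(time: 1, unit: 'HOURS') {
    sh "pip install --no-cache-dir ./build/tensorrt_llm-*.whl"  // stand-in for checkPipInstall()
}
```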
@@ -2464,7 +2491,7 @@ pipeline {

         def testPhase2StageName = env.testPhase2StageName
         if (testPhase2StageName) {
-            def dgxSigns = ["DGX_H100", "DGX_H200", "GB200-4_GPUs", "GB200-8_GPUs", "DGX_B200", "RTXPro6000-4_GPUs"]
+            def dgxSigns = ["2_GPUs", "4_GPUs", "8_GPUs"]
             singleGpuJobs = parallelJobs.findAll{!dgxSigns.any {sign -> it.key.contains(sign)}}
             dgxJobs = parallelJobs.findAll{dgxSigns.any {sign -> it.key.contains(sign)}}
         }
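The broadened `dgxSigns` list switches the phase-2 split from hardware-specific names to GPU-count substrings, so multi-GPU stages are caught regardless of platform. A toy illustration of the filter, with invented stage names:

```groovy
// Toy illustration of the single-GPU / multi-GPU split above; stage names are invented.
def dgxSigns = ["2_GPUs", "4_GPUs", "8_GPUs"]
def parallelJobs = [
    "A100X-PyTorch-1"          : { println "single GPU" },
    "GB200-4_GPUs-PyTorch-1"   : { println "multi GPU" },
    "DGX_H100-2_GPUs-PyTorch-1": { println "multi GPU" },
]
def singleGpuJobs = parallelJobs.findAll { !dgxSigns.any { sign -> it.key.contains(sign) } }
def dgxJobs = parallelJobs.findAll { dgxSigns.any { sign -> it.key.contains(sign) } }
assert singleGpuJobs.keySet() == ["A100X-PyTorch-1"] as Set
assert dgxJobs.size() == 2
```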