Commit 96d75d8

[None][ci] Remove some stale CI codes

Signed-off-by: Yanchao Lu <[email protected]>
Parent: b0558c7

File tree: 3 files changed (+25, -43 lines)

jenkins/L0_Test.groovy

Lines changed: 23 additions & 41 deletions
@@ -154,6 +154,8 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
         "-e 's/.*Submitted batch job \\([0-9]\\+\\).*/\\1/p' " +
         "-e 's/.*srun: job \\([0-9]\\+\\) queued.*/\\1/p' " +
         "-e 's/.*srun: job \\([0-9]\\+\\) has been allocated.*/\\1/p' " +
+        "-e 's/.*SLURM_JOB_ID=\\([0-9]\\+\\).*/\\1/p' " +
+        "-e 's/.*SLURM_JOBID=\\([0-9]\\+\\).*/\\1/p' " +
         "${slurmOutputFile} | tail -n1 || true\""
     ),
     returnStdout: true
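
The two added sed expressions extend the job-ID scrape to Slurm output that only reports the ID through SLURM_JOB_ID or SLURM_JOBID environment dumps. A minimal Groovy sketch of how the scrape fits together; the output path here is hypothetical, and the real pipeline runs this command over SSH via Utils.sshUserCmd and reads stdout:

// Sketch only: assembling the extended sed scrape.
// "tail -n1" keeps only the last printed ID; "|| true" avoids a hard failure
// when the output file has not been created yet.
def slurmOutputFile = "/tmp/slurm-%j-demo.out"   // hypothetical path
def scrape = "sed -n " +
    "-e 's/.*Submitted batch job \\([0-9]\\+\\).*/\\1/p' " +
    "-e 's/.*SLURM_JOB_ID=\\([0-9]\\+\\).*/\\1/p' " +
    "-e 's/.*SLURM_JOBID=\\([0-9]\\+\\).*/\\1/p' " +
    "${slurmOutputFile} | tail -n1 || true"
println scrape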
@@ -183,6 +185,7 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
         Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"cat ${slurmOutputFile} || true\""))
         echo "Slurm job did not submit successfully. No job ID found."
     } else {
+        // The original Slurm output file name is like "slurm-%j-*.out", we need to replace the %j with the real job ID.
         def newSlurmOutputFile = slurmOutputFile.replace("%j", slurmJobID)
         Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"mv ${slurmOutputFile} ${newSlurmOutputFile} || true\""))
     }
@@ -317,6 +320,10 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
     if (m1) ids << m1[0][1] // Extract the first captured group
     def m2 = (line =~ /srun: job (\d+) (queued|has been allocated)/)
     if (m2) ids << m2[0][1] // Extract the first captured group
+    def m3 = (line =~ /SLURM_JOB_ID=(\d+)/)
+    if (m3) ids << m3[0][1] // Extract the first captured group
+    def m4 = (line =~ /SLURM_JOBID=(\d+)/)
+    if (m4) ids << m4[0][1] // Extract the first captured group
     return ids
 }

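For reference, the Groovy side now recognizes the same four line formats as the sed scrape above. A self-contained sketch of the matching logic; the function name is hypothetical, since the diff does not show the enclosing declaration:

// Sketch (hypothetical helper name): collect every Slurm job ID mentioned in
// a line of scheduler output, covering sbatch, srun, and env-dump formats.
def extractJobIDs(String line) {
    def ids = []
    def m1 = (line =~ /Submitted batch job (\d+)/)
    if (m1) ids << m1[0][1]
    def m2 = (line =~ /srun: job (\d+) (queued|has been allocated)/)
    if (m2) ids << m2[0][1]
    def m3 = (line =~ /SLURM_JOB_ID=(\d+)/)
    if (m3) ids << m3[0][1]
    def m4 = (line =~ /SLURM_JOBID=(\d+)/)
    if (m4) ids << m4[0][1]
    return ids
}

assert extractJobIDs("srun: job 12345 queued and waiting for resources") == ["12345"]
assert extractJobIDs("SLURM_JOBID=67890") == ["67890"]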
@@ -370,12 +377,6 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
                 error "The Slurm node does not come online in the waiting period. Terminating the job."
             }
         }
-    } catch (Exception e) {
-        if (e.getMessage()?.contains("Failed to kill container")) {
-            echo "Known benign error ignored: ${e.getMessage()}"
-        } else {
-            throw e // Re-throw if it's a different IOException
-        }
     } finally {
         stage("Clean up SLURM Resources") {
             // Workaround to handle the interruption during clean up SLURM resources
@@ -1013,7 +1014,7 @@ def launchTestListCheck(pipeline)
     trtllm_utils.llmExecStepWithRetry(pipeline, script: """apt-get update && apt-get install \
         libffi-dev \
         -y""")
-    sh "nvidia-smi -q"
+    sh "nvidia-smi && nvidia-smi -q && nvidia-smi topo -m"
     // download TRT-LLM tarfile
     def tarName = BUILD_CONFIGS[VANILLA_CONFIG][TARNAME]
     def llmTarfile = "https://urm.nvidia.com/artifactory/${ARTIFACT_PATH}/${tarName}"
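
This same replacement, one chained nvidia-smi invocation instead of a bare nvidia-smi -q, recurs in the hunks below and in slurm_run.sh. With && chaining the stage fails at the first probe that errors, and the topology matrix is now captured everywhere. A sketch of the pattern as a shared step; the helper name is hypothetical, as the commit inlines the command at each call site:

// Hypothetical helper wrapping the repeated GPU diagnostics. "&&" stops at
// the first failing probe, so a broken driver or missing GPU fails the stage
// immediately rather than partway through the diagnostics.
def dumpGpuDiagnostics() {
    sh "nvidia-smi && nvidia-smi -q && nvidia-smi topo -m"
}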
@@ -1421,8 +1422,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
     sh "nproc && free -g && hostname"
     echoNodeAndGpuInfo(pipeline, stageName)
     sh "cat ${MODEL_CACHE_DIR}/README"
-    sh "nvidia-smi -q"
-    sh "nvidia-smi topo -m"
+    sh "nvidia-smi && nvidia-smi -q && nvidia-smi topo -m"
     sh "df -h"

     // setup HF_HOME to cache model and datasets
@@ -1798,7 +1798,7 @@ def runPackageSanityCheck(pipeline, wheel_path, reinstall_dependencies=false, cp
     sh "nproc && free -g && hostname"
     sh "bash -c 'pip3 show tensorrt || true'"
     sh "cat ${MODEL_CACHE_DIR}/README"
-    sh "nvidia-smi -q"
+    sh "nvidia-smi && nvidia-smi -q && nvidia-smi topo -m"

     sh "pwd && ls -alh"
     trtllm_utils.llmExecStepWithRetry(pipeline, script: "wget -nv ${whlUrl}")
@@ -1849,7 +1849,6 @@ def checkStageName(stageNames) {
     }
 }

-// TODO: Update existing functions to use runInDockerOnNodeMultiStage and get rid of runInDockerOnNode
 def runInDockerOnNodeMultiStage(image, label, dockerArgs, needToDeleteDir=true)
 {
     return {
@@ -1863,18 +1862,11 @@ def runInDockerOnNodeMultiStage(image, label, dockerArgs, needToDeleteDir=true)
                 docker.image(image).inside(dockerArgs) {
                     runner()
                 }
-            }
-        }
-    }
-}
-
-def runInDockerOnNode(image, label, dockerArgs)
-{
-    return {
-        stageName, runner -> stage(stageName) {
-            node(label) {
-                deleteDir()
-                docker.image(image).inside(dockerArgs) {
-                    runner()
+                catch (Exception e) {
+                    if (e.getMessage()?.contains("Failed to kill container")) {
+                        echo "Known benign error ignored: ${e.getMessage()}"
+                    } else {
+                        throw e // Re-throw if it's a different IOException
                     }
                 }
             }
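
Net effect of this hunk plus the deletion in the -370,12 hunk above: the "Failed to kill container" workaround moves out of runLLMTestlistOnSlurm and into runInDockerOnNodeMultiStage, and the single-stage runInDockerOnNode variant is removed outright, resolving the deleted TODO. A minimal sketch of the retained pattern, with a hypothetical wrapper name:

// Hypothetical wrapper isolating the retained workaround: Jenkins sometimes
// reports "Failed to kill container" during docker.image(...).inside teardown
// even when the work inside completed; only that message is swallowed.
def runIgnoringBenignKillError(Closure body) {
    try {
        body()
    } catch (Exception e) {
        if (e.getMessage()?.contains("Failed to kill container")) {
            echo "Known benign error ignored: ${e.getMessage()}"
        } else {
            throw e // anything else still fails the stage
        }
    }
}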
@@ -1893,10 +1885,8 @@ def runInKubernetes(pipeline, podSpec, containerName)
     }
 }

-def launchTestJobs(pipeline, testFilter, dockerNode=null)
+def launchTestJobs(pipeline, testFilter)
 {
-    def dockerArgs = "-v /mnt/scratch.trt_llm_data:/scratch.trt_llm_data:ro -v /tmp/ccache:${CCACHE_DIR}:rw -v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw --cap-add syslog"
-
     // IMPORTANT: Stage Configuration Syntax Requirement
     //
     // The test_to_stage_mapping.py script expects stage definitions in the following format:
@@ -2044,7 +2034,9 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
     fullSet += SBSATestConfigs.keySet()

     SBSASlurmTestConfigs = [
-        "GB200-PyTorch-1": ["gb200-single", "l0_gb200", 1, 1],
+        // Should use "gb200-single" instead of "gb200-x4" for single GPU testing
+        "GB200-PyTorch-1": ["gb200-single", "l0_gb200", 1, 2],
+        "GB200-PyTorch-2": ["gb200-x4", "l0_gb200", 2, 2],
         "GB200-4_GPUs-PyTorch-1": ["gb200-x4", "l0_gb200_multi_gpus", 1, 1, 4],
         "GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4", "l0_gb200_multi_gpus", 1, 1, 4],
     ]
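
Reading the stage names, each value appears to be [Slurm queue label, test-db list, split index, split count, optional GPU count], so l0_gb200 is now sharded two ways: shard 1 on the single-GPU queue and shard 2 on the 4-GPU queue. That field layout is inferred from the surrounding entries, not stated by the diff. A hypothetical unpacking sketch:

// Hypothetical unpacking of the config entries; field meanings are inferred,
// not confirmed by this commit.
def configs = [
    "GB200-PyTorch-1": ["gb200-single", "l0_gb200", 1, 2],
    "GB200-PyTorch-2": ["gb200-x4", "l0_gb200", 2, 2],
    "GB200-4_GPUs-PyTorch-1": ["gb200-x4", "l0_gb200_multi_gpus", 1, 1, 4],
]
configs.each { stageName, v ->
    def (queue, testList, splitId, splitCount) = v
    def gpuCount = v.size() > 4 ? v[4] : 1   // assumed default of 1 GPU
    println "${stageName}: ${testList} shard ${splitId}/${splitCount} on ${queue} with ${gpuCount} GPU(s)"
}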
@@ -2198,12 +2190,9 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
     def buildRunner = runInKubernetes(pipeline, buildSpec, "trt-llm")
     def sanityRunner = null

-    if (dockerNode) {
-        sanityRunner = runInDockerOnNode(values[0], dockerNode, dockerArgs)
-    } else {
-        def sanitySpec = createKubernetesPodConfig(values[0], gpu_type, k8s_arch)
-        sanityRunner = runInKubernetes(pipeline, sanitySpec, "trt-llm")
-    }
+
+    def sanitySpec = createKubernetesPodConfig(values[0], gpu_type, k8s_arch)
+    sanityRunner = runInKubernetes(pipeline, sanitySpec, "trt-llm")

     def wheelPath = "${values[4]}"
     def wheelName = ""
@@ -2447,17 +2436,10 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
     stage("Skip - reused") {
         echo "Skip - Passed in the last pipeline."
     }
-} else if (values instanceof List && dockerNode == null) {
+} else if (values instanceof List) {
     trtllm_utils.launchKubernetesPod(pipeline, values[0], "trt-llm", {
         values[1]()
     })
-} else if (values instanceof List && dockerNode != null) {
-    node(dockerNode) {
-        deleteDir()
-        docker.image(LLM_DOCKER_IMAGE).inside(dockerArgs) {
-            values[1]()
-        }
-    }
 } else {
     values()
 }

jenkins/scripts/slurm_run.sh

Lines changed: 1 addition & 1 deletion
@@ -22,7 +22,7 @@ if [ $SLURM_LOCALID -eq 0 ]; then
     which python3
     python3 --version
     apt-get install -y libffi-dev
-    nvidia-smi
+    nvidia-smi && nvidia-smi -q && nvidia-smi topo -m
    cd $llmSrcNode && pip3 install --retries 1 -r requirements-dev.txt
    cd $resourcePathNode && pip3 install --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl
    git config --global --add safe.directory "*"

tests/integration/test_lists/test-db/l0_gb200.yml

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ l0_gb200:
     ranges:
       system_gpu_count:
         gte: 1
-        lte: 1
+        lte: 4
     wildcards:
       gpu:
         - '*gb200*'
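
Raising lte to 4 lets the l0_gb200 list run on the gb200-x4 queue used by the new GB200-PyTorch-2 stage; previously the range pinned the list to nodes with exactly one visible GPU. A small sketch of the gte/lte semantics as an inclusive window; the matcher name and defaults are assumptions:

// Hypothetical illustration of the test-db range semantics: a list applies
// when the node's GPU count falls inside the inclusive [gte, lte] window.
def inRange(int systemGpuCount, Map range) {
    return systemGpuCount >= (range.gte ?: 0) &&
           systemGpuCount <= (range.lte ?: Integer.MAX_VALUE)
}

assert inRange(1, [gte: 1, lte: 4])
assert inRange(4, [gte: 1, lte: 4])
assert !inRange(8, [gte: 1, lte: 4])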
