11 changes: 11 additions & 0 deletions cpp/CMakeLists.txt
@@ -495,6 +495,17 @@ if(ENABLE_UCX)
if(NOT ${ucx_FOUND})
set(ENABLE_UCX 0)
else()
if(DEFINED ENV{GITHUB_MIRROR} AND NOT "$ENV{GITHUB_MIRROR}" STREQUAL "")
if(EXISTS "${3RDPARTY_DIR}/ucxx/fetch_rapids.cmake")
file(READ "${3RDPARTY_DIR}/ucxx/fetch_rapids.cmake" FILE_CONTENTS)
string(
REPLACE "https://raw.githubusercontent.com/rapidsai/rapids-cmake"
"$ENV{GITHUB_MIRROR}/rapidsai/rapids-cmake/raw/refs/heads"
FILE_CONTENTS "${FILE_CONTENTS}")
file(WRITE "${3RDPARTY_DIR}/ucxx/fetch_rapids.cmake" "${FILE_CONTENTS}")
message(WARNING "Replace UCXX fetch_rapids.cmake with internal mirror")
endif()
endif()
# installing ucxx via add_subdirectory results in strange cudart linking
# error, thus using their installation script to isolate the installation
# process until the issue is understood. And always trigger the build so
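When a non-empty GITHUB_MIRROR environment variable is present, the added block rewrites the ucxx checkout's fetch_rapids.cmake in place so that rapids-cmake is fetched from the internal mirror instead of raw.githubusercontent.com. A minimal sketch of how a pipeline stage might export that variable before configuring; the mirror URL and the cmake invocation are illustrative assumptions, not part of this change:

// Hedged sketch: export GITHUB_MIRROR for the CMake configure step.
// The mirror URL below is a placeholder, not a real endpoint.
withEnv(["GITHUB_MIRROR=https://github-mirror.internal.example.com"]) {
    sh "cmake -S cpp -B cpp/build -DENABLE_UCX=ON"
}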
21 changes: 12 additions & 9 deletions jenkins/BuildDockerImage.groovy
@@ -258,7 +258,7 @@ def buildImage(config, imageKeyToTag)
// Step 2: Build the images
stage ("Install packages") {
sh "pwd && ls -alh"
sh "env"
sh "env | sort"
sh "apk add make git"
sh "git config --global --add safe.directory '*'"

@@ -281,12 +281,12 @@ def buildImage(config, imageKeyToTag)
try {
def build_jobs = BUILD_JOBS
// Fix the triton image pull timeout issue
def BASE_IMAGE = sh(script: "cd ${LLM_ROOT} && grep 'ARG BASE_IMAGE=' docker/Dockerfile.multi | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim()
def TRITON_IMAGE = sh(script: "cd ${LLM_ROOT} && grep 'ARG TRITON_IMAGE=' docker/Dockerfile.multi | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim()
def TRITON_BASE_TAG = sh(script: "cd ${LLM_ROOT} && grep 'ARG TRITON_BASE_TAG=' docker/Dockerfile.multi | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim()
def BASE_IMAGE = sh(script: "cd ${LLM_ROOT} && grep '^ARG BASE_IMAGE=' docker/Dockerfile.multi | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim()
def TRITON_IMAGE = sh(script: "cd ${LLM_ROOT} && grep '^ARG TRITON_IMAGE=' docker/Dockerfile.multi | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim()
def TRITON_BASE_TAG = sh(script: "cd ${LLM_ROOT} && grep '^ARG TRITON_BASE_TAG=' docker/Dockerfile.multi | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim()

if (target == "rockylinux8") {
BASE_IMAGE = sh(script: "cd ${LLM_ROOT} && grep 'jenkins-rockylinux8_%: BASE_IMAGE =' docker/Makefile | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim()
BASE_IMAGE = sh(script: "cd ${LLM_ROOT} && grep '^jenkins-rockylinux8_%: BASE_IMAGE =' docker/Makefile | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim()
}

// Replace the base image and triton image with the internal mirror
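The added ^ anchors restrict these grep patterns to lines that actually begin with the ARG declaration (or the jenkins-rockylinux8_% Makefile target), so commented-out or indented occurrences of the same names elsewhere in the file can no longer leak into the extracted values. A minimal sketch of the same idea in pure Groovy; the file path, variable names, and regex are illustrative assumptions rather than part of this change:

// Hedged sketch: parse the default value of ARG BASE_IMAGE from a Dockerfile
// with an anchored, multi-line regex instead of shelling out to grep.
def dockerfileText = readFile("docker/Dockerfile.multi")
def matcher = dockerfileText =~ /(?m)^ARG BASE_IMAGE=["']?([^"'\s]+)/
def baseImage = matcher.find() ? matcher.group(1) : null
echo "BASE_IMAGE default: ${baseImage}"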
@@ -295,7 +295,8 @@ def buildImage(config, imageKeyToTag)

if (dependent) {
stage ("make ${dependent.target}_${action} (${arch})") {
trtllm_utils.llmExecStepWithRetry(this, script: "docker pull ${TRITON_IMAGE}:${TRITON_BASE_TAG}", sleepInSecs: 300, shortCommondRunTimeMax: 7200)
def randomSleep = (Math.random() * 300 + 300).toInteger()
trtllm_utils.llmExecStepWithRetry(this, script: "docker pull ${TRITON_IMAGE}:${TRITON_BASE_TAG}", sleepInSecs: randomSleep, shortCommondRunTimeMax: 7200)
trtllm_utils.llmExecStepWithRetry(this, script: """
cd ${LLM_ROOT} && make -C docker ${dependent.target}_${action} \
BASE_IMAGE=${BASE_IMAGE} \
@@ -304,7 +305,7 @@
IMAGE_WITH_TAG=${dependentImageWithTag} \
STAGE=${dependent.dockerfileStage} \
BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args}
""", sleepInSecs: 300, shortCommondRunTimeMax: 7200)
""", sleepInSecs: randomSleep, numRetries: 3, shortCommondRunTimeMax: 7200)
args += " DEVEL_IMAGE=${dependentImageWithTag}"
if (target == "ngc-release") {
imageKeyToTag["NGC Devel Image ${config.arch}"] = dependentImageWithTag
@@ -322,7 +323,9 @@ }
}
}
stage ("make ${target}_${action} (${arch})") {
trtllm_utils.llmExecStepWithRetry(this, script: "docker pull ${TRITON_IMAGE}:${TRITON_BASE_TAG}", sleepInSecs: 300, shortCommondRunTimeMax: 7200)
sh "env | sort"
def randomSleep = (Math.random() * 300 + 300).toInteger()
trtllm_utils.llmExecStepWithRetry(this, script: "docker pull ${TRITON_IMAGE}:${TRITON_BASE_TAG}", sleepInSecs: randomSleep, shortCommondRunTimeMax: 7200)
trtllm_utils.llmExecStepWithRetry(this, script: """
cd ${LLM_ROOT} && make -C docker ${target}_${action} \
BASE_IMAGE=${BASE_IMAGE} \
@@ -331,7 +334,7 @@
IMAGE_WITH_TAG=${imageWithTag} \
STAGE=${dockerfileStage} \
BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args}
""", sleepInSecs: 300, shortCommondRunTimeMax: 7200)
""", sleepInSecs: randomSleep, numRetries: 3, shortCommondRunTimeMax: 7200)
if (target == "ngc-release") {
imageKeyToTag["NGC Release Image ${config.arch}"] = imageWithTag
}
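Two retry-related changes run through the hunks above: the fixed 300-second sleep between docker pull and make attempts is replaced by a per-stage randomSleep of roughly 300 to 600 seconds (Math.random() * 300 + 300), which spreads concurrent retries against the same registry apart in time, and the make steps now pass numRetries: 3 explicitly. A minimal sketch of the same jittered-retry idea; the helper name and loop are illustrative assumptions, not the trtllm_utils.llmExecStepWithRetry implementation:

// Hedged sketch: retry a shell step with randomized backoff so that
// parallel builds do not all retry at the same moment.
def retryWithJitter(String command, int numRetries = 3) {
    for (int attempt = 1; attempt <= numRetries; attempt++) {
        try {
            sh command
            return
        } catch (err) {
            if (attempt == numRetries) { throw err }
            int jitterSecs = (Math.random() * 300 + 300).toInteger()  // 300-600 s
            echo "Attempt ${attempt} failed; sleeping ${jitterSecs}s before retrying"
            sleep jitterSecs
        }
    }
}

// Example usage (the image tag is a placeholder):
retryWithJitter("docker pull nvcr.io/nvidia/tritonserver:placeholder-tag")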
28 changes: 15 additions & 13 deletions jenkins/L0_Test.groovy
@@ -99,6 +99,8 @@ MODEL_CACHE_DIR="/scratch.trt_llm_data/llm-models"
ENABLE_NGC_DEVEL_IMAGE_TEST = params.enableNgcDevelImageTest ?: false
ENABLE_NGC_RELEASE_IMAGE_TEST = params.enableNgcReleaseImageTest ?: false

COMMON_SSH_OPTIONS = "-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null"
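StrictHostKeyChecking=no together with UserKnownHostsFile=/dev/null skips host-key verification and avoids polluting the agent's known_hosts, which suits the ephemeral Slurm frontend connections made throughout this file. Collecting both flags in COMMON_SSH_OPTIONS replaces the literal option strings repeated below. A minimal sketch of a wrapper that could build on the constant; the helper name and parameters are illustrative assumptions, not part of this change:

// Hedged sketch: centralize the sshpass + ssh invocation pattern used in
// this file. remote is expected to carry user, host, and passwd fields.
def sshExec(def pipeline, Map remote, String command) {
    pipeline.sh "sshpass -p '${remote.passwd}' ssh ${COMMON_SSH_OPTIONS} ${remote.user}@${remote.host} '${command}'"
}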

def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String stageName){
withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
def remote = [
@@ -113,7 +115,7 @@ def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String st
pipeline.stage('Submit Test Results') {
sh "mkdir -p ${stageName}"
def resultsFilePath = "/home/svc_tensorrt/bloom/scripts/${nodeName}/results/results.xml"
def downloadResultCmd = "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${remote.user}@${remote.host}:${resultsFilePath} ${stageName}/"
def downloadResultCmd = "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${remote.user}@${remote.host}:${resultsFilePath} ${stageName}/"
def downloadSucceed = sh(script: downloadResultCmd, returnStatus: true) == 0
if (downloadSucceed) {
sh "ls ${stageName}"
@@ -239,7 +241,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p

Utils.exec(pipeline, script: "chmod +x ${jenkinsSetupPath}", returnStdout: true)

Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${jenkinsSetupPath} ${remote.user}@${remote.host}:~/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh",)
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${jenkinsSetupPath} ${remote.user}@${remote.host}:~/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh",)

Utils.exec(
pipeline,
@@ -327,7 +329,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL

stage('Prepare Testing') {
// Create Job Workspace folder in Frontend Node
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' ssh -oStrictHostKeyChecking=no ${remote.user}@${remote.host} 'mkdir ${jobWorkspace}'",)
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' ssh ${COMMON_SSH_OPTIONS} ${remote.user}@${remote.host} 'mkdir -p ${jobWorkspace}'",)

// Download and Unzip Tar File
trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${llmPath} && wget -nv ${llmTarfile}")
@@ -336,11 +338,11 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
// Upload slurm_run_sh to Frontend node
def scriptRunLocalPath = "${llmSrcLocal}/jenkins/scripts/slurm_run.sh"
Utils.exec(pipeline, script: "chmod +x ${scriptRunLocalPath}", returnStdout: true)
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${scriptRunLocalPath} ${remote.user}@${remote.host}:${scriptRunNode}",)
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${scriptRunLocalPath} ${remote.user}@${remote.host}:${scriptRunNode}",)

// Upload waives.txt to Frontend node
def waivesListLocalPath = "${llmSrcLocal}/tests/integration/test_lists/waives.txt"
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${waivesListLocalPath} ${remote.user}@${remote.host}:${waivesListPathNode}",)
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${waivesListLocalPath} ${remote.user}@${remote.host}:${waivesListPathNode}",)

// Generate Test List and Upload to Frontend Node
def makoArgs = getMakoArgsFromStageName(stageName, true)
@@ -349,7 +351,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
// if the line cannot be split by "=", just ignore that line.
def makoOptsJson = transformMakoArgsToJson(["Mako options:"] + makoArgs)
def testListPath = renderTestDB(testList, llmSrcLocal, stageName, makoOptsJson)
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${testListPath} ${remote.user}@${remote.host}:${testListPathNode}",)
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${testListPath} ${remote.user}@${remote.host}:${testListPathNode}",)

// Generate Multi Node Job Launch Script
def container = LLM_DOCKER_IMAGE.replace("urm.nvidia.com/", "urm.nvidia.com#")
@@ -393,7 +395,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
""".stripIndent()
pipeline.writeFile(file: scriptLaunchDestPath, text: scriptContent)
Utils.exec(pipeline, script: "chmod +x ${scriptLaunchDestPath}", returnStdout: true)
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${scriptLaunchDestPath} ${remote.user}@${remote.host}:${scriptLaunch}",)
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${scriptLaunchDestPath} ${remote.user}@${remote.host}:${scriptLaunch}",)
}
stage('Run Test') {
def scriptLaunch = "${jobWorkspace}/slurm_launch.sh"
@@ -1089,7 +1091,7 @@ def getSSHConnectionPorts(portConfigFile, stageName)
usernamePassword(credentialsId: 'tensorrt_llm_infra_debug_vm_01_credentials', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD'),
string(credentialsId: 'DEBUG_HOST_NAME', variable: 'HOST_NAME')
]) {
portUsage = sh(script: "ssh -v ${USERNAME}@${HOST_NAME} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null 'netstat -tuln'",returnStdout: true)
portUsage = sh(script: "ssh -v ${USERNAME}@${HOST_NAME} ${COMMON_SSH_OPTIONS} 'netstat -tuln'", returnStdout: true)
}
echo "Port Usage: ${portUsage}"

@@ -1248,7 +1250,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
def llmRootConfig = "${LLM_ROOT}${config}"
sh "mkdir ${llmRootConfig}"

def llmPath = sh (script: "realpath ${llmRootConfig}",returnStdout: true).trim()
def llmPath = sh (script: "realpath ${llmRootConfig}", returnStdout: true).trim()
def llmSrc = "${llmPath}/TensorRT-LLM/src"
echoNodeAndGpuInfo(pipeline, stageName)

@@ -1362,9 +1364,9 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
usernamePassword(credentialsId: 'tensorrt_llm_infra_debug_vm_01_credentials', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD'),
string(credentialsId: 'DEBUG_HOST_NAME', variable: 'HOST_NAME')
]) {
sh "sshpass -p ${PASSWORD} -v ssh ${USERNAME}@${HOST_NAME} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null 'cat >> ~/.ssh/authorized_keys' < ~/.ssh/id_rsa.pub"
sh "ssh -v ${USERNAME}@${HOST_NAME} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null 'echo \"\" > ~/.ssh/known_hosts && cat ~/.ssh/id_rsa.pub' >> ~/.ssh/authorized_keys"
sh "ssh -v ${USERNAME}@${HOST_NAME} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null 'cat ~/.ssh/ports_config.txt' >> ${portConfigFilePath}"
sh "sshpass -p ${PASSWORD} -v ssh ${USERNAME}@${HOST_NAME} ${COMMON_SSH_OPTIONS} 'cat >> ~/.ssh/authorized_keys' < ~/.ssh/id_rsa.pub"
sh "ssh -v ${USERNAME}@${HOST_NAME} ${COMMON_SSH_OPTIONS} 'echo \"\" > ~/.ssh/known_hosts && cat ~/.ssh/id_rsa.pub' >> ~/.ssh/authorized_keys"
sh "ssh -v ${USERNAME}@${HOST_NAME} ${COMMON_SSH_OPTIONS} 'cat ~/.ssh/ports_config.txt' >> ${portConfigFilePath}"

def (int userPort, int monitorPort) = getSSHConnectionPorts(portConfigFilePath, stageName)
if (userPort == 0) {
@@ -1373,7 +1375,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
return
}

sh "ssh -f -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -L 1111:127.0.0.1:${monitorPort} -R ${monitorPort}:127.0.0.1:1112 -NR ${userPort}:localhost:22 ${USERNAME}@${HOST_NAME}"
sh "ssh -f ${COMMON_SSH_OPTIONS} -L 1111:127.0.0.1:${monitorPort} -R ${monitorPort}:127.0.0.1:1112 -NR ${userPort}:localhost:22 ${USERNAME}@${HOST_NAME}"
sh "autossh -fNR ${userPort}:localhost:22 ${USERNAME}@${HOST_NAME}"
sh "ps aux | grep ssh"
try {