diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 43b86df937f..4a8c8e9267f 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -495,6 +495,17 @@ if(ENABLE_UCX) if(NOT ${ucx_FOUND}) set(ENABLE_UCX 0) else() + if(DEFINED ENV{GITHUB_MIRROR} AND NOT "$ENV{GITHUB_MIRROR}" STREQUAL "") + if(EXISTS "${3RDPARTY_DIR}/ucxx/fetch_rapids.cmake") + file(READ "${3RDPARTY_DIR}/ucxx/fetch_rapids.cmake" FILE_CONTENTS) + string( + REPLACE "https://raw.githubusercontent.com/rapidsai/rapids-cmake" + "$ENV{GITHUB_MIRROR}/rapidsai/rapids-cmake/raw/refs/heads" + FILE_CONTENTS "${FILE_CONTENTS}") + file(WRITE "${3RDPARTY_DIR}/ucxx/fetch_rapids.cmake" "${FILE_CONTENTS}") + message(WARNING "Replace UCXX fetch_rapids.cmake with internal mirror") + endif() + endif() # installing ucxx via add_subdirectory results in strange cudart linking # error, thus using their installation script to isolate the installation # process until the issue is understood. And always trigger the build so diff --git a/jenkins/BuildDockerImage.groovy b/jenkins/BuildDockerImage.groovy index fea7d8261c9..64e03de476a 100644 --- a/jenkins/BuildDockerImage.groovy +++ b/jenkins/BuildDockerImage.groovy @@ -258,7 +258,7 @@ def buildImage(config, imageKeyToTag) // Step 2: Build the images stage ("Install packages") { sh "pwd && ls -alh" - sh "env" + sh "env | sort" sh "apk add make git" sh "git config --global --add safe.directory '*'" @@ -281,12 +281,12 @@ def buildImage(config, imageKeyToTag) try { def build_jobs = BUILD_JOBS // Fix the triton image pull timeout issue - def BASE_IMAGE = sh(script: "cd ${LLM_ROOT} && grep 'ARG BASE_IMAGE=' docker/Dockerfile.multi | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim() - def TRITON_IMAGE = sh(script: "cd ${LLM_ROOT} && grep 'ARG TRITON_IMAGE=' docker/Dockerfile.multi | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim() - def TRITON_BASE_TAG = sh(script: "cd ${LLM_ROOT} && grep 'ARG TRITON_BASE_TAG=' docker/Dockerfile.multi | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim() + def BASE_IMAGE = sh(script: "cd ${LLM_ROOT} && grep '^ARG BASE_IMAGE=' docker/Dockerfile.multi | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim() + def TRITON_IMAGE = sh(script: "cd ${LLM_ROOT} && grep '^ARG TRITON_IMAGE=' docker/Dockerfile.multi | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim() + def TRITON_BASE_TAG = sh(script: "cd ${LLM_ROOT} && grep '^ARG TRITON_BASE_TAG=' docker/Dockerfile.multi | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim() if (target == "rockylinux8") { - BASE_IMAGE = sh(script: "cd ${LLM_ROOT} && grep 'jenkins-rockylinux8_%: BASE_IMAGE =' docker/Makefile | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim() + BASE_IMAGE = sh(script: "cd ${LLM_ROOT} && grep '^jenkins-rockylinux8_%: BASE_IMAGE =' docker/Makefile | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim() } // Replace the base image and triton image with the internal mirror @@ -295,7 +295,8 @@ def buildImage(config, imageKeyToTag) if (dependent) { stage ("make ${dependent.target}_${action} (${arch})") { - trtllm_utils.llmExecStepWithRetry(this, script: "docker pull ${TRITON_IMAGE}:${TRITON_BASE_TAG}", sleepInSecs: 300, shortCommondRunTimeMax: 7200) + def randomSleep = (Math.random() * 300 + 300).toInteger() + trtllm_utils.llmExecStepWithRetry(this, script: "docker pull ${TRITON_IMAGE}:${TRITON_BASE_TAG}", sleepInSecs: randomSleep, shortCommondRunTimeMax: 7200) trtllm_utils.llmExecStepWithRetry(this, script: """ cd ${LLM_ROOT} && make -C docker ${dependent.target}_${action} \ BASE_IMAGE=${BASE_IMAGE} \ @@ -304,7 +305,7 @@ def buildImage(config, imageKeyToTag) IMAGE_WITH_TAG=${dependentImageWithTag} \ STAGE=${dependent.dockerfileStage} \ BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args} - """, sleepInSecs: 300, shortCommondRunTimeMax: 7200) + """, sleepInSecs: randomSleep, numRetries: 3, shortCommondRunTimeMax: 7200) args += " DEVEL_IMAGE=${dependentImageWithTag}" if (target == "ngc-release") { imageKeyToTag["NGC Devel Image ${config.arch}"] = dependentImageWithTag @@ -322,7 +323,9 @@ def buildImage(config, imageKeyToTag) } } stage ("make ${target}_${action} (${arch})") { - trtllm_utils.llmExecStepWithRetry(this, script: "docker pull ${TRITON_IMAGE}:${TRITON_BASE_TAG}", sleepInSecs: 300, shortCommondRunTimeMax: 7200) + sh "env | sort" + def randomSleep = (Math.random() * 300 + 300).toInteger() + trtllm_utils.llmExecStepWithRetry(this, script: "docker pull ${TRITON_IMAGE}:${TRITON_BASE_TAG}", sleepInSecs: randomSleep, shortCommondRunTimeMax: 7200) trtllm_utils.llmExecStepWithRetry(this, script: """ cd ${LLM_ROOT} && make -C docker ${target}_${action} \ BASE_IMAGE=${BASE_IMAGE} \ @@ -331,7 +334,7 @@ def buildImage(config, imageKeyToTag) IMAGE_WITH_TAG=${imageWithTag} \ STAGE=${dockerfileStage} \ BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args} - """, sleepInSecs: 300, shortCommondRunTimeMax: 7200) + """, sleepInSecs: randomSleep, numRetries: 3, shortCommondRunTimeMax: 7200) if (target == "ngc-release") { imageKeyToTag["NGC Release Image ${config.arch}"] = imageWithTag } diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index 22a55693606..315eb151295 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -99,6 +99,8 @@ MODEL_CACHE_DIR="/scratch.trt_llm_data/llm-models" ENABLE_NGC_DEVEL_IMAGE_TEST = params.enableNgcDevelImageTest ?: false ENABLE_NGC_RELEASE_IMAGE_TEST = params.enableNgcReleaseImageTest ?: false +COMMON_SSH_OPTIONS = "-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null" + def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String stageName){ withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) { def remote = [ @@ -113,7 +115,7 @@ def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String st pipeline.stage('Submit Test Results') { sh "mkdir -p ${stageName}" def resultsFilePath = "/home/svc_tensorrt/bloom/scripts/${nodeName}/results/results.xml" - def downloadResultCmd = "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${remote.user}@${remote.host}:${resultsFilePath} ${stageName}/" + def downloadResultCmd = "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${remote.user}@${remote.host}:${resultsFilePath} ${stageName}/" def downloadSucceed = sh(script: downloadResultCmd, returnStatus: true) == 0 if (downloadSucceed) { sh "ls ${stageName}" @@ -239,7 +241,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p Utils.exec(pipeline, script: "chmod +x ${jenkinsSetupPath}", returnStdout: true) - Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${jenkinsSetupPath} ${remote.user}@${remote.host}:~/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh",) + Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${jenkinsSetupPath} ${remote.user}@${remote.host}:~/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh",) Utils.exec( pipeline, @@ -327,7 +329,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL stage('Prepare Testing') { // Create Job Workspace folder in Frontend Node - Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' ssh -oStrictHostKeyChecking=no ${remote.user}@${remote.host} 'mkdir ${jobWorkspace}'",) + Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' ssh ${COMMON_SSH_OPTIONS} ${remote.user}@${remote.host} 'mkdir -p ${jobWorkspace}'",) // Download and Unzip Tar File trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${llmPath} && wget -nv ${llmTarfile}") @@ -336,11 +338,11 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL // Upload slurm_run_sh to Frontend node def scriptRunLocalPath = "${llmSrcLocal}/jenkins/scripts/slurm_run.sh" Utils.exec(pipeline, script: "chmod +x ${scriptRunLocalPath}", returnStdout: true) - Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${scriptRunLocalPath} ${remote.user}@${remote.host}:${scriptRunNode}",) + Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${scriptRunLocalPath} ${remote.user}@${remote.host}:${scriptRunNode}",) // Upload waives.txt to Frontend node def waivesListLocalPath = "${llmSrcLocal}/tests/integration/test_lists/waives.txt" - Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${waivesListLocalPath} ${remote.user}@${remote.host}:${waivesListPathNode}",) + Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${waivesListLocalPath} ${remote.user}@${remote.host}:${waivesListPathNode}",) // Generate Test List and Upload to Frontend Node def makoArgs = getMakoArgsFromStageName(stageName, true) @@ -349,7 +351,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL // if the line cannot be split by "=", just ignore that line. def makoOptsJson = transformMakoArgsToJson(["Mako options:"] + makoArgs) def testListPath = renderTestDB(testList, llmSrcLocal, stageName, makoOptsJson) - Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${testListPath} ${remote.user}@${remote.host}:${testListPathNode}",) + Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${testListPath} ${remote.user}@${remote.host}:${testListPathNode}",) // Generate Multi Node Job Launch Script def container = LLM_DOCKER_IMAGE.replace("urm.nvidia.com/", "urm.nvidia.com#") @@ -393,7 +395,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL """.stripIndent() pipeline.writeFile(file: scriptLaunchDestPath, text: scriptContent) Utils.exec(pipeline, script: "chmod +x ${scriptLaunchDestPath}", returnStdout: true) - Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${scriptLaunchDestPath} ${remote.user}@${remote.host}:${scriptLaunch}",) + Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${scriptLaunchDestPath} ${remote.user}@${remote.host}:${scriptLaunch}",) } stage('Run Test') { def scriptLaunch = "${jobWorkspace}/slurm_launch.sh" @@ -1089,7 +1091,7 @@ def getSSHConnectionPorts(portConfigFile, stageName) usernamePassword(credentialsId: 'tensorrt_llm_infra_debug_vm_01_credentials', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD'), string(credentialsId: 'DEBUG_HOST_NAME', variable: 'HOST_NAME') ]) { - portUsage = sh(script: "ssh -v ${USERNAME}@${HOST_NAME} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null 'netstat -tuln'",returnStdout: true) + portUsage = sh(script: "ssh -v ${USERNAME}@${HOST_NAME} ${COMMON_SSH_OPTIONS} 'netstat -tuln'", returnStdout: true) } echo "Port Usage: ${portUsage}" @@ -1248,7 +1250,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO def llmRootConfig = "${LLM_ROOT}${config}" sh "mkdir ${llmRootConfig}" - def llmPath = sh (script: "realpath ${llmRootConfig}",returnStdout: true).trim() + def llmPath = sh (script: "realpath ${llmRootConfig}", returnStdout: true).trim() def llmSrc = "${llmPath}/TensorRT-LLM/src" echoNodeAndGpuInfo(pipeline, stageName) @@ -1362,9 +1364,9 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO usernamePassword(credentialsId: 'tensorrt_llm_infra_debug_vm_01_credentials', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD'), string(credentialsId: 'DEBUG_HOST_NAME', variable: 'HOST_NAME') ]) { - sh "sshpass -p ${PASSWORD} -v ssh ${USERNAME}@${HOST_NAME} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null 'cat >> ~/.ssh/authorized_keys' < ~/.ssh/id_rsa.pub" - sh "ssh -v ${USERNAME}@${HOST_NAME} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null 'echo \"\" > ~/.ssh/known_hosts && cat ~/.ssh/id_rsa.pub' >> ~/.ssh/authorized_keys" - sh "ssh -v ${USERNAME}@${HOST_NAME} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null 'cat ~/.ssh/ports_config.txt' >> ${portConfigFilePath}" + sh "sshpass -p ${PASSWORD} -v ssh ${USERNAME}@${HOST_NAME} ${COMMON_SSH_OPTIONS} 'cat >> ~/.ssh/authorized_keys' < ~/.ssh/id_rsa.pub" + sh "ssh -v ${USERNAME}@${HOST_NAME} ${COMMON_SSH_OPTIONS} 'echo \"\" > ~/.ssh/known_hosts && cat ~/.ssh/id_rsa.pub' >> ~/.ssh/authorized_keys" + sh "ssh -v ${USERNAME}@${HOST_NAME} ${COMMON_SSH_OPTIONS} 'cat ~/.ssh/ports_config.txt' >> ${portConfigFilePath}" def (int userPort, int monitorPort) = getSSHConnectionPorts(portConfigFilePath, stageName) if (userPort == 0) { @@ -1373,7 +1375,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO return } - sh "ssh -f -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -L 1111:127.0.0.1:${monitorPort} -R ${monitorPort}:127.0.0.1:1112 -NR ${userPort}:localhost:22 ${USERNAME}@${HOST_NAME}" + sh "ssh -f ${COMMON_SSH_OPTIONS} -L 1111:127.0.0.1:${monitorPort} -R ${monitorPort}:127.0.0.1:1112 -NR ${userPort}:localhost:22 ${USERNAME}@${HOST_NAME}" sh "autossh -fNR ${userPort}:localhost:22 ${USERNAME}@${HOST_NAME}" sh "ps aux | grep ssh" try {