Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 15 additions & 13 deletions jenkins/L0_Test.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,8 @@ MODEL_CACHE_DIR="/scratch.trt_llm_data/llm-models"
ENABLE_NGC_DEVEL_IMAGE_TEST = params.enableNgcDevelImageTest ?: false
ENABLE_NGC_RELEASE_IMAGE_TEST = params.enableNgcReleaseImageTest ?: false

COMMON_SSH_OPTIONS = "-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null"

def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String stageName){
withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
def remote = [
Expand All @@ -113,7 +115,7 @@ def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String st
pipeline.stage('Submit Test Results') {
sh "mkdir -p ${stageName}"
def resultsFilePath = "/home/svc_tensorrt/bloom/scripts/${nodeName}/results/results.xml"
def downloadResultCmd = "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${remote.user}@${remote.host}:${resultsFilePath} ${stageName}/"
def downloadResultCmd = "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${remote.user}@${remote.host}:${resultsFilePath} ${stageName}/"
def downloadSucceed = sh(script: downloadResultCmd, returnStatus: true) == 0
if (downloadSucceed) {
sh "ls ${stageName}"
Expand Down Expand Up @@ -239,7 +241,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p

Utils.exec(pipeline, script: "chmod +x ${jenkinsSetupPath}", returnStdout: true)

Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${jenkinsSetupPath} ${remote.user}@${remote.host}:~/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh",)
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${jenkinsSetupPath} ${remote.user}@${remote.host}:~/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh",)

Utils.exec(
pipeline,
Expand Down Expand Up @@ -327,7 +329,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL

stage('Prepare Testing') {
// Create Job Workspace folder in Frontend Node
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' ssh -oStrictHostKeyChecking=no ${remote.user}@${remote.host} 'mkdir ${jobWorkspace}'",)
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' ssh ${COMMON_SSH_OPTIONS} ${remote.user}@${remote.host} 'mkdir -p ${jobWorkspace}'",)

// Download and Unzip Tar File
trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${llmPath} && wget -nv ${llmTarfile}")
Expand All @@ -336,11 +338,11 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
// Upload slurm_run_sh to Frontend node
def scriptRunLocalPath = "${llmSrcLocal}/jenkins/scripts/slurm_run.sh"
Utils.exec(pipeline, script: "chmod +x ${scriptRunLocalPath}", returnStdout: true)
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${scriptRunLocalPath} ${remote.user}@${remote.host}:${scriptRunNode}",)
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${scriptRunLocalPath} ${remote.user}@${remote.host}:${scriptRunNode}",)

// Upload waives.txt to Frontend node
def waivesListLocalPath = "${llmSrcLocal}/tests/integration/test_lists/waives.txt"
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${waivesListLocalPath} ${remote.user}@${remote.host}:${waivesListPathNode}",)
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${waivesListLocalPath} ${remote.user}@${remote.host}:${waivesListPathNode}",)

// Generate Test List and Upload to Frontend Node
def makoArgs = getMakoArgsFromStageName(stageName, true)
Expand All @@ -349,7 +351,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
// if the line cannot be split by "=", just ignore that line.
def makoOptsJson = transformMakoArgsToJson(["Mako options:"] + makoArgs)
def testListPath = renderTestDB(testList, llmSrcLocal, stageName, makoOptsJson)
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${testListPath} ${remote.user}@${remote.host}:${testListPathNode}",)
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${testListPath} ${remote.user}@${remote.host}:${testListPathNode}",)

// Generate Multi Node Job Launch Script
def container = LLM_DOCKER_IMAGE.replace("urm.nvidia.com/", "urm.nvidia.com#")
Expand Down Expand Up @@ -393,7 +395,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
""".stripIndent()
pipeline.writeFile(file: scriptLaunchDestPath, text: scriptContent)
Utils.exec(pipeline, script: "chmod +x ${scriptLaunchDestPath}", returnStdout: true)
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${scriptLaunchDestPath} ${remote.user}@${remote.host}:${scriptLaunch}",)
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${scriptLaunchDestPath} ${remote.user}@${remote.host}:${scriptLaunch}",)
}
stage('Run Test') {
def scriptLaunch = "${jobWorkspace}/slurm_launch.sh"
Expand Down Expand Up @@ -1089,7 +1091,7 @@ def getSSHConnectionPorts(portConfigFile, stageName)
usernamePassword(credentialsId: 'tensorrt_llm_infra_debug_vm_01_credentials', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD'),
string(credentialsId: 'DEBUG_HOST_NAME', variable: 'HOST_NAME')
]) {
portUsage = sh(script: "ssh -v ${USERNAME}@${HOST_NAME} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null 'netstat -tuln'",returnStdout: true)
portUsage = sh(script: "ssh -v ${USERNAME}@${HOST_NAME} ${COMMON_SSH_OPTIONS} 'netstat -tuln'", returnStdout: true)
}
echo "Port Usage: ${portUsage}"

Expand Down Expand Up @@ -1248,7 +1250,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
def llmRootConfig = "${LLM_ROOT}${config}"
sh "mkdir ${llmRootConfig}"

def llmPath = sh (script: "realpath ${llmRootConfig}",returnStdout: true).trim()
def llmPath = sh (script: "realpath ${llmRootConfig}", returnStdout: true).trim()
def llmSrc = "${llmPath}/TensorRT-LLM/src"
echoNodeAndGpuInfo(pipeline, stageName)

Expand Down Expand Up @@ -1362,9 +1364,9 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
usernamePassword(credentialsId: 'tensorrt_llm_infra_debug_vm_01_credentials', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD'),
string(credentialsId: 'DEBUG_HOST_NAME', variable: 'HOST_NAME')
]) {
sh "sshpass -p ${PASSWORD} -v ssh ${USERNAME}@${HOST_NAME} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null 'cat >> ~/.ssh/authorized_keys' < ~/.ssh/id_rsa.pub"
sh "ssh -v ${USERNAME}@${HOST_NAME} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null 'echo \"\" > ~/.ssh/known_hosts && cat ~/.ssh/id_rsa.pub' >> ~/.ssh/authorized_keys"
sh "ssh -v ${USERNAME}@${HOST_NAME} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null 'cat ~/.ssh/ports_config.txt' >> ${portConfigFilePath}"
sh "sshpass -p ${PASSWORD} -v ssh ${USERNAME}@${HOST_NAME} ${COMMON_SSH_OPTIONS} 'cat >> ~/.ssh/authorized_keys' < ~/.ssh/id_rsa.pub"
sh "ssh -v ${USERNAME}@${HOST_NAME} ${COMMON_SSH_OPTIONS} 'echo \"\" > ~/.ssh/known_hosts && cat ~/.ssh/id_rsa.pub' >> ~/.ssh/authorized_keys"
sh "ssh -v ${USERNAME}@${HOST_NAME} ${COMMON_SSH_OPTIONS} 'cat ~/.ssh/ports_config.txt' >> ${portConfigFilePath}"

def (int userPort, int monitorPort) = getSSHConnectionPorts(portConfigFilePath, stageName)
if (userPort == 0) {
Expand All @@ -1373,7 +1375,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
return
}

sh "ssh -f -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -L 1111:127.0.0.1:${monitorPort} -R ${monitorPort}:127.0.0.1:1112 -NR ${userPort}:localhost:22 ${USERNAME}@${HOST_NAME}"
sh "ssh -f ${COMMON_SSH_OPTIONS} -L 1111:127.0.0.1:${monitorPort} -R ${monitorPort}:127.0.0.1:1112 -NR ${userPort}:localhost:22 ${USERNAME}@${HOST_NAME}"
sh "autossh -fNR ${userPort}:localhost:22 ${USERNAME}@${HOST_NAME}"
sh "ps aux | grep ssh"
try {
Expand Down