diff --git a/jenkins/L0_MergeRequest.groovy b/jenkins/L0_MergeRequest.groovy
index de37cfbf551..198aad41ac7 100644
--- a/jenkins/L0_MergeRequest.groovy
+++ b/jenkins/L0_MergeRequest.groovy
@@ -80,6 +80,8 @@ def trimForStageList(stageNameList)
     return trimedList
 }
 
+@Field
+def REUSE_TEST = "reuse_test"
 @Field
 def REUSE_STAGE_LIST = "reuse_stage_list"
 @Field
@@ -114,6 +116,7 @@ def DEBUG_MODE = "debug"
 def DETAILED_LOG = "detailed_log"
 
 def testFilter = [
+    (REUSE_TEST): gitlabParamsFromBot.get(REUSE_TEST, null),
     (REUSE_STAGE_LIST): trimForStageList(gitlabParamsFromBot.get(REUSE_STAGE_LIST, null)?.tokenize(',')),
     (ENABLE_SKIP_TEST): gitlabParamsFromBot.get((ENABLE_SKIP_TEST), false),
     (TEST_STAGE_LIST): trimForStageList(gitlabParamsFromBot.get((TEST_STAGE_LIST), null)?.tokenize(',')),
diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy
index dea0b1dc4f7..6a5ccd3d33a 100644
--- a/jenkins/L0_Test.groovy
+++ b/jenkins/L0_Test.groovy
@@ -728,6 +728,7 @@ def getNodeArgs(int nodeCount, int gpuCount) {
 def getPytestBaseCommandLine(
     String llmSrc,
     String stageName,
+    String waivesFilePath,
     Boolean perfMode,
     String outputPath,
     String trtllmWheelPath,
@@ -748,6 +749,7 @@ def getPytestBaseCommandLine(
         "LLM_BACKEND_ROOT=${llmSrc}/triton_backend",
         "LLM_MODELS_ROOT=${MODEL_CACHE_DIR}",
         "MODEL_CACHE_DIR=${MODEL_CACHE_DIR}",
+        "COLUMNS=200",
         extraInternalEnv,
         pytestUtil,
         "pytest",
@@ -758,7 +760,7 @@ def getPytestBaseCommandLine(
         "--timeout=${pytestTestTimeout}",
         "--rootdir ${llmSrc}/tests/integration/defs",
         "--test-prefix=${stageName}",
-        "--waives-file=${llmSrc}/tests/integration/test_lists/waives.txt",
+        "--waives-file=${waivesFilePath}",
         "--output-dir=${outputPath}/",
         "--csv=${outputPath}/report.csv",
         "--junit-xml ${outputPath}/results.xml",
@@ -871,6 +873,21 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
         testListPathNode
     )
 
+    // Download and merge waives.txt
+    mergeWaivesTxt(pipeline, llmSrcLocal, stageName)
+
+    // Add passed test list from previous pipeline run to the waives.txt
+    if (testFilter[(REUSE_TEST)] != false) {
+        reusePassedTestResults(llmSrcLocal, stageName, testListPathLocal, "${llmSrcLocal}/tests/integration/test_lists/waives.txt")
+    }
+
+    Utils.copyFileToRemoteHost(
+        pipeline,
+        remote,
+        "${llmSrcLocal}/tests/integration/test_lists/waives.txt",
+        waivesListPathNode
+    )
+
     // generate .coveragerc in workspace and add file path to pytest command
     sh """
         touch ./.coveragerc
@@ -898,6 +915,7 @@
     def pytestCommand = getPytestBaseCommandLine(
         llmSrcNode,
         stageName,
+        waivesListPathNode,
         perfMode,
         jobWorkspace,
         "__PLACEHOLDER_TRTLLM_WHL_PATH__",
@@ -1046,6 +1064,8 @@ def trimForStageList(stageNameList)
 
 // Test filter flags
 @Field
+def REUSE_TEST = "reuse_test"
+@Field
 def REUSE_STAGE_LIST = "reuse_stage_list"
 @Field
 def ENABLE_SKIP_TEST = "skip_test"
@@ -1077,6 +1097,7 @@ def DEBUG_MODE = "debug"
 def DETAILED_LOG = "detailed_log"
 @Field
 def testFilter = [
+    (REUSE_TEST): null,
     (REUSE_STAGE_LIST): null,
     (ENABLE_SKIP_TEST): false,
     (TEST_STAGE_LIST): null,
@@ -1998,6 +2019,76 @@ def generateRerunReport(stageName, llmSrc) {
     echo "Rerun report generation completed for stage: ${stageName}"
 }
 
+def mergeWaivesTxt(pipeline, llmSrc, stageName) {
+    def waivesTxt = "https://urm.nvidia.com/artifactory/${ARTIFACT_PATH}/waive_list/waives.txt"
+    try {
+        trtllm_utils.llmExecStepWithRetry(pipeline, script: "wget -nv ${waivesTxt}")
+        if (!fileExists("waives.txt")) {
+            error "There is no merged waives.txt file, using the default waives.txt."
+        }
+        sh "rm ${llmSrc}/tests/integration/test_lists/waives.txt"
+        sh "mv waives.txt ${llmSrc}/tests/integration/test_lists/waives.txt"
+        echo "Downloaded merged waives.txt successfully"
+    } catch (InterruptedException e) {
+        throw e
+    } catch (Exception e) {
+        echo "Failed to download merged waives.txt, using the default waives.txt. Error: ${e.message}"
+    }
+}
+
+def reusePassedTestResults(llmSrc, stageName, testDBList, waivesTxt) {
+    try {
+        // Get passed test list from open search
+        def passedTestListFile = "${WORKSPACE}/${stageName}/passed_test_list.txt"
+        sh """
+            python3 ${llmSrc}/jenkins/scripts/open_search_query.py \
+                --commit-id ${env.gitlabCommit} \
+                --stage-name ${stageName} \
+                --output-file ${passedTestListFile}
+        """
+
+        def passedTestList = readFile(file: passedTestListFile).readLines()
+        // Read the original test list
+        def originalTestLines = readFile(file: testDBList).readLines()
+
+        def reusedTests = []
+        for (originalLine in originalTestLines) {
+            def testLine = originalLine.trim()
+            if (testLine) {
+                for (passedTest in passedTestList) {
+                    passedTest = passedTest.trim()
+                    if (testLine.contains(passedTest)) {
+                        reusedTests.add(passedTest)
+                        break
+                    }
+                }
+            }
+        }
+
+        // Append reused tests to waives.txt
+        if (reusedTests.size() > 0) {
+            sh(label: "Reused Tests", script: "echo \"Reused tests:\n${reusedTests.join('\n')}\"")
+
+            // Build the content to append
+            def reusedTestsContent = reusedTests.collect { test ->
+                "${test} SKIP (Reused from previous pipeline)"
+            }.join('\n')
+
+            // Use heredoc to append content directly without intermediate files;
+            // the delimiter must start at column 0 or the shell cannot find it
+            sh """cat >> ${waivesTxt} << 'REUSED_TESTS_EOF'
+${reusedTestsContent}
+REUSED_TESTS_EOF
+"""
+            sh(label: "Updated Waives File", script: "echo \"Appended ${reusedTests.size()} reused tests to ${waivesTxt}\"")
+        }
+    } catch (InterruptedException e) {
+        throw e
+    } catch (Exception e) {
+        echo "Failed to add passed test list from previous pipeline run to the waives.txt. Error: ${e.message}"
+    }
+}
+
 def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CONFIG, perfMode=false, stageName="Undefined", splitId=1, splits=1, skipInstallWheel=false, cpver="cp312")
 {
     // Step 1: create LLM_ROOT dir and clean up the workspace
@@ -2052,22 +2143,6 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
     trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${llmPath} && wget -nv ${llmTarfile}")
     sh "cd ${llmPath} && tar -zxf ${tarName}"
 
-    // Download the new merged waives.txt
-    def waivesTxt = "https://urm.nvidia.com/artifactory/${ARTIFACT_PATH}/waive_list/waives.txt"
-    try {
-        trtllm_utils.llmExecStepWithRetry(pipeline, script: "wget -nv ${waivesTxt}")
-        if (!fileExists("waives.txt")) {
-            error "There is no merged waives.txt file, use the default waives.txt."
-        }
-        sh "rm ${llmSrc}/tests/integration/test_lists/waives.txt"
-        sh "mv waives.txt ${llmSrc}/tests/integration/test_lists/waives.txt"
-        echo "Download merged waives.txt successfully"
-    } catch (InterruptedException e) {
-        throw e
-    } catch (Exception e) {
-        echo "Failed to download merged waives.txt, use the default waives.txt. Error: ${e.message}"
-    }
-
     // install python package
     if (env.alternativeTRT) {
         sh "cd ${llmSrc} && sed -i 's#tensorrt~=.*\$#tensorrt#g' requirements.txt && cat requirements.txt"
@@ -2168,6 +2243,14 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
 
     def testDBList = renderTestDB(testList, llmSrc, stageName)
 
+    // Download and merge waives.txt
+    mergeWaivesTxt(pipeline, llmSrc, stageName)
+
+    // Add passed test list from previous pipeline run to the waives.txt
+    if (testFilter[(REUSE_TEST)] != false) {
+        reusePassedTestResults(llmSrc, stageName, testDBList, "${llmSrc}/tests/integration/test_lists/waives.txt")
+    }
+
     // Process shard test list and create separate files for regular and isolate tests
     def preprocessedLists = processShardTestList(llmSrc, testDBList, splitId, splits, perfMode)
 
@@ -2190,6 +2273,7 @@
     def pytestCommand = getPytestBaseCommandLine(
         llmSrc,
         stageName,
+        "${llmSrc}/tests/integration/test_lists/waives.txt",
         perfMode,
         "${WORKSPACE}/${stageName}",
         TRTLLM_WHL_PATH,
diff --git a/jenkins/scripts/open_search_query.py b/jenkins/scripts/open_search_query.py
new file mode 100644
index 00000000000..6c0f142163e
--- /dev/null
+++ b/jenkins/scripts/open_search_query.py
@@ -0,0 +1,83 @@
+import argparse
+import json
+import os
+import sys
+
+import requests
+
+OPEN_SEARCH_QUERY_URL = (
+    "http://gpuwa.nvidia.com/opensearch/df-swdl-trtllm-infra-ci-prod-test_info-*/_search"
+)
+headers = {"Content-Type": "application/json", "Accept-Charset": "UTF-8"}
+
+
+def queryJobEvents(commitID="", stageName="", onlySuccess=True):
+    mustConditions = []
+    if commitID:
+        mustConditions.append({"term": {"s_trigger_mr_commit": commitID}})
+    if stageName:
+        mustConditions.append({"term": {"s_stage_name": stageName}})
+    if onlySuccess:
+        mustConditions.append({"term": {"s_status": "PASSED"}})
+
+    all_results = []
+    page_size = 1000
+    from_index = 0
+
+    while True:
+        requestBody = {
+            "query": {"bool": {"must": mustConditions}},
+            "_source": [
+                "s_job_name",
+                "s_status",
+                "s_build_id",
+                "s_turtle_name",
+                "s_test_name",
+                "s_gpu_type",
+            ],
+            "size": page_size,
+            "from": from_index,
+            "sort": [{"_id": "asc"}],
+        }
+
+        formattedRequestBody = json.dumps(requestBody)
+        response = requests.post(OPEN_SEARCH_QUERY_URL, headers=headers, data=formattedRequestBody)
+        data = response.json()
+
+        hits = data["hits"]["hits"]
+        if not hits:
+            break
+
+        all_results.extend(hits)
+        from_index += page_size
+
+        print(f"Fetched {len(all_results)} records...")
+
+    return all_results
+
+
+def writeTestListToFile(testList, fileName):
+    os.makedirs(os.path.dirname(fileName) or ".", exist_ok=True)
+
+    with open(fileName, "w") as f:
+        for test in testList:
+            f.write(test + "\n")
+
+
+def getPassedTestList(commitID, stageName, outputFile):
+    hits = queryJobEvents(commitID=commitID, stageName=stageName, onlySuccess=True)
+    testList = []
+    for hit in hits:
+        testList.append(hit["_source"]["s_turtle_name"])
+    writeTestListToFile(testList, outputFile)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--commit-id", required=True, help="Commit ID")
+    parser.add_argument("--stage-name", required=True, help="Stage Name")
+    parser.add_argument("--output-file", required=True, help="Output File")
+    args = parser.parse_args(sys.argv[1:])
+    getPassedTestList(
+        commitID=args.commit_id, stageName=args.stage_name, outputFile=args.output_file
+    )
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index c1d501b0143..54198643e30 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -2126,6 +2126,7 @@ def test_nvfp4_multi_gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
                               attention_dp, enable_lm_head_tp_in_adp, cuda_graph,
                               overlap_scheduler, max_batch_size, moe_backend):
+        pytest.fail("test reuse")
         if moe_backend == "TRTLLM" and (get_sm_version() == 120
                                         or get_sm_version() == 121):
             pytest.skip(
diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py
index 71c396ea528..9ac938bc1c8 100644
--- a/tests/integration/defs/test_e2e.py
+++ b/tests/integration/defs/test_e2e.py
@@ -1661,6 +1661,7 @@ def test_openai_responses(llm_root, llm_venv):
 
 
 def test_openai_prometheus(llm_root, llm_venv):
+    pytest.fail("test reuse")
     test_root = unittest_path() / "llmapi" / "apps"
     llm_venv.run_cmd(
         ["-m", "pytest",
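
Notes

The reuse flow above lands in two halves: jenkins/scripts/open_search_query.py pulls the names of tests that already passed for the triggering MR commit from OpenSearch, and reusePassedTestResults() in L0_Test.groovy appends any of those names that occur in the current test list to waives.txt as SKIP entries, so pytest skips them instead of re-running them. Below is a minimal Python sketch of the matching half; build_reuse_waives is a hypothetical helper and the test names are illustrative, only the substring match and the waive-line format come from the diff.

# Sketch of the matching logic in reusePassedTestResults() (L0_Test.groovy).
# Assumption: a rendered test-DB line contains the passed test's name as a
# substring (it may carry extra markers), hence the `in` check and `break`.


def build_reuse_waives(test_db_lines, passed_tests):
    """Return waives.txt lines for tests that passed in a previous run."""
    passed = [p.strip() for p in passed_tests if p.strip()]
    reused = []
    for raw_line in test_db_lines:
        line = raw_line.strip()
        if not line:
            continue
        for test in passed:
            if test in line:
                reused.append(test)
                break
    # Same waive-line format the Groovy code appends:
    return [f"{t} SKIP (Reused from previous pipeline)" for t in reused]


if __name__ == "__main__":
    test_db = [
        "test_e2e.py::test_openai_prometheus",  # illustrative names
        "accuracy/test_llm_api_pytorch.py::test_nvfp4_multi_gpus",
    ]
    passed = ["test_e2e.py::test_openai_prometheus"]
    print("\n".join(build_reuse_waives(test_db, passed)))
    # -> test_e2e.py::test_openai_prometheus SKIP (Reused from previous pipeline)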
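
The query half can be exercised outside Jenkins the same way the pipeline invokes it; the commit ID, stage name, and output path below are placeholders, not values from this MR (in CI they come from env.gitlabCommit and the Jenkins stage name).

# Hypothetical local run; all argument values are placeholders.
import subprocess

subprocess.run(
    [
        "python3", "jenkins/scripts/open_search_query.py",
        "--commit-id", "0123abcd",
        "--stage-name", "DGX_H100-4_GPUs-PyTorch-1",
        "--output-file", "/tmp/passed_test_list.txt",
    ],
    check=True,
)
# /tmp/passed_test_list.txt then holds one passed test name (s_turtle_name)
# per line, which reusePassedTestResults() matches against the test-DB list.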