diff --git a/ci/flaky_test/process_xml.py b/ci/flaky_test/process_xml.py index 49eb537b95b6f..e226ba626f618 100755 --- a/ci/flaky_test/process_xml.py +++ b/ci/flaky_test/process_xml.py @@ -7,50 +7,150 @@ import sys import ssl +well_known_timeouts = [60, 300, 900, 3600] +section_delimiter = "---------------------------------------------------------------------------------------------------\n" -# Check if a test suite reports failure. -def checkTestStatus(file): - tree = ET.parse(file) +# Returns a boolean indicating if a test passed. +def didTestPass(file): + tree = ET.parse(file) root = tree.getroot() - for testsuite in root: - if (testsuite.attrib['failures'] != '0'): + if testsuite.attrib['failures'] != '0' or testsuite.attrib['errors'] != '0': return False return True -def parseXML(file, visited): - log_file = file.split('.') - log_file_path = "" +# Returns a pretty-printed string of a test case failure. +def printTestCaseFailure(testcase, testsuite, failure_msg, log_path): + ret = "Test flake details:\n" + ret += "- Test suite:\t{}\n".format(testsuite) + ret += "- Test case:\t{}\n".format(testcase) + ret += "- Log path:\t{}\n".format(log_path) + ret += "- Details:\n" + for line in failure_msg.splitlines(): + ret += "\t" + line + "\n" + ret += section_delimiter + "\n" + return ret + + +# Returns a pretty-printed string of a test suite error, such as an exception or a timeout. +def printTestSuiteError(testsuite, testcase, log_path, duration, time, error_msg, output): + ret = "Test flake details:\n" + ret += "- Test suite:\t{}\n".format(testsuite) + ret += "- Test case:\t{}\n".format(testcase) + ret += "- Log path:\t{}\n".format(log_path) + + errno_string = os.strerror(int(error_msg.split(' ')[-1])) + ret += "- Error:\t{} ({})\n".format(error_msg.capitalize(), errno_string) + + if duration == time and duration in well_known_timeouts: + ret += "- Note:\t\tThis error is likely a timeout (test duration == {}, a well known timeout value).\n".format( + duration) + + # If there's a call stack, print it. Otherwise, attempt to print the most recent, + # relevant lines. + output = output.rstrip('\n') + traceback_index = output.rfind('Traceback (most recent call last)') + + if traceback_index != -1: + ret += "- Relevant snippet:\n" + for line in output[traceback_index:].splitlines(): + ret += "\t" + line + "\n" + else: + # No traceback found. Attempt to print the most recent snippet from the last test case. + max_snippet_size = 20 + last_testcase_index = output.rfind('[ RUN ]') + output_lines = output[last_testcase_index:].splitlines() + num_lines_to_print = min(len(output_lines), max_snippet_size) + + ret += "- Last {} line(s):\n".format(num_lines_to_print) + for line in output_lines[-num_lines_to_print:]: + ret += "\t" + line + "\n" + + ret += "\n" + section_delimiter + "\n" + + return ret + + +# Parses a test suite error, such as an exception or a timeout, and returns a pretty-printed +# string of the error. This function is dependent on the structure of the XML and the contents +# of the test log and will need to be adjusted should those change. +def parseAndPrintTestSuiteError(testsuite, log_path): + error_msg = "" + test_duration = 0 + test_time = 0 + last_testsuite = testsuite.attrib['name'] + last_testcase = testsuite.attrib['name'] + test_output = "" + + # Test suites with errors are expected to have 2 children elements: a generic testcase tag + # with the runtimes and a child containing the error message, and another with the entire + # output of the test suite. + for testcase in testsuite: + if testcase.tag == "testcase": + test_duration = int(testcase.attrib['duration']) + test_time = int(testcase.attrib['time']) + + for child in testcase: + if child.tag == "error": + error_msg = child.attrib['message'] + elif testcase.tag == "system-out": + test_output = testcase.text + + # For test suites with errors like this one, the test suite and test case names were not + # parsed into the XML metadata. Here we attempt to extract those names from the log by + # finding the last test case to run. The expected format of that is: + # "[ RUN ] /.\n". + last_test_fullname = test_output.split('[ RUN ]')[-1].splitlines()[0] + last_testsuite = last_test_fullname.split('/')[1].split('.')[0] + last_testcase = last_test_fullname.split('.')[1] + + if error_msg != "": + return printTestSuiteError(last_testsuite, last_testcase, log_path, test_duration, test_time, + error_msg, test_output) + + return "" + +# Parses a failed test's XML, adds any flaky tests found to the visited set, and returns a +# well-formatted string describing all failures and errors. +def parseXML(file, visited): # This is dependent on the fact that log files reside in the same directory # as their corresponding xml files. + log_file = file.split('.') + log_file_path = "" for token in log_file[:-1]: log_file_path += token log_file_path += ".log" tree = ET.parse(file) - root = tree.getroot() - ret = "" # This loop is dependent on the structure of xml file emitted for test runs. - # Should this change in the future, appropriate adjustments need to made. + # Should this change in the future, appropriate adjustments need to be made. + ret = "" for testsuite in root: - if (testsuite.attrib['failures'] != '0'): + if testsuite.attrib['failures'] != '0': for testcase in testsuite: for failure_msg in testcase: if (testcase.attrib['name'], testsuite.attrib['name']) not in visited: - ret += "-----------------------Flaky Testcase: {} in TestSuite: {} -----------------------\n".format( - testcase.attrib['name'], testsuite.attrib['name']) - ret += log_file_path + "\n" + failure_msg.text + "\n" + ret += printTestCaseFailure(testcase.attrib['name'], testsuite.attrib['name'], + failure_msg.text, log_file_path) visited.add((testcase.attrib['name'], testsuite.attrib['name'])) + elif testsuite.attrib['errors'] != '0': + # If an unexpected error occurred, such as an exception or a timeout, the test suite was + # likely not parsed into XML properly, including the suite's name and the test case that + # caused the error. More parsing is needed to extract details about the error. + if (testsuite.attrib['name'], testsuite.attrib['name']) not in visited: + ret += parseAndPrintTestSuiteError(testsuite, log_file_path) + visited.add((testsuite.attrib['name'], testsuite.attrib['name'])) + return ret # The following function links the filepath of 'test.xml' (the result for the last attempt) with -# that of its 'attmpt_n.xml' file and stores it in a dictionary for easy lookup. +# that of its 'attempt_n.xml' file and stores it in a dictionary for easy lookup. def processFindOutput(f, problematic_tests): for line in f: lineList = line.split('/') @@ -63,28 +163,51 @@ def processFindOutput(f, problematic_tests): problematic_tests[filepath] = line.strip('\n') -# Prints out helpful information on the run using Git. -# Should Git changes the output of the used commands in the future, +# Returns helpful information on the run using Git. +# Should Git change the output of the used commands in the future, # this will likely need adjustments as well. def getGitInfo(CI_TARGET): ret = "" - os.system('sh -c "git remote -v > ${TMP_OUTPUT_PROCESS_XML}"') - os.system('sh -c "git describe --all >> ${TMP_OUTPUT_PROCESS_XML}"') - os.system('sh -c "git show >> ${TMP_OUTPUT_PROCESS_XML}"') - # Fetching the URL from predefined env variable - envoy_link = os.environ["REPO_URI"] - - with open(os.environ['TMP_OUTPUT_PROCESS_XML'], 'r+', encoding='utf-8') as f: - for line in [next(f) for x in range(6)]: - if line.split('/')[0] == 'remotes': - for token in line.split('/')[1:-1]: - envoy_link += '/' + token - ret += line - - ret += "link for additional content: " + envoy_link + " \n" - ret += "azure build URI: " + os.environ["BUILD_URI"] + " \n" + if CI_TARGET != "": - ret += "In " + CI_TARGET + " build\n" + ret += "Target:\t\t{}\n".format(CI_TARGET) + + if os.getenv('SYSTEM_STAGEDISPLAYNAME') and os.getenv('SYSTEM_STAGEJOBNAME'): + ret += "Stage:\t\t{} {}\n".format(os.environ['SYSTEM_STAGEDISPLAYNAME'], + os.environ['SYSTEM_STAGEJOBNAME']) + + if os.getenv('BUILD_REASON') == "PullRequest" and os.getenv('SYSTEM_PULLREQUEST_PULLREQUESTID'): + ret += "Pull request:\t{}/pull/{}\n".format(os.environ['REPO_URI'], + os.environ['SYSTEM_PULLREQUEST_PULLREQUESTID']) + elif os.getenv('BUILD_REASON'): + ret += "Build reason:\t{}\n".format(os.environ['BUILD_REASON']) + + output = subprocess.check_output(['git', 'log', '--format=%H', '-n', '1'], encoding='utf-8') + ret += "Commmit:\t{}/commit/{}".format(os.environ['REPO_URI'], output) + + build_id = os.environ['BUILD_URI'].split('/')[-1] + ret += "CI results:\thttps://dev.azure.com/cncf/envoy/_build/results?buildId=" + build_id + "\n" + + ret += "\n" + + output = subprocess.check_output(['git', 'remote', 'get-url', 'origin'], encoding='utf-8') + ret += "Origin:\t\t{}".format(output.replace('.git', '')) + + output = subprocess.check_output(['git', 'remote', 'get-url', 'upstream'], encoding='utf-8') + ret += "Upstream:\t{}".format(output.replace('.git', '')) + + output = subprocess.check_output(['git', 'describe', '--all'], encoding='utf-8') + ret += "Latest ref:\t{}".format(output) + + ret += "\n" + + ret += "Last commit:\n" + output = subprocess.check_output(['git', 'show', '-s'], encoding='utf-8') + for line in output.splitlines(): + ret += "\t" + line + "\n" + + ret += section_delimiter + return ret @@ -92,17 +215,14 @@ def getGitInfo(CI_TARGET): CI_TARGET = "" if len(sys.argv) == 2: CI_TARGET = sys.argv[1] - output_msg = "``` \n" - has_flaky_test = False - if os.getenv("TEST_TMPDIR") and os.getenv("REPO_URI") and os.getenv("BUILD_URI"): + if os.getenv('TEST_TMPDIR') and os.getenv('REPO_URI') and os.getenv("BUILD_URI"): os.environ["TMP_OUTPUT_PROCESS_XML"] = os.getenv("TEST_TMPDIR") + "/tmp_output_process_xml.txt" else: - print("set the env variables first") + print("Set the env variables TEST_TMPDIR and REPO_URI first.") sys.exit(0) - output_msg += getGitInfo(CI_TARGET) - find_dir = '${TEST_TMPDIR}/**/**/**/**/bazel-testlogs/' + find_dir = "{}/**/**/**/**/bazel-testlogs/".format(os.environ['TEST_TMPDIR']).replace('\\', '/') if CI_TARGET == "MacOS": find_dir = '${TEST_TMPDIR}/' os.system( @@ -110,25 +230,27 @@ def getGitInfo(CI_TARGET): # All output of find command should be either failed or flaky tests, as only then will # a test be rerun and have an 'attempt_n.xml' file. problematic_tests holds a lookup - # table between the last_attempt xml filepath and the failed previous attempt filepath. + # table between the most recent run's xml filepath and the original attempt's failed xml + # filepath. problematic_tests = {} with open(os.environ['TMP_OUTPUT_PROCESS_XML'], 'r+') as f: processFindOutput(f, problematic_tests) - # Needed to make sure no duplicate flaky tests are going to be reported. - visited = set() - # The logic here goes as follows: If there is a test suite that has run multiple times, # which produces attempt_*.xml files, it means that the end result of that test # is either flaky or failed. So if we find that the last run of the test succeeds # we know for sure that this is a flaky test. + has_flaky_test = False + failure_output = "" + flaky_tests_visited = set() for k in problematic_tests.keys(): - if checkTestStatus(k): + if didTestPass(k): has_flaky_test = True - output_msg += parseXML(problematic_tests[k], visited) - output_msg += "``` \n" + failure_output += parseXML(problematic_tests[k], flaky_tests_visited) if has_flaky_test: + output_msg = "``` \n" + getGitInfo(CI_TARGET) + "\n" + failure_output + "``` \n" + if os.getenv("SLACK_TOKEN"): SLACKTOKEN = os.environ["SLACK_TOKEN"] ssl_context = ssl.create_default_context() @@ -140,5 +262,7 @@ def getGitInfo(CI_TARGET): client.chat_postMessage(channel='test-flaky', text=output_msg, as_user="true") else: print(output_msg) + else: + print('No flaky tests found.\n') os.remove(os.environ["TMP_OUTPUT_PROCESS_XML"]) diff --git a/ci/run_envoy_docker.sh b/ci/run_envoy_docker.sh index c73222a24f587..34a85bb3586db 100755 --- a/ci/run_envoy_docker.sh +++ b/ci/run_envoy_docker.sh @@ -93,5 +93,8 @@ docker run --rm \ -e SLACK_TOKEN \ -e BUILD_URI\ -e REPO_URI \ + -e SYSTEM_STAGEDISPLAYNAME \ + -e SYSTEM_JOBDISPLAYNAME \ + -e SYSTEM_PULLREQUEST_PULLREQUESTID \ "${ENVOY_BUILD_IMAGE}" \ "${START_COMMAND[@]}"