Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
218 changes: 171 additions & 47 deletions ci/flaky_test/process_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,50 +7,150 @@
import sys
import ssl

well_known_timeouts = [60, 300, 900, 3600]
section_delimiter = "---------------------------------------------------------------------------------------------------\n"

# Check if a test suite reports failure.
def checkTestStatus(file):
tree = ET.parse(file)

# Returns a boolean indicating if a test passed.
def didTestPass(file):
tree = ET.parse(file)
root = tree.getroot()

for testsuite in root:
if (testsuite.attrib['failures'] != '0'):
if testsuite.attrib['failures'] != '0' or testsuite.attrib['errors'] != '0':
return False
return True


def parseXML(file, visited):
log_file = file.split('.')
log_file_path = ""
# Returns a pretty-printed string of a test case failure.
def printTestCaseFailure(testcase, testsuite, failure_msg, log_path):
ret = "Test flake details:\n"
ret += "- Test suite:\t{}\n".format(testsuite)
ret += "- Test case:\t{}\n".format(testcase)
ret += "- Log path:\t{}\n".format(log_path)
ret += "- Details:\n"
for line in failure_msg.splitlines():
ret += "\t" + line + "\n"
ret += section_delimiter + "\n"
return ret


# Returns a pretty-printed string of a test suite error, such as an exception or a timeout.
def printTestSuiteError(testsuite, testcase, log_path, duration, time, error_msg, output):
ret = "Test flake details:\n"
ret += "- Test suite:\t{}\n".format(testsuite)
ret += "- Test case:\t{}\n".format(testcase)
ret += "- Log path:\t{}\n".format(log_path)

errno_string = os.strerror(int(error_msg.split(' ')[-1]))
ret += "- Error:\t{} ({})\n".format(error_msg.capitalize(), errno_string)

if duration == time and duration in well_known_timeouts:
ret += "- Note:\t\tThis error is likely a timeout (test duration == {}, a well known timeout value).\n".format(
duration)

# If there's a call stack, print it. Otherwise, attempt to print the most recent,
# relevant lines.
output = output.rstrip('\n')
traceback_index = output.rfind('Traceback (most recent call last)')

if traceback_index != -1:
ret += "- Relevant snippet:\n"
for line in output[traceback_index:].splitlines():
ret += "\t" + line + "\n"
else:
# No traceback found. Attempt to print the most recent snippet from the last test case.
max_snippet_size = 20
last_testcase_index = output.rfind('[ RUN ]')
output_lines = output[last_testcase_index:].splitlines()
num_lines_to_print = min(len(output_lines), max_snippet_size)

ret += "- Last {} line(s):\n".format(num_lines_to_print)
for line in output_lines[-num_lines_to_print:]:
ret += "\t" + line + "\n"

ret += "\n" + section_delimiter + "\n"

return ret


# Parses a test suite error, such as an exception or a timeout, and returns a pretty-printed
# string of the error. This function is dependent on the structure of the XML and the contents
# of the test log and will need to be adjusted should those change.
def parseAndPrintTestSuiteError(testsuite, log_path):
error_msg = ""
test_duration = 0
test_time = 0
last_testsuite = testsuite.attrib['name']
last_testcase = testsuite.attrib['name']
test_output = ""

# Test suites with errors are expected to have 2 children elements: a generic testcase tag
# with the runtimes and a child containing the error message, and another with the entire
# output of the test suite.
for testcase in testsuite:
if testcase.tag == "testcase":
test_duration = int(testcase.attrib['duration'])
test_time = int(testcase.attrib['time'])

for child in testcase:
if child.tag == "error":
error_msg = child.attrib['message']
elif testcase.tag == "system-out":
test_output = testcase.text

# For test suites with errors like this one, the test suite and test case names were not
# parsed into the XML metadata. Here we attempt to extract those names from the log by
# finding the last test case to run. The expected format of that is:
# "[ RUN ] <TestParams>/<TestSuite>.<TestCase>\n".
last_test_fullname = test_output.split('[ RUN ]')[-1].splitlines()[0]
last_testsuite = last_test_fullname.split('/')[1].split('.')[0]
last_testcase = last_test_fullname.split('.')[1]

if error_msg != "":
return printTestSuiteError(last_testsuite, last_testcase, log_path, test_duration, test_time,
error_msg, test_output)

return ""


# Parses a failed test's XML, adds any flaky tests found to the visited set, and returns a
# well-formatted string describing all failures and errors.
def parseXML(file, visited):
# This is dependent on the fact that log files reside in the same directory
# as their corresponding xml files.
log_file = file.split('.')
log_file_path = ""
for token in log_file[:-1]:
log_file_path += token
log_file_path += ".log"

tree = ET.parse(file)

root = tree.getroot()
ret = ""

# This loop is dependent on the structure of xml file emitted for test runs.
# Should this change in the future, appropriate adjustments need to made.
# Should this change in the future, appropriate adjustments need to be made.
ret = ""
for testsuite in root:
if (testsuite.attrib['failures'] != '0'):
if testsuite.attrib['failures'] != '0':
for testcase in testsuite:
for failure_msg in testcase:
if (testcase.attrib['name'], testsuite.attrib['name']) not in visited:
ret += "-----------------------Flaky Testcase: {} in TestSuite: {} -----------------------\n".format(
testcase.attrib['name'], testsuite.attrib['name'])
ret += log_file_path + "\n" + failure_msg.text + "\n"
ret += printTestCaseFailure(testcase.attrib['name'], testsuite.attrib['name'],
failure_msg.text, log_file_path)
visited.add((testcase.attrib['name'], testsuite.attrib['name']))
elif testsuite.attrib['errors'] != '0':
# If an unexpected error occurred, such as an exception or a timeout, the test suite was
# likely not parsed into XML properly, including the suite's name and the test case that
# caused the error. More parsing is needed to extract details about the error.
if (testsuite.attrib['name'], testsuite.attrib['name']) not in visited:
ret += parseAndPrintTestSuiteError(testsuite, log_file_path)
visited.add((testsuite.attrib['name'], testsuite.attrib['name']))

return ret


# The following function links the filepath of 'test.xml' (the result for the last attempt) with
# that of its 'attmpt_n.xml' file and stores it in a dictionary for easy lookup.
# that of its 'attempt_n.xml' file and stores it in a dictionary for easy lookup.
def processFindOutput(f, problematic_tests):
for line in f:
lineList = line.split('/')
Expand All @@ -63,72 +163,94 @@ def processFindOutput(f, problematic_tests):
problematic_tests[filepath] = line.strip('\n')


# Prints out helpful information on the run using Git.
# Should Git changes the output of the used commands in the future,
# Returns helpful information on the run using Git.
# Should Git change the output of the used commands in the future,
# this will likely need adjustments as well.
def getGitInfo(CI_TARGET):
ret = ""
os.system('sh -c "git remote -v > ${TMP_OUTPUT_PROCESS_XML}"')
os.system('sh -c "git describe --all >> ${TMP_OUTPUT_PROCESS_XML}"')
os.system('sh -c "git show >> ${TMP_OUTPUT_PROCESS_XML}"')
# Fetching the URL from predefined env variable
envoy_link = os.environ["REPO_URI"]

with open(os.environ['TMP_OUTPUT_PROCESS_XML'], 'r+', encoding='utf-8') as f:
for line in [next(f) for x in range(6)]:
if line.split('/')[0] == 'remotes':
for token in line.split('/')[1:-1]:
envoy_link += '/' + token
ret += line

ret += "link for additional content: " + envoy_link + " \n"
ret += "azure build URI: " + os.environ["BUILD_URI"] + " \n"

if CI_TARGET != "":
ret += "In " + CI_TARGET + " build\n"
ret += "Target:\t\t{}\n".format(CI_TARGET)

if os.getenv('SYSTEM_STAGEDISPLAYNAME') and os.getenv('SYSTEM_STAGEJOBNAME'):
ret += "Stage:\t\t{} {}\n".format(os.environ['SYSTEM_STAGEDISPLAYNAME'],
os.environ['SYSTEM_STAGEJOBNAME'])

if os.getenv('BUILD_REASON') == "PullRequest" and os.getenv('SYSTEM_PULLREQUEST_PULLREQUESTID'):
ret += "Pull request:\t{}/pull/{}\n".format(os.environ['REPO_URI'],
os.environ['SYSTEM_PULLREQUEST_PULLREQUESTID'])
elif os.getenv('BUILD_REASON'):
ret += "Build reason:\t{}\n".format(os.environ['BUILD_REASON'])

output = subprocess.check_output(['git', 'log', '--format=%H', '-n', '1'], encoding='utf-8')
ret += "Commmit:\t{}/commit/{}".format(os.environ['REPO_URI'], output)

build_id = os.environ['BUILD_URI'].split('/')[-1]
ret += "CI results:\thttps://dev.azure.com/cncf/envoy/_build/results?buildId=" + build_id + "\n"

ret += "\n"

output = subprocess.check_output(['git', 'remote', 'get-url', 'origin'], encoding='utf-8')
ret += "Origin:\t\t{}".format(output.replace('.git', ''))

output = subprocess.check_output(['git', 'remote', 'get-url', 'upstream'], encoding='utf-8')
ret += "Upstream:\t{}".format(output.replace('.git', ''))

output = subprocess.check_output(['git', 'describe', '--all'], encoding='utf-8')
ret += "Latest ref:\t{}".format(output)

ret += "\n"

ret += "Last commit:\n"
output = subprocess.check_output(['git', 'show', '-s'], encoding='utf-8')
for line in output.splitlines():
ret += "\t" + line + "\n"

ret += section_delimiter

return ret


if __name__ == "__main__":
CI_TARGET = ""
if len(sys.argv) == 2:
CI_TARGET = sys.argv[1]
output_msg = "``` \n"
has_flaky_test = False

if os.getenv("TEST_TMPDIR") and os.getenv("REPO_URI") and os.getenv("BUILD_URI"):
if os.getenv('TEST_TMPDIR') and os.getenv('REPO_URI') and os.getenv("BUILD_URI"):
os.environ["TMP_OUTPUT_PROCESS_XML"] = os.getenv("TEST_TMPDIR") + "/tmp_output_process_xml.txt"
else:
print("set the env variables first")
print("Set the env variables TEST_TMPDIR and REPO_URI first.")
sys.exit(0)
output_msg += getGitInfo(CI_TARGET)

find_dir = '${TEST_TMPDIR}/**/**/**/**/bazel-testlogs/'
find_dir = "{}/**/**/**/**/bazel-testlogs/".format(os.environ['TEST_TMPDIR']).replace('\\', '/')
if CI_TARGET == "MacOS":
find_dir = '${TEST_TMPDIR}/'
os.system(
'sh -c "/usr/bin/find {} -name attempt_*.xml > ${{TMP_OUTPUT_PROCESS_XML}}"'.format(find_dir))

# All output of find command should be either failed or flaky tests, as only then will
# a test be rerun and have an 'attempt_n.xml' file. problematic_tests holds a lookup
# table between the last_attempt xml filepath and the failed previous attempt filepath.
# table between the most recent run's xml filepath and the original attempt's failed xml
# filepath.
problematic_tests = {}
with open(os.environ['TMP_OUTPUT_PROCESS_XML'], 'r+') as f:
processFindOutput(f, problematic_tests)

# Needed to make sure no duplicate flaky tests are going to be reported.
visited = set()

# The logic here goes as follows: If there is a test suite that has run multiple times,
# which produces attempt_*.xml files, it means that the end result of that test
# is either flaky or failed. So if we find that the last run of the test succeeds
# we know for sure that this is a flaky test.
has_flaky_test = False
failure_output = ""
flaky_tests_visited = set()
for k in problematic_tests.keys():
if checkTestStatus(k):
if didTestPass(k):
has_flaky_test = True
output_msg += parseXML(problematic_tests[k], visited)
output_msg += "``` \n"
failure_output += parseXML(problematic_tests[k], flaky_tests_visited)

if has_flaky_test:
output_msg = "``` \n" + getGitInfo(CI_TARGET) + "\n" + failure_output + "``` \n"

if os.getenv("SLACK_TOKEN"):
SLACKTOKEN = os.environ["SLACK_TOKEN"]
ssl_context = ssl.create_default_context()
Expand All @@ -140,5 +262,7 @@ def getGitInfo(CI_TARGET):
client.chat_postMessage(channel='test-flaky', text=output_msg, as_user="true")
else:
print(output_msg)
else:
print('No flaky tests found.\n')

os.remove(os.environ["TMP_OUTPUT_PROCESS_XML"])
3 changes: 3 additions & 0 deletions ci/run_envoy_docker.sh
Original file line number Diff line number Diff line change
Expand Up @@ -93,5 +93,8 @@ docker run --rm \
-e SLACK_TOKEN \
-e BUILD_URI\
-e REPO_URI \
-e SYSTEM_STAGEDISPLAYNAME \
-e SYSTEM_JOBDISPLAYNAME \
-e SYSTEM_PULLREQUEST_PULLREQUESTID \
"${ENVOY_BUILD_IMAGE}" \
"${START_COMMAND[@]}"