diff --git a/easybuild/easyblocks/p/pytorch.py b/easybuild/easyblocks/p/pytorch.py index 9a7637fed18..97571fb5123 100644 --- a/easybuild/easyblocks/p/pytorch.py +++ b/easybuild/easyblocks/p/pytorch.py @@ -32,6 +32,7 @@ import re import tempfile import easybuild.tools.environment as env +from collections import namedtuple from distutils.version import LooseVersion from easybuild.easyblocks.generic.pythonpackage import PythonPackage from easybuild.framework.easyconfig import CUSTOM @@ -42,6 +43,94 @@ from easybuild.tools.systemtools import POWER, get_cpu_architecture +def get_count_for_pattern(regex, text): + """Match the regexp containing a single group and return the integer value of the matched group. + Return zero if no or more than 1 match was found and warn for the latter case + """ + match = re.findall(regex, text) + if len(match) == 1: + return int(match[0]) + elif len(match) > 1: + # Shouldn't happen, but means something went wrong with the regular expressions. + # Throw warning, as the build might be fine, no need to error on this. + warn_msg = "Error in counting the number of test failures in the output of the PyTorch test suite.\n" + warn_msg += "Please check the EasyBuild log to verify the number of failures (if any) was acceptable." + print_warning(warn_msg) + return 0 + + +def extract_failed_tests_info(tests_out): + """ + Extract information about failed tests from output produced by PyTorch tests + """ + + # Create clear summary report + failure_report = "" + failure_cnt = 0 + error_cnt = 0 + failed_test_suites = [] + + # Look for patterns like: + # Ran 219 tests in 67.325s + # + # FAILED (errors=10, skipped=190, expected failures=6) + # test_fx failed! + regex = (r"^Ran (?P<test_cnt>[0-9]+) tests.*$\n\n" + r"FAILED \((?P<failure_summary>.*)\)$\n" + r"(?:^(?:(?!failed!).)*$\n)*" + r"(?P<failed_test_suite_name>.*) failed!(?: Received signal: \w+)?\s*$") + + for match in re.finditer(regex, tests_out, re.M): + # E.g. 
'failures=3, errors=10, skipped=190, expected failures=6' + failure_summary, test_cnt, test_suite = match.group('failure_summary', 'test_cnt', 'failed_test_suite_name') + failure_report += test_suite + ' (' + test_cnt + " total tests, " + failure_summary + ')\n' + failure_cnt += get_count_for_pattern(r"(?<!expected )failures=([0-9]+)", failure_summary) + error_cnt += get_count_for_pattern(r"errors=([0-9]+)", failure_summary) + failed_test_suites.append(test_suite) + + # Look for patterns like: + # ============= 2 failed, 128 passed, 2 skipped, 2 warnings in 8.66s ============= + # distributions/test_constraints failed! + regex = r"^=+ (?P<failure_summary>.*) in [0-9]+\.*[0-9]*[a-zA-Z]* =+$\n(?P<failed_test_suite_name>.*) failed!$" + + for match in re.finditer(regex, tests_out, re.M): + # E.g. '2 failed, 128 passed, 2 skipped, 2 warnings' + failure_summary, test_suite = match.group('failure_summary', 'failed_test_suite_name') + failure_report += test_suite + ' ' + failure_summary + '\n' + failure_cnt += get_count_for_pattern(r"([0-9]+) failed", failure_summary) + error_cnt += get_count_for_pattern(r"([0-9]+) error", failure_summary) + failed_test_suites.append(test_suite) + + # Look for patterns like: + # AssertionError: 4 unit test(s) failed: + # ... + # distributed/test_c10d_gloo failed! + regex = (r"^AssertionError: (?P<failed_test_cnt>[0-9]+) unit test\(s\) failed:$\n" + r"(?:^(?:(?!failed!).)*$\n)*" + r"(?P<failed_test_group_name>.*) failed!$") + for match in re.finditer(regex, tests_out, re.M): + test_group, failed_test_cnt = match.group('failed_test_group_name', 'failed_test_cnt') + failure_report += test_group + ' (' + failed_test_cnt + " failed tests)\n" + failure_cnt += int(failed_test_cnt) + failed_test_suites.append(test_group) + + # Look for patterns like: + # Running test_jit_cuda_fuser ... [2023-01-12 04:04:08.949222] + # ... + # AttributeError: 'NoneType' object has no attribute 'split' + # test_jit_cuda_fuser failed! 
+ regex = (r"^Running (?P<test_name>test_[^\s]+) .*\n" + r"(?:^(?:(?!(Ran [0-9]+|failed!)).)*$\n)*" + r"\1 failed!$") + for match in re.finditer(regex, tests_out, re.M): + test_name = match.group('test_name') + failure_report += test_name + " (unknown failed test count)\n" + failure_cnt += 1 + failed_test_suites.append(test_name) + + TestsInfo = namedtuple('TestsInfo', ('failure_report', 'failure_cnt', 'error_cnt', 'failed_test_suites')) + return TestsInfo(failure_report, failure_cnt, error_cnt, failed_test_suites) + + class EB_PyTorch(PythonPackage): """Support for building/installing PyTorch.""" @@ -268,65 +357,11 @@ def test_step(self): tests_out, tests_ec = super(EB_PyTorch, self).test_step(return_output_ec=True) - def get_count_for_pattern(regex, text): - """Match the regexp containing a single group and return the integer value of the matched group. - Return zero if no or more than 1 match was found and warn for the latter case - """ - match = re.findall(regex, text) - if len(match) == 1: - return int(match[0]) - elif len(match) > 1: - # Shouldn't happen, but means something went wrong with the regular expressions. - # Throw warning, as the build might be fine, no need to error on this. - warn_msg = "Error in counting the number of test failures in the output of the PyTorch test suite.\n" - warn_msg += "Please check the EasyBuild log to verify the number of failures (if any) was acceptable." - print_warning(warn_msg) - return 0 - - # Create clear summary report - failure_report = "" - failure_cnt = 0 - error_cnt = 0 - failed_test_suites = [] - - # Grep for patterns like: - # Ran 219 tests in 67.325s - # - # FAILED (errors=10, skipped=190, expected failures=6) - # test_fx failed! - regex = (r"^Ran (?P<test_cnt>[0-9]+) tests.*$\n\n" - r"FAILED \((?P<failure_summary>.*)\)$\n" - r"(?:^(?:(?!failed!).)*$\n)*" - r"(?P<failed_test_suite_name>.*) failed!(?: Received signal: \w+)?\s*$") - - for m in re.finditer(regex, tests_out, re.M): - # E.g. 
'failures=3, errors=10, skipped=190, expected failures=6' - failure_summary = m.group('failure_summary') - total, test_suite = m.group('test_cnt', 'failed_test_suite_name') - failure_report += "{test_suite} ({total} total tests, {failure_summary})\n".format( - test_suite=test_suite, total=total, failure_summary=failure_summary - ) - failure_cnt += get_count_for_pattern(r"(?<!expected )failures=([0-9]+)", failure_summary) - error_cnt += get_count_for_pattern(r"errors=([0-9]+)", failure_summary) - failed_test_suites.append(test_suite) - - # Grep for patterns like: - # ============= 2 failed, 128 passed, 2 skipped, 2 warnings in 8.66s ============= - # distributions/test_constraints failed! - regex = r"^=+ (?P<failure_summary>.*) in [0-9]+\.*[0-9]*[a-zA-Z]* =+$\n(?P<failed_test_suite_name>.*) failed!$" - - for m in re.finditer(regex, tests_out, re.M): - # E.g. '2 failed, 128 passed, 2 skipped, 2 warnings' - failure_summary = m.group('failure_summary') - test_suite = m.group('failed_test_suite_name') - failure_report += "{test_suite} ({failure_summary})\n".format( - test_suite=test_suite, failure_summary=failure_summary - ) - failure_cnt += get_count_for_pattern(r"([0-9]+) failed", failure_summary) - error_cnt += get_count_for_pattern(r"([0-9]+) error", failure_summary) - failed_test_suites.append(test_suite) + failed_tests_info = extract_failed_tests_info(tests_out) + failure_report = failed_tests_info.failure_report # Make the names unique and sorted - failed_test_suites = sorted(set(failed_test_suites)) + failed_test_suites = sorted(set(failed_tests_info.failed_test_suites)) # Gather all failed tests suites in case we missed any (e.g. when it exited due to syntax errors) # Also unique and sorted to be able to compare the lists below all_failed_test_suites = sorted(set( @@ -341,17 +376,17 @@ def get_count_for_pattern(regex, text): failure_report += '\n' + failure_report_save # Calculate total number of unsuccesful and total tests - failed_test_cnt = failure_cnt + error_cnt + failed_test_cnt = failed_tests_info.failure_cnt + failed_tests_info.error_cnt test_cnt = sum(int(hit) for hit in re.findall(r"^Ran (?P<test_cnt>[0-9]+) tests in", tests_out, re.M)) + max_failed_tests = self.cfg['max_failed_tests'] + self.log.info("%d unsuccessful tests (out of %d), - max. 
failed tests set to %d", + failed_test_cnt, test_cnt, max_failed_tests) if failed_test_cnt > 0: - max_failed_tests = self.cfg['max_failed_tests'] - - failure_or_failures = 'failure' if failure_cnt == 1 else 'failures' - error_or_errors = 'error' if error_cnt == 1 else 'errors' - msg = "%d test %s, %d test %s (out of %d):\n" % ( - failure_cnt, failure_or_failures, error_cnt, error_or_errors, test_cnt - ) + failure_or_failures = 'failure' if failed_tests_info.failure_cnt == 1 else 'failures' + msg = "%d test %s, " % (failed_tests_info.failure_cnt, failure_or_failures) + error_or_errors = 'error' if failed_tests_info.error_cnt == 1 else 'errors' + msg += "%d test %s (out of %d):\n" % (failed_tests_info.error_cnt, error_or_errors, test_cnt) msg += failure_report # If no tests are supposed to fail or some failed for which we were not able to count errors fail now diff --git a/test/easyblocks/easyblock_specific.py b/test/easyblocks/easyblock_specific.py index 4e4d10b4d25..bfbbe1aad94 100644 --- a/test/easyblocks/easyblock_specific.py +++ b/test/easyblocks/easyblock_specific.py @@ -40,6 +40,7 @@ from easybuild.base.testing import TestCase from easybuild.easyblocks.generic.cmakemake import det_cmake_version from easybuild.easyblocks.generic.toolchain import Toolchain +from easybuild.easyblocks import pytorch from easybuild.framework.easyblock import EasyBlock, get_easyblock_instance from easybuild.framework.easyconfig.easyconfig import process_easyconfig from easybuild.tools import config @@ -51,6 +52,97 @@ from easybuild.tools.options import set_tmpdir from easybuild.tools.py2vs3 import StringIO +PYTORCH_TESTS_OUTPUT = """ +... +AssertionError: Expected zero exit code but got -6 for pid: 2006681 + +---------------------------------------------------------------------- +Ran 2 tests in 6.576s + +FAILED (failures=2) +distributed/fsdp/test_fsdp_input failed! +Running distributed/fsdp/test_fsdp_multiple_forward ... 
[2023-01-12 05:46:45.746098] + +RuntimeError: Process 0 terminated or timed out after 610.0615825653076 seconds + +---------------------------------------------------------------------- +Ran 1 test in 610.744s + +FAILED (errors=1) +Test exited with non-zero exitcode 1. Command to reproduce: /software/Python/3.9.6-GCCcore-11.2.0/bin/python distributed/test_c10d_gloo.py -v DistributedDataParallelTest.test_ddp_comm_hook_register_just_once + +RuntimeError: Process 0 terminated or timed out after 610.0726096630096 seconds + +---------------------------------------------------------------------- +Ran 1 test in 610.729s + +FAILED (errors=1) +Test exited with non-zero exitcode 1. Command to reproduce: /software/Python/3.9.6-GCCcore-11.2.0/bin/python distributed/test_c10d_gloo.py -v DistributedDataParallelTest.test_ddp_invalid_comm_hook_init +test_ddp_invalid_comm_hook_return_type (__main__.DistributedDataParallelTest) + +AssertionError: 4 unit test(s) failed: + DistributedDataParallelTest.test_ddp_comm_hook_register_just_once + DistributedDataParallelTest.test_ddp_invalid_comm_hook_init + ProcessGroupGlooTest.test_round_robin + ProcessGroupGlooTest.test_round_robin_create_destroy +distributed/test_c10d_gloo failed! +Running distributed/test_c10d_nccl ... [2023-01-12 07:43:41.085197] + +ValueError: For each axis slice, the sum of the observed frequencies must agree with the sum of the expected frequencies to a relative tolerance of 1e-08, but the percent differences are: +4.535600093557479e-05 + +---------------------------------------------------------------------- +Ran 216 tests in 22.396s + +FAILED (errors=4) +distributions/test_distributions failed! + +Running test_autograd ... [2023-01-13 04:19:25.587981] +Executing ['/software/Python/3.9.6-GCCcore-11.2.0/bin/python', 'test_autograd.py', '-v'] ... [2023-01-13 04:19:25.588074] +... +test_autograd_views_codegen (__main__.TestAutograd) ... ok +... 
+====================================================================== +FAIL: test_thread_shutdown (__main__.TestAutograd) +---------------------------------------------------------------------- +Traceback (most recent call last): + File "/tmp/vsc40023/easybuild_build/PyTorch/1.11.0/foss-2021b/pytorch-v1.11.0/test/test_autograd.py", line 4220, in test_thread_shutdown + self.assertRegex(s, "PYTORCH_API_USAGE torch.autograd.thread_shutdown") +AssertionError: Regex didn't match: 'PYTORCH_API_USAGE torch.autograd.thread_shutdown' not found in 'PYTORCH_API_USAGE torch.python.import\nPYTORCH_API_USAGE c10d.python.import\nPYTORCH_API_USAGE tensor.create\n' +---------------------------------------------------------------------- +Ran 464 tests in 18.443s + +FAILED (failures=1, skipped=52, expected failures=1) +test_autograd failed! +Running test_binary_ufuncs ... [2023-01-12 09:02:45.049490] +... + +Running test_jit_cuda_fuser ... [2023-01-12 04:04:08.949222] +Executing ['/software/Python/3.9.6-GCCcore-11.2.0/bin/python', 'test_jit_cuda_fuser.py', '-v'] ... [2023-01-12 04:04:08.949319] +CUDA not available, skipping tests +monkeytype is not installed. Skipping tests for Profile-Directed Typing +Traceback (most recent call last): + File "/tmp/easybuild_build/PyTorch/1.11.0/foss-2021b/pytorch-v1.11.0/test/test_jit_cuda_fuser.py", line 25, in <module> + CUDA_MAJOR, CUDA_MINOR = (int(x) for x in torch.version.cuda.split('.')) +AttributeError: 'NoneType' object has no attribute 'split' +test_jit_cuda_fuser failed! +... +Running distributions/test_constraints ... [2023-01-12 09:05:15.013470] +SKIPPED [2] distributions/test_constraints.py:83: `biject_to` not implemented. +FAILED distributions/test_constraints.py::test_constraint[True-constraint_fn5-False-value5] +FAILED distributions/test_constraints.py::test_constraint[True-constraint_fn7-True-value7] +============= 2 failed, 128 passed, 2 skipped, 2 warnings in 8.66s ============= +distributions/test_constraints failed! 
+ +Running distributions/rpc/test_tensorpipe_agent ... [2023-01-12 09:06:37.093571] +... +Ran 123 tests in 7.549s + +FAILED (errors=2, skipped=2) +... +test_fx failed! Received signal: SIGSEGV +""" # noqa + class EasyBlockSpecificTest(TestCase): """ Baseclass for easyblock testcases """ @@ -265,6 +357,39 @@ def test_det_cmake_version(self): """)) self.assertEqual(det_cmake_version(), '1.2.3-rc4') + def test_pytorch_extract_failed_tests_info(self): + """ + Test extract_failed_tests_info function from PyTorch easyblock. + """ + res = pytorch.extract_failed_tests_info(PYTORCH_TESTS_OUTPUT) + self.assertEqual(len(res), 4) + + expected_failure_report = '\n'.join([ + "distributed/fsdp/test_fsdp_input (2 total tests, failures=2)", + "distributions/test_distributions (216 total tests, errors=4)", + "test_autograd (464 total tests, failures=1, skipped=52, expected failures=1)", + "test_fx (123 total tests, errors=2, skipped=2)", + "distributions/test_constraints 2 failed, 128 passed, 2 skipped, 2 warnings", + "distributed/test_c10d_gloo (4 failed tests)", + "test_jit_cuda_fuser (unknown failed test count)", + ]) + self.assertEqual(res.failure_report.strip(), expected_failure_report) + # test failures + self.assertEqual(res.failure_cnt, 10) + # test errors + self.assertEqual(res.error_cnt, 6) + + expected_failed_test_suites = [ + 'distributed/fsdp/test_fsdp_input', + 'distributions/test_distributions', + 'test_autograd', + 'test_fx', + 'distributions/test_constraints', + 'distributed/test_c10d_gloo', + 'test_jit_cuda_fuser', + ] + self.assertEqual(res.failed_test_suites, expected_failed_test_suites) + def suite(): """Return all easyblock-specific tests."""