From b062a048ceadcd3c8073ebc17a6af51009dea437 Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Tue, 16 Jun 2026 09:29:28 +0700 Subject: [PATCH 1/2] fix: fail early in CI on CUDA errors --- tests/conftest.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tests/conftest.py b/tests/conftest.py index 16a01f8aa3..ff8cf54fd4 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -53,6 +53,35 @@ def _is_torch_fx_available(): ) +# A device-side assert / illegal access poisons the process-wide CUDA context, so +# every later GPU test errors at setup. Abort the session instead of cascading. +_CUDA_FATAL_MARKERS = ( + "device-side assert triggered", + "an illegal memory access was encountered", + "misaligned address", +) +_cuda_context_poisoned = False + + +@pytest.hookimpl(hookwrapper=True) +def pytest_runtest_makereport(item, call): # pylint: disable=unused-argument + outcome = yield + report = outcome.get_result() + global _cuda_context_poisoned # pylint: disable=global-statement + if report.failed and call.excinfo is not None: + if any(marker in str(call.excinfo.value) for marker in _CUDA_FATAL_MARKERS): + _cuda_context_poisoned = True + + +def pytest_runtest_setup(item): # pylint: disable=unused-argument + if _cuda_context_poisoned: + pytest.exit( + "CUDA context corrupted by an earlier test (device-side assert); " + "aborting to avoid cascading setup errors. 
Re-run the job.", + returncode=1, + ) + + def retry_on_request_exceptions(max_retries=3, delay=1): def decorator(func): @functools.wraps(func) From eac34883c8fa3640515c7d4c179f9d76d8c2577a Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Tue, 16 Jun 2026 13:52:28 +0700 Subject: [PATCH 2/2] fix: switch to clean abort --- tests/conftest.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index ff8cf54fd4..5e3a9bd02c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -73,13 +73,13 @@ def pytest_runtest_makereport(item, call): # pylint: disable=unused-argument _cuda_context_poisoned = True -def pytest_runtest_setup(item): # pylint: disable=unused-argument +def pytest_runtest_setup(item): if _cuda_context_poisoned: - pytest.exit( - "CUDA context corrupted by an earlier test (device-side assert); " - "aborting to avoid cascading setup errors. Re-run the job.", - returncode=1, + item.session.shouldstop = ( + "CUDA context corrupted by an earlier test; aborting to avoid " + "cascading setup errors. Re-run the job." ) + pytest.skip("CUDA context corrupted by an earlier test; aborting suite.") def retry_on_request_exceptions(max_retries=3, delay=1):