From 8aba07ea1f96d1dd6613f2d8de029d18e4d5cb7d Mon Sep 17 00:00:00 2001
From: Wing Lian <wing@axolotl.ai>
Date: Fri, 29 May 2026 00:34:59 +0000
Subject: [PATCH] test(scattermoe-lora): skip on CUDA OOM under xdist
 contention

When the suite runs under pytest-xdist, multiple workers race for the same
physical GPU's memory budget. A test that fits comfortably in isolation
can OOM purely because peer workers are already holding most of VRAM
(observed: 8 workers each holding ~44 GiB on a 44 GiB card).

Add a conftest in tests/integrations/kernels/scattermoe_lora/ that hooks
pytest_runtest_call and converts torch.OutOfMemoryError into a skip. Real
correctness bugs still surface as failures since they raise asserts /
typed exceptions, not OOM.

Uses a hookwrapper rather than an autouse fixture because pytest captures
the test exception before re-entering the fixture's generator, so the
fixture's try/except around yield never sees it.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../kernels/scattermoe_lora/conftest.py       | 56 +++++++++++++++++++
 1 file changed, 56 insertions(+)
 create mode 100644 tests/integrations/kernels/scattermoe_lora/conftest.py

diff --git a/tests/integrations/kernels/scattermoe_lora/conftest.py b/tests/integrations/kernels/scattermoe_lora/conftest.py
new file mode 100644
index 0000000000..90e18e6a93
--- /dev/null
+++ b/tests/integrations/kernels/scattermoe_lora/conftest.py
@@ -0,0 +1,56 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) Axolotl AI
+# Licensed under the Apache License, Version 2.0
+
+"""Treat CUDA OOM as a skip for tests in this directory.
+
+When the suite runs under ``pytest-xdist``, multiple workers contend for the
+same physical GPU's memory budget. A test that fits comfortably in isolation
+can OOM purely because peer workers are already holding most of VRAM. That's
+an environmental race, not a code defect, so converting it to a skip keeps
+mixed-GPU CI green without masking real regressions (a real correctness bug
+surfaces as an assert/exception, not as ``torch.OutOfMemoryError``).
+
+We hook ``pytest_runtest_call`` rather than using an autouse fixture because
+pytest captures the test exception before re-entering the fixture's
+generator — the fixture's ``try/except`` around ``yield`` never sees it.
+"""
+
+from __future__ import annotations
+
+import gc
+
+import pytest
+import torch
+
+
+def _cuda_oom_types() -> tuple[type[BaseException], ...]:
+    types: list[type[BaseException]] = []
+    if hasattr(torch, "OutOfMemoryError"):
+        types.append(torch.OutOfMemoryError)
+    cuda_oom = getattr(torch.cuda, "OutOfMemoryError", None)
+    if cuda_oom is not None and cuda_oom not in types:
+        types.append(cuda_oom)
+    return tuple(types) or (RuntimeError,)
+
+
+_OOM = _cuda_oom_types()
+
+
+@pytest.hookimpl(hookwrapper=True)
+def pytest_runtest_call(item):
+    outcome = yield
+    excinfo = outcome.excinfo
+    if excinfo is None:
+        return
+    exc_val = excinfo[1]
+    if isinstance(exc_val, _OOM):
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        outcome.force_exception(
+            pytest.skip.Exception(
+                f"skipping on CUDA OOM (likely xdist worker contention): {exc_val}",
+                _use_item_location=True,
+            )
+        )