
Commit 22d8a8d

[UnitTest][NVPTX] Avoid cascading failures from CUDA postproc
Prior to this commit, the tests in `test_tir_transform_inject_ptx_async_copy.py` registered the `"tvm_callback_cuda_postproc"` function during pytest collection, and used a global variable to disable its functionality outside of the tests in this file. This had two major issues. First, if any other test also installed a postproc function, the postproc function required by these NVPTX tests would be overwritten. Second, if one of the NVPTX tests failed, the global variable controlling the postproc function would not be reset, causing any subsequent CUDA-related tests to also fail.

This commit updates these NVPTX tests to conditionally install the postproc function, to de-register it after the test instead of merely disabling its functionality, and to de-register it regardless of the test result.

This issue was initially found when debugging apache#15103, where a failure in `test_tir_transform_inject_ptx_async_copy.py::test_cp_async_in_if_then_else` caused failures in 32 unrelated tests ([CI link](https://ci.tlcpack.ai/blue/organizations/jenkins/tvm-gpu/detail/PR-15103/7/tests)).
1 parent f14c61f commit 22d8a8d
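Condensed, the fix follows the usual pytest register-and-restore pattern: remember whatever callback was registered before the test, override it for the duration of the test, and put the old one back during teardown so that a failing test cannot leak state into later tests. The sketch below is illustrative only; the fixture name and the no-op callback body are placeholders, and the real fixture in the diff further down also gates installation on the device's compute version.

```python
import pytest
import tvm


@pytest.fixture
def cuda_postproc_fixture():
    # Placeholder name; the commit's fixture is `postproc_if_missing_async_support`.
    func_name = "tvm_callback_cuda_postproc"

    # Remember whatever callback (if any) was registered before this test.
    prev_postproc = tvm.get_global_func(func_name, allow_missing=True)

    @tvm.register_func(func_name, override=True)
    def tvm_callback_cuda_postproc(code, _):
        # Post-process the generated CUDA source; a no-op stand-in here.
        return code

    yield

    # Teardown runs even if the test body raised, so later tests see the
    # callback that was registered before this one.
    if prev_postproc is not None:
        tvm.register_func(func_name, prev_postproc, override=True)
```

Because the restore step sits after `yield`, pytest executes it whether the test passes or fails, which is exactly what prevents the cascading failures described above.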

1 file changed

tests/python/unittest/test_tir_transform_inject_ptx_async_copy.py

Lines changed: 36 additions & 47 deletions
@@ -14,11 +14,14 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
-import numpy as np
+
 import tvm
 import tvm.testing
 from tvm.script import tir as T
 
+import pytest
+import numpy as np
+
 
 def count_cp_async(stmt):
     num_alloc = [0]
@@ -351,36 +354,38 @@ def test_inject_async_copy_shared_dyn():
 """
 
 
-generated_code = ""
-support_async = True
+@pytest.fixture
+def postproc_if_missing_async_support():
+    arch = tvm.contrib.nvcc.get_target_compute_version()
+    major, _ = tvm.contrib.nvcc.parse_compute_version(arch)
+    support_async = major >= 8
 
+    func_name = "tvm_callback_cuda_postproc"
+    prev_postproc = None
 
-@tvm.register_func
-def tvm_callback_cuda_postproc(code, _):
-    global generated_code
-    global support_async
-    generated_code = code
-    # return a dummy code so that device < sm80 could build correctly
     if not support_async:
-        ret = ""
-        for line in code.split("\n"):
-            ret += line + "\n"
-            if line.startswith('extern "C" __global__'):
-                break
-        ret += "}"
-        return ret
-    return code
+        prev_postproc = tvm.get_global_func(func_name, allow_missing=True)
 
+        @tvm.register_func(func_name, override=True)
+        def tvm_callback_cuda_postproc(code, _):
+            ret = []
+            for line in code.split("\n"):
+                ret.append(line)
+                ret.append("\n")
+                if line.startswith('extern "C" __global__') and line.endswith("{"):
+                    break
+            ret.append("}")
+            return "".join(ret)
 
-@tvm.testing.requires_cuda
-def test_cp_async_in_if_then_else():
-    global support_async
-    arch = tvm.contrib.nvcc.get_target_compute_version()
-    major, _ = tvm.contrib.nvcc.parse_compute_version(arch)
-    if major < 8:
-        # At least sm80 is required
-        support_async = False
+    yield
+
+    # Restore previous postproc func to avoid impacting other tests
+    if prev_postproc is not None:
+        tvm.register_func(func_name, prev_postproc, override=True)
 
+
+@tvm.testing.requires_cuda
+def test_cp_async_in_if_then_else(postproc_if_missing_async_support):
     @T.prim_func
     def simple_compute(
         A: T.Buffer((16, 14), "float32"),
@@ -421,23 +426,13 @@ def simple_compute(
 
     mod = tvm.IRModule.from_expr(simple_compute)
     with tvm.transform.PassContext(config={"tir.use_async_copy": 1}):
-        tvm.build(mod, target="cuda")
+        built = tvm.build(mod, target="cuda")
+    generated_code = built.imported_modules[0].get_source()
     assert generated_code == expected_cuda_script
 
-    if not support_async:
-        # avoid return dummy code to other tests
-        support_async = True
-
 
 @tvm.testing.requires_cuda
-def test_vectorize_cp_async_in_if_then_else():
-    global support_async
-    arch = tvm.contrib.nvcc.get_target_compute_version()
-    major, _ = tvm.contrib.nvcc.parse_compute_version(arch)
-    if major < 8:
-        # At least sm80 is required
-        support_async = False
-
+def test_vectorize_cp_async_in_if_then_else(postproc_if_missing_async_support):
     @T.prim_func
     def complex_compute(
         A: T.Buffer((2, 16, 16, 1280), "float16"),
@@ -886,17 +881,11 @@ def complex_compute(
 
     mod = tvm.IRModule.from_expr(complex_compute)
    with tvm.transform.PassContext(config={"tir.use_async_copy": 1}):
-        tvm.build(mod, target="cuda")
+        built = tvm.build(mod, target="cuda")
+    generated_code = built.imported_modules[0].get_source()
     # generated_code must contain " setp.ne.b32 p, %0, 0;"
     assert "setp.ne.b32" in generated_code
 
-    if not support_async:
-        # avoid return dummy code to other tests
-        support_async = True
-
 
 if __name__ == "__main__":
-    test_inject_async_copy()
-    test_inject_async_copy_shared_dyn()
-    test_cp_async_in_if_then_else()
-    test_vectorize_cp_async_in_if_then_else()
+    tvm.testing.main()
