From 576edac31fa91a06199d51cc02e167be61c872ad Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Thu, 18 Dec 2025 18:03:41 +0100
Subject: [PATCH 01/30] adding easyconfigs:
 parameterized-0.9.0-GCCcore-14.3.0.eb,
 pytest-subtests-0.15.0-GCCcore-14.3.0.eb,
 PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb,
 unittest-xml-reporting-3.2.0-GCCcore-14.3.0.eb and patches:
 PyTorch-1.12.1_add-hypothesis-suppression.patch,
 PyTorch-1.7.0_disable-dev-shm-test.patch,
 PyTorch-2.0.1_skip-tests-skipped-in-subprocess.patch,
 PyTorch-2.1.0_remove-test-requiring-online-access.patch,
 PyTorch-2.6.0_show-test-duration.patch,
 PyTorch-2.6.0_skip-test_segfault.patch,
 PyTorch-2.7.0_avoid_caffe2_test_cpp_jit.patch,
 PyTorch-2.7.1_avoid-caffe2-sandcastle-test-lib.patch,
 PyTorch-2.7.1_skip-test_data_parallel_rnn.patch,
 PyTorch-2.7.1_skip-test_gds_fails_in_ci.patch,
 PyTorch-2.7.1_skip-test_mixed_mm_exhaustive_dtypes.patch,
 PyTorch-2.7.1_skip-tests-requiring-SM90.patch,
 PyTorch-2.7.1_suport-64bit-BARs.patch,
 PyTorch-2.7.1_tolerance-test_partial_flat_weights.patch,
 PyTorch-2.9.0_disable-test_nan_assert.patch,
 PyTorch-2.9.0_enable-symbolizer-in-test_workspace_allocation_error.patch,
 PyTorch-2.9.0_fix-attention-squeeze.patch,
 PyTorch-2.9.0_fix-FP16-CPU-tests-in-test_torchinductor_opinfo.patch,
 PyTorch-2.9.0_fix-nccl-test-env.patch,
 PyTorch-2.9.0_fix-test_exclude_padding.patch,
 PyTorch-2.9.0_fix-test_version_error.patch,
 PyTorch-2.9.0_honor-XDG_CACHE_HOME.patch,
 PyTorch-2.9.0_increase-tolerance-in-test_transformers.patch,
 PyTorch-2.9.0_remove-faulty-close.patch,
 PyTorch-2.9.0_revert-pybind11-3-change.patch,
 PyTorch-2.9.0_skip-test_benchmark_on_non_zero_device.patch,
 PyTorch-2.9.0_skip-test_convolution1-on-H100.patch,
 PyTorch-2.9.0_skip-test_inductor_all_gather_into_tensor_coalesced.patch,
 PyTorch-2.9.0_skip-test_original_aten_preserved_pad_mm.patch,
 PyTorch-2.9.0_skip-test_override-without-CUDA.patch,
 PyTorch-2.9.0_skip-test_unbacked_reduction.patch,
 PyTorch-2.9.0_skip-tests-requiring-CUDA-12.8.patch,
 PyTorch-2.9.0_skip-unexpected-success-in-test_fake_export.patch,
 PyTorch-2.9.1_skip-RingFlexAttentionTest.patch

---
 ...orch-2.7.0_avoid_caffe2_test_cpp_jit.patch |  14 ++
 ...7.1_avoid-caffe2-sandcastle-test-lib.patch |  18 ++
 ...ch-2.7.1_skip-test_data_parallel_rnn.patch |  28 +++
 ...orch-2.7.1_skip-test_gds_fails_in_ci.patch |  16 ++
 ...skip-test_mixed_mm_exhaustive_dtypes.patch |  14 ++
 ...orch-2.7.1_skip-tests-requiring-SM90.patch |  34 +++
 .../PyTorch-2.7.1_suport-64bit-BARs.patch     |  27 +++
 ..._tolerance-test_partial_flat_weights.patch |  23 ++
 ...yTorch-2.9.0_disable-test_nan_assert.patch |  57 +++++
 ...r-in-test_workspace_allocation_error.patch |  28 +++
 ...U-tests-in-test_torchinductor_opinfo.patch |  28 +++
 .../PyTorch-2.9.0_fix-attention-squeeze.patch |  59 +++++
 .../PyTorch-2.9.0_fix-nccl-test-env.patch     |  55 +++++
 ...Torch-2.9.0_fix-test_exclude_padding.patch |  33 +++
 ...PyTorch-2.9.0_fix-test_version_error.patch |  27 +++
 .../PyTorch-2.9.0_honor-XDG_CACHE_HOME.patch  |  29 +++
 ...rease-tolerance-in-test_transformers.patch |  21 ++
 .../PyTorch-2.9.0_remove-faulty-close.patch   |  48 ++++
 ...Torch-2.9.0_revert-pybind11-3-change.patch |  68 ++++++
 ...ip-test_benchmark_on_non_zero_device.patch |  23 ++
 ...2.9.0_skip-test_convolution1-on-H100.patch |  30 +++
 ...tor_all_gather_into_tensor_coalesced.patch |  19 ++
 ...-test_original_aten_preserved_pad_mm.patch |  19 ++
 ....9.0_skip-test_override-without-CUDA.patch |  35 +++
 ...h-2.9.0_skip-test_unbacked_reduction.patch |  18 ++
 ...2.9.0_skip-tests-requiring-CUDA-12.8.patch | 122 ++++++++++
 ...expected-success-in-test_fake_export.patch | 104 +++++++++
 .../PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb   | 220 ++++++++++++++++++
 ...rch-2.9.1_skip-RingFlexAttentionTest.patch |  23 ++
 .../parameterized-0.9.0-GCCcore-14.3.0.eb     |  18 ++
 .../pytest-subtests-0.15.0-GCCcore-14.3.0.eb  |  22 ++
 ...test-xml-reporting-3.2.0-GCCcore-14.3.0.eb |  23 ++
 32 files changed, 1303 insertions(+)
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.0_avoid_caffe2_test_cpp_jit.patch
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_avoid-caffe2-sandcastle-test-lib.patch
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_data_parallel_rnn.patch
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_gds_fails_in_ci.patch
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_mixed_mm_exhaustive_dtypes.patch
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-tests-requiring-SM90.patch
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_suport-64bit-BARs.patch
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_tolerance-test_partial_flat_weights.patch
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_disable-test_nan_assert.patch
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_enable-symbolizer-in-test_workspace_allocation_error.patch
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_fix-FP16-CPU-tests-in-test_torchinductor_opinfo.patch
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_fix-attention-squeeze.patch
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_fix-nccl-test-env.patch
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_fix-test_exclude_padding.patch
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_fix-test_version_error.patch
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_honor-XDG_CACHE_HOME.patch
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_increase-tolerance-in-test_transformers.patch
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_remove-faulty-close.patch
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_revert-pybind11-3-change.patch
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_benchmark_on_non_zero_device.patch
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_convolution1-on-H100.patch
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_inductor_all_gather_into_tensor_coalesced.patch
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_original_aten_preserved_pad_mm.patch
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_override-without-CUDA.patch
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_unbacked_reduction.patch
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-tests-requiring-CUDA-12.8.patch
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-unexpected-success-in-test_fake_export.patch
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-RingFlexAttentionTest.patch
 create mode 100644 easybuild/easyconfigs/p/parameterized/parameterized-0.9.0-GCCcore-14.3.0.eb
 create mode 100644 easybuild/easyconfigs/p/pytest-subtests/pytest-subtests-0.15.0-GCCcore-14.3.0.eb
 create mode 100644 easybuild/easyconfigs/u/unittest-xml-reporting/unittest-xml-reporting-3.2.0-GCCcore-14.3.0.eb

diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.0_avoid_caffe2_test_cpp_jit.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.0_avoid_caffe2_test_cpp_jit.patch
new file mode 100644
index 000000000000..f07706b8d371
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.0_avoid_caffe2_test_cpp_jit.patch
@@ -0,0 +1,14 @@
+Avoid tripping on //caffe2/test/cpp/jit:test_custom_class_registrations with IS_SANDCASTLE
+
+Author: Alexander Grund (TU Dresden)
+--- a/torch/testing/_internal/torchbind_impls.py
++++ b/torch/testing/_internal/torchbind_impls.py
+@@ -116,8 +116,6 @@ def load_torchbind_test_lib():
+ 
+     if IS_MACOS:
+         raise unittest.SkipTest("non-portable load_library call used in test")
+-    elif IS_SANDCASTLE or IS_FBCODE:
+-        lib_file_path = Path("//caffe2/test/cpp/jit:test_custom_class_registrations")
+     elif IS_WINDOWS:
+         lib_file_path = find_library_location("torchbind_test.dll")
+     else:
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_avoid-caffe2-sandcastle-test-lib.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_avoid-caffe2-sandcastle-test-lib.patch
new file mode 100644
index 000000000000..bb3103160a73
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_avoid-caffe2-sandcastle-test-lib.patch
@@ -0,0 +1,18 @@
+"//caffe2/test/inductor:custom_ops" is a FB-specific "library" which we pull in by setting IS_SANDCASTLE causing
+> OSError: /caffe2/test/inductor:custom_ops: cannot open shared object file: No such file or directory
+in inductor/test_aot_inductor_custom_ops.py
+
+Author: Alexander Grund (TU Dresden)
+diff --git a/test/inductor/test_aot_inductor_custom_ops.py b/test/inductor/test_aot_inductor_custom_ops.py
+index ce2ef3739d3..7b9dc4792fd 100644
+--- a/test/inductor/test_aot_inductor_custom_ops.py
++++ b/test/inductor/test_aot_inductor_custom_ops.py
+@@ -380,7 +380,7 @@ common_utils.instantiate_parametrized_tests(AOTInductorTestsTemplate)
+ 
+ class AOTICustomOpTestCase(TestCase):
+     def setUp(self):
+-        if IS_SANDCASTLE or IS_FBCODE:
++        if False:
+             torch.ops.load_library("//caffe2/test/inductor:custom_ops")
+         elif IS_MACOS:
+             raise unittest.SkipTest("non-portable load_library call used in test")
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_data_parallel_rnn.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_data_parallel_rnn.patch
new file mode 100644
index 000000000000..5b81095e9317
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_data_parallel_rnn.patch
@@ -0,0 +1,28 @@
+Failing upstream too: https://github.com/pytorch/pytorch/issues/162745
+> /PyTorch/2.7.1/foss-2024a-CUDA-12.6.0/pytorch-v2.7.1/test/distributed/test_data_parallel.py", line 99, in test_data_parallel_rnn
+>     self.assertTrue(p1.allclose(p2))
+> AssertionError: False is not true
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/test/distributed/test_data_parallel.py b/test/distributed/test_data_parallel.py
+index 26f64df90d9..c25cc6673c3 100644
+--- a/test/distributed/test_data_parallel.py
++++ b/test/distributed/test_data_parallel.py
+@@ -6,6 +6,7 @@ import io
+ from collections import OrderedDict
+ from copy import deepcopy
+ from itertools import product
++import unittest
+ 
+ import torch
+ import torch.nn.functional as F
+@@ -63,7 +64,7 @@ class TestDataParallel(TestCase):
+ 
+         gradcheck(fn, (m.t_rg,))
+ 
+-    @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "multi-GPU not supported")
++    @unittest.skip("Fails")
+     def test_data_parallel_rnn(self):
+         class TestModule(torch.nn.Module):
+             def __init__(self) -> None:
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_gds_fails_in_ci.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_gds_fails_in_ci.patch
new file mode 100644
index 000000000000..bb10b1044562
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_gds_fails_in_ci.patch
@@ -0,0 +1,16 @@
+Skip a test meant for CI only.
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/test/test_cuda.py b/test/test_cuda.py
+index 3726c377970..78b5e8c8af9 100644
+--- a/test/test_cuda.py
++++ b/test/test_cuda.py
+@@ -3633,6 +3633,7 @@ print(f"{{r1}}, {{r2}}")
+         x = torch.cuda.device_count()
+         self.assertEqual(f"{x}, 1", r)
+ 
++    @unittest.skip("Not applicable")
+     def test_gds_fails_in_ci(self):
+         if IS_WINDOWS or TEST_WITH_ROCM:
+             error_msg = "is not supported on this platform"
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_mixed_mm_exhaustive_dtypes.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_mixed_mm_exhaustive_dtypes.patch
new file mode 100644
index 000000000000..e745a7282085
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_mixed_mm_exhaustive_dtypes.patch
@@ -0,0 +1,14 @@
+Test fails upstream too, see https://github.com/pytorch/pytorch/issues/147853
+> RuntimeError: Expected to find ".to(" but did not find it
+
+Author: Alexander Grund (TU Dresden)
+--- a/test/inductor/test_pattern_matcher.py
++++ b/test/inductor/test_pattern_matcher.py
+@@ -389,6 +389,7 @@ class TestPatternMatcher(TestCase):
+         }
+     )
+     @unittest.skipIf(not IS_BIG_GPU, "templates require big gpu")
++    @unittest.skip("Fails")
+     def test_mixed_mm_exhaustive_dtypes(self):
+         def fn(a, b):
+             return torch.mm(a, b.to(a.dtype))
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-tests-requiring-SM90.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-tests-requiring-SM90.patch
new file mode 100644
index 000000000000..ee60c76ddbcf
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-tests-requiring-SM90.patch
@@ -0,0 +1,34 @@
+Avoid it failing on e.g. A100:
+
+> [rank1]:E1022 09:55:08.823000 3580472 torch/testing/_internal/common_distributed.py:721] RuntimeError: CUDA error: device-side assert triggered...
+> [rank1]:E1022 09:55:08.823000 3580472 torch/testing/_internal/common_distributed.py:721]  exiting process 1 with exit code: 10
+> ...
+> :318: st_vec: block: [0,0,0], thread: [87,0,0] Assertion `false` failed.
+> /pytorch-v2.7.1/torch/csrc/distributed/c10d/CUDASymmetricMemory-inl.h:318: st_vec: block: [0,0,0], thread: [88,0,0] Assertion `false` failed.
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py
+index 7410255d27a..603ea0b375b 100644
+--- a/test/distributed/test_c10d_nccl.py
++++ b/test/distributed/test_c10d_nccl.py
+@@ -3367,7 +3367,7 @@ class CommTest(test_c10d_common.AbstractCommTest, MultiProcessTestCase):
+     @skip_if_rocm_multiprocess
+     def test_intra_node_comm_all_reduce(self):
+         from torch._C._distributed_c10d import _get_intra_node_comm_usage_counter
+-        from torch.testing._internal.common_cuda import SM80OrLater
++        from torch.testing._internal.common_cuda import SM90OrLater
+ 
+         for peer in range(self.world_size):
+             if peer == self.rank:
+@@ -3375,8 +3375,8 @@ class CommTest(test_c10d_common.AbstractCommTest, MultiProcessTestCase):
+             if not torch._C._cuda_canDeviceAccessPeer(self.rank, peer):
+                 raise SkipTest("Test requires p2p access")
+ 
+-        if not SM80OrLater:
+-            raise SkipTest("Test requires sm>=80")
++        if not SM90OrLater:
++            raise SkipTest("Test requires sm>=90")
+ 
+         store = c10d.FileStore(self.file_name, self.world_size)
+         os.environ["ENABLE_INTRA_NODE_COMM"] = "1"
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_suport-64bit-BARs.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_suport-64bit-BARs.patch
new file mode 100644
index 000000000000..6e8cdfb2d36a
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_suport-64bit-BARs.patch
@@ -0,0 +1,27 @@
+When the GPUs use 64bit BARs the RPC module fails during the initialization with:
+> E           RuntimeError: In getBar1SizeOfGpu at tensorpipe/channel/cuda_gdr/context_impl.cc:242 "": No such file or directory
+
+This causes KeyboardInterrupt errors in distributed/rpc/test_share_memory
+
+See https://github.com/pytorch/pytorch/issues/159354
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/third_party/tensorpipe/tensorpipe/channel/cuda_gdr/context_impl.cc b/third_party/tensorpipe/tensorpipe/channel/cuda_gdr/context_impl.cc
+index 182a04a..b26751e 100644
+--- a/third_party/tensorpipe/tensorpipe/channel/cuda_gdr/context_impl.cc
++++ b/third_party/tensorpipe/tensorpipe/channel/cuda_gdr/context_impl.cc
+@@ -239,6 +239,13 @@ size_t getBar1SizeOfGpu(int gpuIdx) {
+ 
+   struct stat bar1Stats;
+   int rv = ::stat(pciPath.c_str(), &bar1Stats);
++  if (rv < 0 && errno == ENOENT) {
++    // Some GPUs use 64 bit BARs using 2 slots each,
++    // so the BAR 0 spans slots 0 & 1 and BAR 1 is at slots 2 & 3
++    TP_VLOG(5) << "GPU #" << gpuIdx << " might has 64 bit BARs";
++    pciPath[pciPath.size() - 1] = '2';
++    rv = ::stat(pciPath.c_str(), &bar1Stats);
++  }
+   TP_THROW_SYSTEM_IF(rv < 0, errno);
+ 
+   return bar1Stats.st_size;
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_tolerance-test_partial_flat_weights.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_tolerance-test_partial_flat_weights.patch
new file mode 100644
index 000000000000..c58d35aacafd
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_tolerance-test_partial_flat_weights.patch
@@ -0,0 +1,23 @@
+Avoid failures in test_nn.py test_partial_flat_weights
+
+> Mismatched elements: 9 / 36 (25.0%)
+> Greatest absolute difference: 3.013014793395996e-05 at index (2, 0, 4) (up to 1e-05 allowed)
+> Greatest relative difference: 0.0030790010932832956 at index (2, 0, 4) (up to 1.3e-06 allowed)
+
+See https://github.com/pytorch/pytorch/issues/163072
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/test/test_nn.py b/test/test_nn.py
+index 30609247cb1..02a2d3a7f3a 100644
+--- a/test/test_nn.py
++++ b/test/test_nn.py
+@@ -4299,7 +4299,7 @@ tensor(..., device='meta', size=(1,), requires_grad=True)""")
+         inp = inp.cuda()
+         # otherwise, subsequent warnings will be hidden, and further tests rely on them
+         warnings.simplefilter("always")
+-        self.assertEqual(m(inp)[0].cpu(), out_expected[0])
++        self.assertEqual(m(inp)[0].cpu(), out_expected[0], atol=3.1e-5, rtol=3.1e-3)
+ 
+     @unittest.skipIf(not TEST_CUDNN, "needs cudnn")
+     @set_default_dtype(torch.double)
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_disable-test_nan_assert.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_disable-test_nan_assert.patch
new file mode 100644
index 000000000000..0f60a483e5aa
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_disable-test_nan_assert.patch
@@ -0,0 +1,57 @@
+Disable a test that has incomplete skip condition.
+See https://github.com/pytorch/pytorch/pull/167971
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py
+index 0a0f3ee4ca2..aff8ba0156f 100644
+--- a/test/distributed/test_c10d_nccl.py
++++ b/test/distributed/test_c10d_nccl.py
+@@ -11,6 +11,7 @@ import sys
+ import tempfile
+ import threading
+ import time
++import unittest
+ import warnings
+ from contextlib import contextmanager
+ from datetime import datetime, timedelta
+@@ -295,12 +296,7 @@ class ProcessGroupNCCLGroupTest(MultiProcessTestCase):
+         # But if we are in Sandcastle, `skip_but_pass_in_sandcastle` would return 0.
+         TEST_NAN_ASSERT_RETURN = 0 if IS_SANDCASTLE else signal.SIGABRT
+         self.special_return_code_checks = {
+-            self.test_nan_assert_float16.__wrapped__: TEST_NAN_ASSERT_RETURN,
+-            self.test_nan_assert_float32.__wrapped__: TEST_NAN_ASSERT_RETURN,
+-            self.test_nan_assert_float64.__wrapped__: TEST_NAN_ASSERT_RETURN,
+-            self.test_nan_assert_bfloat16.__wrapped__: TEST_NAN_ASSERT_RETURN,
+-            self.test_nan_assert_float8_e4m3fn.__wrapped__: TEST_NAN_ASSERT_RETURN,
+-            self.test_nan_assert_float8_e5m2.__wrapped__: TEST_NAN_ASSERT_RETURN,
++
+         }
+ 
+         # TORCH_NCCL_BLOCKING_WAIT overrides TORCH_NCCL_ASYNC_ERROR_HANDLING hence tests
+@@ -489,24 +485,7 @@ class ProcessGroupNCCLGroupTest(MultiProcessTestCase):
+         torch.version.cuda is not None and int(torch.version.cuda.split(".")[0]) >= 12
+     )
+ 
+-    @requires_nccl()
+-    @skip_but_pass_in_sandcastle_if(
+-        # skip for cu126 as well due to https://github.com/pytorch/pytorch/issues/153479
+-        not (TEST_MULTIGPU and CUDA_12_AND_ABOVE),
+-        "NCCL test requires 2+ GPUs and Device side assert could cause unexpected errors in lower versions of CUDA",
+-    )
+-    @parametrize(
+-        "type",
+-        [
+-            torch.float16,
+-            torch.float32,
+-            torch.float64,
+-            torch.bfloat16,
+-            torch.float8_e4m3fn,
+-            torch.float8_e5m2,
+-        ],
+-    )
+-    @skip_if_rocm_multiprocess
++    @unittest.skip("Wrong conditions")
+     def test_nan_assert(self, type):
+         # Expecting a device-side error when NaN is detected
+         os.environ["TORCH_NCCL_NAN_CHECK"] = "1"
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_enable-symbolizer-in-test_workspace_allocation_error.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_enable-symbolizer-in-test_workspace_allocation_error.patch
new file mode 100644
index 000000000000..5c35b586ac8b
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_enable-symbolizer-in-test_workspace_allocation_error.patch
@@ -0,0 +1,28 @@
+CudaGraphTreeTests.test_workspace_allocation_error fails if TORCH_DISABLE_ADDR2LINE=1 is set
+> File "/pytorch-v2.9.0/test/inductor/test_cudagraph_trees.py", line 1568, in test_workspace_allocation_error
+>     self.assertTrue(
+> AssertionError: False is not true
+
+See https://github.com/pytorch/pytorch/issues/103369
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/test/inductor/test_cudagraph_trees.py b/test/inductor/test_cudagraph_trees.py
+--- a/test/inductor/test_cudagraph_trees.py
++++ b/test/inductor/test_cudagraph_trees.py
+@@ -5,6 +5,7 @@ import functools
+ import gc
+ import importlib
+ import itertools
++import os
+ import re
+ import sys
+ import unittest
+@@ -1543,6 +1544,7 @@ if HAS_CUDA_AND_TRITON:
+         @skipIfRocm
+         @unittest.skipUnless(IS_X86 and IS_LINUX, "cpp contexts are linux only")
+         @torch._inductor.config.patch("triton.cudagraph_trees_history_recording", True)
++        @unittest.mock.patch.dict(os.environ, {"TORCH_DISABLE_ADDR2LINE": "0"})
+         def test_workspace_allocation_error(self):
+             torch._C._cuda_clearCublasWorkspaces()
+ 
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_fix-FP16-CPU-tests-in-test_torchinductor_opinfo.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_fix-FP16-CPU-tests-in-test_torchinductor_opinfo.patch
new file mode 100644
index 000000000000..0bf2d29a7459
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_fix-FP16-CPU-tests-in-test_torchinductor_opinfo.patch
@@ -0,0 +1,28 @@
+Many tests using Float16 on CPU fail with reference_in_float=False
+See https://github.com/pytorch/pytorch/issues/169809
+
+E.g.:
+> TestInductorOpInfoCPU.test_comprehensive_grid_sampler_2d_cpu_float16
+> [...]
+> Mismatched elements: 125 / 780 (16.0%)
+> Greatest absolute difference: 0.02001953125 at index (0, 1, 3, 2) (up to 1e-05 allowed)
+> Greatest relative difference: 2.34375 at index (1, 1, 2, 4) (up to 0.001 allowed)
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/test/inductor/test_torchinductor_opinfo.py b/test/inductor/test_torchinductor_opinfo.py
+index 807ccb48a79..7e5740e0177 100644
+--- a/test/inductor/test_torchinductor_opinfo.py
++++ b/test/inductor/test_torchinductor_opinfo.py
+@@ -1329,8 +1329,10 @@ class TestInductorOpInfo(TestCase):
+                         # Triton
+                         if has_triton():
+                             adjusted_kwargs.update(
+-                                copy_to_gpu=False, reference_in_float=False
++                                copy_to_gpu=False,
+                             )
++                            if device_type == GPU_TYPE:
++                                adjusted_kwargs['reference_in_float'] = False
+ 
+                         # skip checking gradient on CPU for now
+                         if device_type == GPU_TYPE:
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_fix-attention-squeeze.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_fix-attention-squeeze.patch
new file mode 100644
index 000000000000..851ac1f34bd5
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_fix-attention-squeeze.patch
@@ -0,0 +1,59 @@
+From d55c9d52cda889850484968fc55ee73bf40540ec Mon Sep 17 00:00:00 2001
+From: Chien-Chin Huang <chienchin@fb.com>
+Date: Wed, 17 Sep 2025 18:14:51 -0700
+Subject: [PATCH] [CP] Fix cuDNN CP LSE dimension bug (#163231)
+
+We should only unsqueeze if necessary.
+
+Fix https://github.com/pytorch/pytorch/issues/162743
+
+Pull Request resolved: https://github.com/pytorch/pytorch/pull/163231
+Approved by: https://github.com/eqy
+ghstack dependencies: #162539, #162540, #162541, #163115, #163131
+---
+ .../tensor/experimental/_attention.py          | 18 +++++++++++++++---
+ 1 file changed, 15 insertions(+), 3 deletions(-)
+
+diff --git a/torch/distributed/tensor/experimental/_attention.py b/torch/distributed/tensor/experimental/_attention.py
+index 6336967582429..a3345f37a170d 100644
+--- a/torch/distributed/tensor/experimental/_attention.py
++++ b/torch/distributed/tensor/experimental/_attention.py
+@@ -134,6 +134,7 @@ def __init__(self, convert_to_f32: bool, seq_dim: int):
+         self._seq_dim = seq_dim
+         self._out: Optional[torch.Tensor] = None
+         self._lse: Optional[torch.Tensor] = None
++        self._should_lse_squeeze = False
+         self._convert_to_f32 = convert_to_f32
+         self._out_dtype = torch.float32
+         self._lse_dtype = torch.float32
+@@ -141,7 +142,14 @@ def __init__(self, convert_to_f32: bool, seq_dim: int):
+     def _merge_one(
+         self, block_out: torch.Tensor, block_lse: torch.Tensor, partial: bool
+     ) -> None:
+-        block_lse = block_lse.unsqueeze(dim=-1)
++        # The cuDNN backend preserves the last dimension for LSE.
++        # Apply unsqueeze only if the input does not already have
++        # the required dimensionality.
++        if len(block_lse.shape) < len(block_out.shape):
++            block_lse = block_lse.unsqueeze(dim=-1)
++            self._should_lse_squeeze = True
++        assert len(block_lse.shape) == len(block_out.shape)
++
+         if self._lse is None:
+             self._lse = block_lse
+             self._out = block_out
+@@ -199,8 +207,12 @@ def step(self, out: torch.Tensor, lse: torch.Tensor, partial: bool) -> None:
+     def results(self) -> tuple[torch.Tensor, torch.Tensor]:
+         assert self._out is not None
+         assert self._lse is not None
+-        out, lse = self._out, self._lse.squeeze(-1)
+-        return out.to(self._out_dtype), lse.to(self._lse_dtype)
++        out = self._out.to(self._out_dtype)
++        if self._should_lse_squeeze:
++            lse = self._lse.squeeze(-1).to(self._lse_dtype)
++        else:
++            lse = self._lse.to(self._lse_dtype)
++        return out, lse
+ 
+ 
+ class _AttentionOp(Protocol):
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_fix-nccl-test-env.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_fix-nccl-test-env.patch
new file mode 100644
index 000000000000..248d6d934b7b
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_fix-nccl-test-env.patch
@@ -0,0 +1,55 @@
+From 6702f545d880fd82700811e4a3508cdd76da9a69 Mon Sep 17 00:00:00 2001
+From: Alexander Grund <alexander.grund@tu-dresden.de>
+Date: Tue, 16 Sep 2025 17:37:06 +0000
+Subject: [PATCH] Restore environment after NcclUserBufferRegistrationTest
+ (#163063)
+
+This test sets "NCCL_ALGO=NVLS" in NcclUserBufferRegistrationTest which affects tests run in the same process such as `test_on_completion_hook_*` that fail with
+> invalid usage (run with NCCL_DEBUG=WARN for details), NCCL version 2.26.2
+> ncclInvalidUsage: This usually reflects invalid usage of NCCL library.
+> Last error:
+> Error : no algorithm/protocol available for function Broadcast with datatype ncclInt8. NCCL_ALGO was set to NVLS.
+
+Pull Request resolved: https://github.com/pytorch/pytorch/pull/163063
+Approved by: https://github.com/ezyang
+---
+ test/distributed/test_c10d_nccl.py | 21 +++++++++++++--------
+ 1 file changed, 13 insertions(+), 8 deletions(-)
+
+diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py
+index 0d55845228da..f44394e3148c 100644
+--- a/test/distributed/test_c10d_nccl.py
++++ b/test/distributed/test_c10d_nccl.py
+@@ -3145,19 +3145,24 @@ def test_invalid_nccl_blocking_wait_env(self):
+ class NcclUserBufferRegistrationTest(MultiProcessTestCase):
+     def setUp(self):
+         super().setUp()
+-        # TORCH_NCCL_BLOCKING_WAIT overrides TORCH_NCCL_ASYNC_ERROR_HANDLING hence tests
+-        # that use TORCH_NCCL_BLOCKING_WAIT will test it as expected.
+-        os.environ["TORCH_NCCL_ASYNC_ERROR_HANDLING"] = "1"
+         nccl_debug_file = tempfile.NamedTemporaryFile()
+-        os.environ["NCCL_ALGO"] = "NVLS"
+-        os.environ["NCCL_DEBUG"] = "INFO"
+-        os.environ["NCCL_DEBUG_SUBSYS"] = "NVLS"
++        nccl_env = {
++            # TORCH_NCCL_BLOCKING_WAIT overrides TORCH_NCCL_ASYNC_ERROR_HANDLING hence tests
++            # that use TORCH_NCCL_BLOCKING_WAIT will test it as expected.
++            "TORCH_NCCL_ASYNC_ERROR_HANDLING": "1",
++            "NCCL_ALGO": "NVLS",
++            "NCCL_DEBUG": "INFO",
++            "NCCL_DEBUG_SUBSYS": "NVLS",
++            "NCCL_DEBUG_FILE": nccl_debug_file.name,
++        }
+         if torch.cuda.nccl.version() >= (2, 24, 3):
+-            os.environ["NCCL_DEBUG_SUBSYS"] = "REG,TUNING"
+-        os.environ["NCCL_DEBUG_FILE"] = nccl_debug_file.name
++            nccl_env["NCCL_DEBUG_SUBSYS"] = "REG,TUNING"
++        self.env_patcher = mock.patch.dict(os.environ, nccl_env)
++        self.env_patcher.start()
+         self._spawn_processes()
+ 
+     def tearDown(self):
++        self.env_patcher.stop()
+         super().tearDown()
+         try:
+             os.remove(self.file_name)
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_fix-test_exclude_padding.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_fix-test_exclude_padding.patch
new file mode 100644
index 000000000000..b74d565bc51f
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_fix-test_exclude_padding.patch
@@ -0,0 +1,33 @@
+PadMMTest.test_exclude_padding fails on H100 with
+>     self.assertTrue(len(local_cache) == 2)
+> AssertionError: False is not true
+
+Increasing the size triggers the intended code.
+See https://github.com/pytorch/pytorch/pull/169177
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/test/inductor/test_pad_mm.py b/test/inductor/test_pad_mm.py
+--- a/test/inductor/test_pad_mm.py
++++ b/test/inductor/test_pad_mm.py
+@@ -425,7 +426,10 @@ class PadMMTest(TestCase):
+         def mm(a, b):
+             return a @ b
+ 
+-        mm(torch.rand([25, 25], device="cuda"), torch.rand([25, 25], device="cuda"))
++        # Size must be big enough such that `is_mm_compute_bound` returns True and we need padding to 4 elements
++        # machine balance is ~8.3 (A100), 14.1 (H100), size must be 3x that, see arithmetic_intensity for M=N=K
++        size = [59, 59]
++        mm(torch.rand(size, device="cuda"), torch.rand(size, device="cuda"))
+         local_cache = get_pad_cache().get_local_cache()
+         self.assertTrue(len(local_cache) == 2)
+         FileCheck().check_count("exclude_pad:False", 2, exactly=True).run(
+@@ -436,7 +440,7 @@ class PadMMTest(TestCase):
+         def mm(a, b):
+             return (a + 1) @ b
+ 
+-        mm(torch.rand([25, 25], device="cuda"), torch.rand([25, 25], device="cuda"))
++        mm(torch.rand(size, device="cuda"), torch.rand(size, device="cuda"))
+         local_cache = get_pad_cache().get_local_cache()
+         # reuse original base timing
+         self.assertTrue(len(local_cache) == 3)
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_fix-test_version_error.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_fix-test_version_error.patch
new file mode 100644
index 000000000000..819b85773563
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_fix-test_version_error.patch
@@ -0,0 +1,27 @@
+TestSaveLoad.test_version_error causes a failure due to TEMPDIR being set by EasyBuild:
+
+> Ran into the following error when deserializing: [enforce fail at inline_container.cc:332] . file in archive is not in a subdirectory tmpi40i4vmn/: easybuild-tmp/archive_version
+
+Fix the code to handle that, see https://github.com/pytorch/pytorch/pull/169936
+
+diff --git a/test/export/test_serialize.py b/test/export/test_serialize.py
+index faef9b455a0..e3a463014fb 100644
+--- a/test/export/test_serialize.py
++++ b/test/export/test_serialize.py
+@@ -7,6 +7,7 @@ with test_sym_bool)
+ import copy
+ import io
+ import math
++import os
+ import tempfile
+ import unittest
+ import zipfile
+@@ -1915,7 +1916,7 @@ class TestSaveLoad(TestCase):
+             with tempfile.NamedTemporaryFile(suffix=".pt2") as f:
+                 save(ep, f.name)
+                 f.seek(0)
+-                file_prefix = f.name.split("/")[2].split(".")[0]
++                file_prefix = os.path.splitext(os.path.basename(f.name))[0]
+ 
+                 # Create a new file and copy things over, but modify the
+                 # archive version
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_honor-XDG_CACHE_HOME.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_honor-XDG_CACHE_HOME.patch
new file mode 100644
index 000000000000..e2a096dd8b94
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_honor-XDG_CACHE_HOME.patch
@@ -0,0 +1,29 @@
+Avoid PyTorch trying to use $HOME if XDG_CACHE_HOME is set.
+See https://github.com/pytorch/pytorch/pull/168232
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/torch/csrc/distributed/c10d/FlightRecorder.cpp b/torch/csrc/distributed/c10d/FlightRecorder.cpp
+--- a/torch/csrc/distributed/c10d/FlightRecorder.cpp
++++ b/torch/csrc/distributed/c10d/FlightRecorder.cpp
+@@ -36,8 +36,18 @@ DebugInfoWriter& DebugInfoWriter::getWriter(int rank) {
+   if (writer_ == nullptr) {
+     // Attempt to write to running user's HOME directory cache folder - if it
+     // exists.
+-    auto homeDir = getCvarString({"HOME"}, "/tmp");
+-    auto cacheDirPath = std::filesystem::path(homeDir + "/.cache/torch");
++    #ifdef _WIN32
++        const char* cacheHome = nullptr;
++    #else
++        // Uses XDG_CACHE_HOME if it's set
++        const char* cacheHome = std::getenv("XDG_CACHE_HOME");
++    #endif
++    std::string cacheRoot;
++    if (cacheHome)
++      cacheRoot = cacheHome;
++    else
++      cacheRoot = getCvarString({"HOME"}, "/tmp") + "/.cache";
++    auto cacheDirPath = std::filesystem::path(cacheRoot + "/torch");
+     // Create the .cache directory if it doesn't exist
+     std::filesystem::create_directories(cacheDirPath);
+     auto defaultLocation = cacheDirPath / "comm_lib_trace_rank_";
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_increase-tolerance-in-test_transformers.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_increase-tolerance-in-test_transformers.patch
new file mode 100644
index 000000000000..76180cb44818
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_increase-tolerance-in-test_transformers.patch
@@ -0,0 +1,21 @@
+When not using Intel MKL this shows a tolerance error in
+TestSDPACpuOnlyCPU.test_scaled_dot_product_fused_attention_mask_vs_math_cpu_fused_kernel0_float32_batch_size_12_q_seq_len_1030_kv_seq_len_17_n_head_1_head_dim_8_mask_dim_2_bool_mask_True_train_True_casual_False_set_attn_mask_True_cpu_float32
+
+>    self.assertEqual(grad_k_actual, grad_k_ref, atol=tol_grad.atol, rtol=tol_grad.rtol)
+> Mismatched elements: 1 / 1632 (0.1%)
+> Greatest absolute difference: 1.245737075805664e-05 at index (9, 0, 15, 1) (up to 1e-05 allowed)
+> Greatest relative difference: 5.157565828994848e-05 at index (9, 0, 15, 1) (up to 5e-06 allowed)
+
+diff --git a/test/test_transformers.py b/test/test_transformers.py
+index 5b240e1f046..2e1b4091d35 100644
+--- a/test/test_transformers.py
++++ b/test/test_transformers.py
+@@ -2153,6 +2153,8 @@ class TestSDPACpuOnly(NNTestCase):
+             tol_grad = Tolerances(5e-2, 5e-2)
+         if dtype is torch.float16:
+             tol_grad = Tolerances(1e-1, 1e-1)
++        if dtype is torch.float32:
++            tol_grad = Tolerances(1.3e-5, 5.2e-5)
+         for mask_shape in itertools.product(
+             [q_seq_len, 1], [kv_seq_len, 1]
+         ) if mask_dim == 2 else itertools.product(
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_remove-faulty-close.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_remove-faulty-close.patch
new file mode 100644
index 000000000000..0eeea901157c
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_remove-faulty-close.patch
@@ -0,0 +1,48 @@
+commit d3d62ad44284abff4fcd0c70e245739c976bf5e1
+Author: Alexander Grund <alexander.grund@tu-dresden.de>
+Date:   Tue Nov 25 13:54:26 2025 +0100
+
+    Avoid closing random file handles in Inductor
+    
+    `CppCodeCache.load` returns a `ctypes.CDLL`.
+    That does not have a (Python class) `close` function so calling
+    `self.DLL.close()` calls whatever C function with name `close` happens
+    to exist. This is usually the glibc `close` that closes (file) handles.
+    As the argument is missing it closes whatever happens to be in the
+    register at that point.
+    
+    In some tests this seems to close "fd=1", i.e. stdout. Sebsequent
+    writes/print then fails with
+    > OSError: [Errno 9] Bad file descriptor
+    
+    Simply remove the `close` call for now.
+
+diff --git a/torch/_inductor/autotune_process.py b/torch/_inductor/autotune_process.py
+index 1d1687141fb..66b741fafe2 100644
+--- a/torch/_inductor/autotune_process.py
++++ b/torch/_inductor/autotune_process.py
+@@ -882,14 +882,6 @@ class CppBenchmarkRequest(CPUDeviceBenchmarkMixin, BenchmarkRequest):
+             *self.extra_args,
+         )
+ 
+-    def cleanup_run_fn(self) -> None:
+-        if self.DLL is not None:
+-            """
+-            Check close attr due to it crash on Windows.
+-            """
+-            if hasattr(self.DLL, "close"):
+-                self.DLL.close()
+-
+     def __str__(self) -> str:
+         return f"{self.kernel_name=}"
+ 
+@@ -939,9 +931,6 @@ class CuteDSLBenchmarkRequest(GPUDeviceBenchmarkMixin, BenchmarkRequest):
+ 
+         return run_kernel
+ 
+-    def cleanup_run_fn(self) -> None:
+-        """Clean up any resources used by the kernel."""
+-
+ 
+ @functools.cache
+ def get_tuning_process_pool() -> TuningProcessPool:
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_revert-pybind11-3-change.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_revert-pybind11-3-change.patch
new file mode 100644
index 000000000000..1b831f45fa58
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_revert-pybind11-3-change.patch
@@ -0,0 +1,68 @@
+Revert https://github.com/pytorch/pytorch/pull/161063
+
+The PR introduced changes required for the pybind11 3.x API which makes it incompatible with pybind11 2.x
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/test/inductor/test_cpu_cpp_wrapper.py b/test/inductor/test_cpu_cpp_wrapper.py
+index 47a8f3aa063..4b4daaef5c4 100644
+--- a/test/inductor/test_cpu_cpp_wrapper.py
++++ b/test/inductor/test_cpu_cpp_wrapper.py
+@@ -268,7 +268,7 @@ if RUN_CPU:
+             "test_multi_threading",
+             condition=not IS_WINDOWS,
+             # Two threads compile, so we expect the output code to be printed twice.
+-            code_string_count={"py::gil_scoped_release_simple release;": 2},
++            code_string_count={"py::gil_scoped_release release;": 2},
+         ),
+         BaseTest("test_profiler_mark_wrapper_call"),
+         BaseTest(
+diff --git a/torch/_inductor/codegen/cpp_wrapper_cpu.py b/torch/_inductor/codegen/cpp_wrapper_cpu.py
+index 83d1d061467..77f9c368ed3 100644
+--- a/torch/_inductor/codegen/cpp_wrapper_cpu.py
++++ b/torch/_inductor/codegen/cpp_wrapper_cpu.py
+@@ -585,7 +585,7 @@ class CppWrapperCpu(PythonWrapperCodegen):
+                     # Weights are promoted in the JIT mode
+                     num_args = len(V.graph.graph_inputs) + len(V.graph.constants)
+                     # release GIL to support multiple instances inference (in different threads of the same process)
+-                    self.prefix.splice("py::gil_scoped_release_simple release;")
++                    self.prefix.splice("py::gil_scoped_release release;")
+ 
+                 self.prefix.splice(
+                     f"""
+@@ -2310,7 +2310,7 @@ class CppWrapperCpu(PythonWrapperCodegen):
+ 
+         scoped_lines.writeline("{")
+         with scoped_lines.indent():
+-            scoped_lines.writeline("py::gil_scoped_acquire_simple acquire;")
++            scoped_lines.writeline("py::gil_scoped_acquire acquire;")
+             scoped_lines.writelines(lines_in_scope.split("\n"))
+         scoped_lines.writelines("}")
+         return scoped_lines._lines
+diff --git a/torch/_inductor/codegen/cpp_wrapper_cpu_array_ref.py b/torch/_inductor/codegen/cpp_wrapper_cpu_array_ref.py
+index 63c5bc2debe..fd145ece606 100644
+--- a/torch/_inductor/codegen/cpp_wrapper_cpu_array_ref.py
++++ b/torch/_inductor/codegen/cpp_wrapper_cpu_array_ref.py
+@@ -297,7 +297,7 @@ class CppWrapperCpuArrayRef(CppWrapperCpu):
+                         # Weights are promoted in the JIT mode
+                         num_args = len(V.graph.graph_inputs) + len(V.graph.constants)
+                         # release GIL to support multiple instances inference (in different threads of the same process)
+-                        self.prefix.splice("py::gil_scoped_release_simple release;")
++                        self.prefix.splice("py::gil_scoped_release release;")
+ 
+                     self.prefix.splice(
+                         f"""
+diff --git a/torch/csrc/inductor/cpp_wrapper/common.h b/torch/csrc/inductor/cpp_wrapper/common.h
+index a2eebfcc860..9d9ae16462c 100644
+--- a/torch/csrc/inductor/cpp_wrapper/common.h
++++ b/torch/csrc/inductor/cpp_wrapper/common.h
+@@ -6,7 +6,8 @@
+ #include <utility>
+ 
+ #include <Python.h>
+-#include <pybind11/gil_simple.h>
++#define PYBIND11_SIMPLE_GIL_MANAGEMENT
++#include <pybind11/gil.h>
+ 
+ // Include some often-used cpp_wrapper headers, for precompiling.
+ #include <c10/util/BFloat16.h>
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_benchmark_on_non_zero_device.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_benchmark_on_non_zero_device.patch
new file mode 100644
index 000000000000..b0a55ad49125
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_benchmark_on_non_zero_device.patch
@@ -0,0 +1,23 @@
+inductor/test_benchmark_fusion.py BenchmarkingTest.test_benchmark_on_non_zero_device fails with
+>     self.assertTrue(hit_count > 0)
+> AssertionError: False is not true
+
+Related: https://github.com/pytorch/pytorch/issues/160514
+
+Author: Alexander Grund (TU Dresden)
+
+--- a/test/inductor/test_benchmark_fusion.py
++++ b/test/inductor/test_benchmark_fusion.py
+@@ -206,10 +206,7 @@ if HAS_CUDA_AND_TRITON:
+     copy_tests(BenchmarkFusionTestTemplate, BenchmarkFusionCudaTest, "cuda")
+ 
+     class BenchmarkingTest(TestCase):
+-        @unittest.skipIf(
+-            torch.cuda.device_count() < 2, "The test need at least 2 devices"
+-        )
+-        @skip_if_cpp_wrapper("This tests triton scheduling directly")
++        @unittest.skip("Mocking fails")
+         def test_benchmark_on_non_zero_device(self):
+             hit_count = 0
+             with torch.cuda.device("cuda:0"):
+
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_convolution1-on-H100.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_convolution1-on-H100.patch
new file mode 100644
index 000000000000..e0c0a45b3415
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_convolution1-on-H100.patch
@@ -0,0 +1,30 @@
+test_select_algorithm.py TestSelectAlgorithm.test_convolution1 fails on H100 with:
+
+> Mismatched elements: 19584 / 23120 (84.7%)
+> Greatest absolute difference: 132.32015991210938 at index (0, 22, 4, 13) (up to 0.0001 allowed)
+> Greatest relative difference: inf at index (0, 0, 1, 0) (up to 0.0001 allowed)
+
+See https://github.com/pytorch/pytorch/issues/143412
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/test/inductor/test_select_algorithm.py b/test/inductor/test_select_algorithm.py
+index b30cdc2d946..25d3c068133 100644
+--- a/test/inductor/test_select_algorithm.py
++++ b/test/inductor/test_select_algorithm.py
+@@ -27,6 +27,7 @@ from torch.testing._internal.common_utils import IS_LINUX, skipIfRocm, skipIfXpu
+ from torch.testing._internal.inductor_utils import (
+     GPU_TYPE,
+     HAS_GPU,
++    IS_H100,
+     requires_gpu,
+     requires_triton,
+ )
+@@ -295,6 +296,7 @@ class TestSelectAlgorithm(TestCase):
+         foo(torch.randn(64, 64, device=GPU_TYPE))
+         self.assertEqual(counters["inductor"]["select_algorithm_autotune"], 1)
+ 
++    @unittest.skipIf(IS_H100, "Fails on H100, see #143412")
+     @expectedFailureDynamicWrapper
+     @patches
+     def test_convolution1(self):
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_inductor_all_gather_into_tensor_coalesced.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_inductor_all_gather_into_tensor_coalesced.patch
new file mode 100644
index 000000000000..fe992ece4f59
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_inductor_all_gather_into_tensor_coalesced.patch
@@ -0,0 +1,19 @@
+The test fails with
+> RuntimeError: Expected to find "buf0 = torch.ops._c10d_functional.all_gather_into_tensor_coalesced.default([arg3_1, arg2_1, arg1_1, arg0_1]" but did not find it
+
+Also upstream: https://github.com/pytorch/pytorch/issues/146806
+
+Author: Alexander Grund (TU Dresden)
+diff --git a/test/distributed/test_c10d_functional_native.py b/test/distributed/test_c10d_functional_native.py
+index bafc781b591..60fc47f63e4 100644
+--- a/test/distributed/test_c10d_functional_native.py
++++ b/test/distributed/test_c10d_functional_native.py
+@@ -997,7 +997,7 @@ class CompileTest(TestCase):
+         AOTIRunnerUtil.run(func, (arg,))
+         torch.cuda.synchronize()
+ 
+-    @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
++    @unittest.skip("Fails")
+     @fresh_cache()
+     def test_inductor_all_gather_into_tensor_coalesced(self):
+         def func(args: list[torch.Tensor]) -> torch.Tensor:
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_original_aten_preserved_pad_mm.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_original_aten_preserved_pad_mm.patch
new file mode 100644
index 000000000000..88d176f6051c
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_original_aten_preserved_pad_mm.patch
@@ -0,0 +1,19 @@
+Skip test_pad_mm.py PadMMTest.test_original_aten_preserved_pad_mm failing on:
+> File "/dev/shm/pytorch-v2.9.1/test/inductor/test_pad_mm.py", line 538, in test_original_aten_preserved_pad_mm
+>   self.assertEqual(counters["inductor"]["pattern_matcher_count"], 1)
+
+See https://github.com/pytorch/pytorch/issues/170562
+
+Author: Alexander Grund (TU Dresden)
+diff --git a/test/inductor/test_pad_mm.py b/test/inductor/test_pad_mm.py
+index 781f4588e14..b6f0fcebb3c 100644
+--- a/test/inductor/test_pad_mm.py
++++ b/test/inductor/test_pad_mm.py
+@@ -508,6 +508,7 @@ class PadMMTest(TestCase):
+ 
+         assert torch.allclose(res2, mm_expected_result), "MM results are not identical"
+ 
++    @unittest.skip("Fails")
+     @fresh_cache()
+     @inductor_config.patch(
+         {
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_override-without-CUDA.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_override-without-CUDA.patch
new file mode 100644
index 000000000000..bc2b927e0a0d
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_override-without-CUDA.patch
@@ -0,0 +1,35 @@
+This test fails during creation of the tests at startup:
+>    File "/var/lib/jenkins/workspace/test/test_overrides.py", line 683, in _simple_type_parser
+>     return torch.Stream()
+> RuntimeError: CUDA error: CUDA driver version is insufficient for CUDA runtime version
+
+See https://github.com/pytorch/pytorch/pull/166625
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/test/test_overrides.py b/test/test_overrides.py
+index 8454677856d..8df233e279f 100644
+--- a/test/test_overrides.py
++++ b/test/test_overrides.py
+@@ -9,9 +9,9 @@ import pprint
+ import pickle
+ import collections
+ import unittest
+-import os
++import contextlib
+ 
+-from torch.testing._internal.common_utils import TestCase, run_tests, TEST_WITH_CROSSREF
++from torch.testing._internal.common_utils import TestCase, run_tests, TEST_CUDA, TEST_WITH_CROSSREF
+ from torch.overrides import (
+     handle_torch_function,
+     has_torch_function,
+@@ -30,8 +30,7 @@ from torch.utils._pytree import tree_map
+ 
+ Tensor = torch.Tensor
+ 
+-if os.getenv("ATEN_CPU_CAPABILITY") in ("default", "avx2"):
+-    # This test is not supported on ARM
++if not TEST_CUDA:
+     print(
+         "Skipping due to failing when cuda build runs on non cuda machine, "
+         + "see https://github.com/pytorch/pytorch/pull/150059 for example"
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_unbacked_reduction.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_unbacked_reduction.patch
new file mode 100644
index 000000000000..bfb54615bf5e
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_unbacked_reduction.patch
@@ -0,0 +1,18 @@
+TestInductorDynamicCPU.test_unbacked_reduction_cpu doesn't only fail on ROCM with:
+> AssertionError: expected to fail, but actually passed
+
+
+See https://github.com/pytorch/pytorch/issues/154217
+
+Author: Alexander Grund (TU Dresden)
+
+--- a/test/inductor/test_torchinductor_dynamic_shapes.py
++++ b/test/inductor/test_torchinductor_dynamic_shapes.py
+@@ -513,6 +513,7 @@ class TestInductorDynamic(TestCase):
+         ).sum().backward()
+         self.assertEqual(t.grad, expect)
+ 
++    @unittest.skip("Fails on CPU")
+     @torch._dynamo.config.patch(capture_scalar_outputs=True)
+     def test_unbacked_reduction(self, device):
+         expect_fail = (
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-tests-requiring-CUDA-12.8.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-tests-requiring-CUDA-12.8.patch
new file mode 100644
index 000000000000..a4aadc780df0
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-tests-requiring-CUDA-12.8.patch
@@ -0,0 +1,122 @@
+These tests use Triton to generate PTX code and then compile that with NVCC.
+
+As Triton 3.5 uses PTXAS from CUDA 12.8 it cannot be compiled with NVCC from CUDA 12.6.
+
+Failures look like: 
+> ptxas /tmp/torchinductor_s3248973/bvqcnu2o7/2mwinejhnbvqcnu2o73mk3zrx6.ptx, line 5; fatal : Unsupported .version 8.7; current version is '8.5'
+
+in following tests:
+- test_simple_multi_arch
+- test_compile_after_package_multi_arch
+- test_compile_after_package_static
+- test_compile_standalone_cos
+- test_compile_with_exporter
+- test_compile_with_exporter_weights
+
+See https://github.com/pytorch/pytorch/issues/168353
+
+Author: Alexander Grund (TU Dresden)
+
+--- a/test/inductor/test_aot_inductor.py
++++ b/test/inductor/test_aot_inductor.py
+@@ -39,7 +39,7 @@ from torch.export.pt2_archive._package import load_pt2
+ from torch.testing import FileCheck
+ from torch.testing._internal import common_utils
+ from torch.testing._internal.common_cuda import (
+-    _get_torch_cuda_version,
++    requires_triton_ptxas_compat,
+     PLATFORM_SUPPORTS_FLASH_ATTENTION,
+     PLATFORM_SUPPORTS_FP8,
+     PLATFORM_SUPPORTS_MEM_EFF_ATTENTION,
+@@ -239,9 +239,7 @@ class AOTInductorTestsTemplate:
+     # Skip embed_kernel_binary == True for now as it shows random
+     # failure on CI
+     @common_utils.parametrize("embed_kernel_binary", [False])
+-    @unittest.skipIf(
+-        _get_torch_cuda_version() < (12, 6), "Test is only supported on CUDA 12.6+"
+-    )
++    @requires_triton_ptxas_compat
+     def test_simple_multi_arch(self, embed_kernel_binary):
+         if self.device != GPU_TYPE:
+             raise unittest.SkipTest("requires GPU_TYPE")
+diff --git a/test/inductor/test_aot_inductor_package.py b/test/inductor/test_aot_inductor_package.py
+index 0eb1057c802..843f63ff17d 100644
+--- a/test/inductor/test_aot_inductor_package.py
++++ b/test/inductor/test_aot_inductor_package.py
+@@ -27,7 +27,7 @@ from torch.export.pt2_archive._package import (
+     load_pt2,
+     load_weights_to_pt2_contents,
+ )
+-from torch.testing._internal.common_cuda import _get_torch_cuda_version
++from torch.testing._internal.common_cuda import _get_torch_cuda_version, requires_triton_ptxas_compat
+ from torch.testing._internal.common_utils import (
+     IS_FBCODE,
+     skipIfRocm,
+@@ -319,9 +319,7 @@ class TestAOTInductorPackage(TestCase):
+                 actual = optimized(*example_inputs)
+                 self.assertTrue(torch.allclose(actual, expected))
+ 
+-    @unittest.skipIf(
+-        _get_torch_cuda_version() < (12, 6), "Test is only supported on CUDA 12.6+"
+-    )
++    @requires_triton_ptxas_compat
+     @unittest.skipIf(IS_FBCODE, "cmake won't work in fbcode")
+     @skipIfRocm  # doesn't support multi-arch binary
+     @skipIfXpu  # doesn't support multi-arch binary
+@@ -366,9 +364,7 @@ class TestAOTInductorPackage(TestCase):
+                 actual = optimized(*example_inputs)
+                 self.assertTrue(torch.allclose(actual, expected))
+ 
+-    @unittest.skipIf(
+-        _get_torch_cuda_version() < (12, 6), "Test is only supported on CUDA 12.6+"
+-    )
++    @requires_triton_ptxas_compat
+     @unittest.skipIf(IS_FBCODE, "cmake won't work in fbcode")
+     @skipIfXpu  # build system may be different
+     @torch._inductor.config.patch("test_configs.use_libtorch", True)
+@@ -429,6 +425,7 @@ class TestAOTInductorPackage(TestCase):
+                 self.cmake_compile(model, example_inputs, options, "")
+ 
+     @unittest.skipIf(IS_FBCODE, "cmake won't work in fbcode")
++    @requires_triton_ptxas_compat
+     @skipIfXpu  # build system may be different
+     @torch._inductor.config.patch("test_configs.use_libtorch", True)
+     def test_compile_standalone_cos(self):
+@@ -461,9 +458,7 @@ class TestAOTInductorPackage(TestCase):
+                 a_path = build_path / "libcos.a"
+                 self.assertTrue(a_path.exists())
+ 
+-    @unittest.skipIf(
+-        _get_torch_cuda_version() < (12, 6), "Test is only supported on CUDA 12.6+"
+-    )
++    @requires_triton_ptxas_compat
+     @unittest.skipIf(IS_FBCODE, "cmake won't work in fbcode")
+     @skipIfRocm  # doesn't support multi-arch binary
+     @skipIfXpu  # doesn't support multi-arch binary
+@@ -519,9 +514,7 @@ class TestAOTInductorPackage(TestCase):
+                             " 0  0  0\n 0  0  0\n[ CPUFloatType{3,3} ]\n",
+                         )
+ 
+-    @unittest.skipIf(
+-        _get_torch_cuda_version() < (12, 6), "Test is only supported on CUDA 12.6+"
+-    )
++    @requires_triton_ptxas_compat
+     @unittest.skipIf(IS_FBCODE, "cmake won't work in fbcode")
+     @skipIfRocm  # doesn't support multi-arch binary
+     @skipIfXpu  # doesn't support multi-arch binary
+diff --git a/torch/testing/_internal/common_cuda.py b/torch/testing/_internal/common_cuda.py
+index be284429114..3bd0e0a904f 100644
+--- a/torch/testing/_internal/common_cuda.py
++++ b/torch/testing/_internal/common_cuda.py
+@@ -373,6 +373,11 @@ def xfailIfSM120OrLater(func):
+ def xfailIfDistributedNotSupported(func):
+     return func if not (IS_MACOS or IS_JETSON) else unittest.expectedFailure(func)
+ 
++# When using nvcc from the CUDA toolkit its versuib must be at least the one from ptxas bundled with Triton
++TRITON_PTXAS_VERSION = (12, 8)
++requires_triton_ptxas_compat = unittest.skipIf(torch.version.hip is None and _get_torch_cuda_version() < TRITON_PTXAS_VERSION,
++                                               "Requires CUDA 12.8 to match Tritons ptxas version")
++
+ # Importing this module should NOT eagerly initialize CUDA
+ if not CUDA_ALREADY_INITIALIZED_ON_IMPORT:
+     assert not torch.cuda.is_initialized()
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-unexpected-success-in-test_fake_export.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-unexpected-success-in-test_fake_export.patch
new file mode 100644
index 000000000000..3667657cc175
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-unexpected-success-in-test_fake_export.patch
@@ -0,0 +1,104 @@
+Unexpected success in e.g. TestExportOpInfoCPU.test_fake_export___getitem___cpu_float32
+
+Same with PYPI package and reported in https://github.com/pytorch/pytorch/pull/164166
+
+Skip all instead of XFailing
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/test/export/test_export_opinfo.py b/test/export/test_export_opinfo.py
+index 35d8b2895bd..409a305a3aa 100644
+--- a/test/export/test_export_opinfo.py
++++ b/test/export/test_export_opinfo.py
+@@ -22,54 +22,54 @@ from torch.utils import _pytree as pytree
+ 
+ # following are failing with regular torch.export.export
+ export_failures = {
+-    xfail("allclose"),
+-    xfail("combinations"),
+-    xfail("corrcoef"),
+-    xfail("cov"),
+-    xfail("equal"),
+-    xfail("linalg.lstsq"),
+-    xfail("linalg.lstsq", "grad_oriented"),
+-    xfail("nn.functional.ctc_loss"),
+-    xfail("nn.functional.gaussian_nll_loss"),
+-    xfail("sparse.sampled_addmm"),
+-    xfail("tensor_split"),
++    skip("allclose"),
++    skip("combinations"),
++    skip("corrcoef"),
++    skip("cov"),
++    skip("equal"),
++    skip("linalg.lstsq"),
++    skip("linalg.lstsq", "grad_oriented"),
++    skip("nn.functional.ctc_loss"),
++    skip("nn.functional.gaussian_nll_loss"),
++    skip("sparse.sampled_addmm"),
++    skip("tensor_split"),
+ }
+ 
+ # following are failing fake export on cuda device
+ fake_export_failures = {
+-    xfail("geqrf"),
+-    xfail("histogram"),
+-    xfail("masked.amax"),
+-    xfail("masked.amin"),
+-    xfail("masked.argmax"),
+-    xfail("masked.argmin"),
+-    xfail("masked.logaddexp"),
+-    xfail("masked.logsumexp"),
+-    xfail("masked.mean"),
+-    xfail("masked.prod"),
+-    xfail("masked.std"),
+-    xfail("masked.sum"),
+-    xfail("masked.var"),
+-    xfail("nn.functional.grid_sample"),
+-    xfail("to_sparse"),
++    skip("geqrf"),
++    skip("histogram"),
++    skip("masked.amax"),
++    skip("masked.amin"),
++    skip("masked.argmax"),
++    skip("masked.argmin"),
++    skip("masked.logaddexp"),
++    skip("masked.logsumexp"),
++    skip("masked.mean"),
++    skip("masked.prod"),
++    skip("masked.std"),
++    skip("masked.sum"),
++    skip("masked.var"),
++    skip("nn.functional.grid_sample"),
++    skip("to_sparse"),
+     # cannot xfail as it is passing for cpu-only build
+     skip("nn.functional.conv2d"),
+     skip("nn.functional.scaled_dot_product_attention"),
+     # following are failing due to OptionalDeviceGuard
+-    xfail("__getitem__"),
+-    xfail("nn.functional.batch_norm"),
+-    xfail("nn.functional.instance_norm"),
+-    xfail("nn.functional.multi_margin_loss"),
+-    xfail("nonzero"),
++    skip("__getitem__"),
++    skip("nn.functional.batch_norm"),
++    skip("nn.functional.instance_norm"),
++    skip("nn.functional.multi_margin_loss"),
++    skip("nonzero"),
+ }
+ 
+ fake_decomposition_failures = {
+-    xfail("linalg.matrix_rank"),
+-    xfail("nn.functional.binary_cross_entropy_with_logits"),
+-    xfail("nn.functional.instance_norm"),
+-    xfail("nn.functional.multi_margin_loss"),
+-    xfail("repeat_interleave"),
+-    xfail("take"),
++    skip("linalg.matrix_rank"),
++    skip("nn.functional.binary_cross_entropy_with_logits"),
++    skip("nn.functional.instance_norm"),
++    skip("nn.functional.multi_margin_loss"),
++    skip("repeat_interleave"),
++    skip("take"),
+ }
+ 
+ 
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
new file mode 100644
index 000000000000..7e00bf62fe7e
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
@@ -0,0 +1,220 @@
+name = 'PyTorch'
+version = '2.9.1'
+versionsuffix = '-CUDA-%(cudaver)s'
+
+homepage = 'https://pytorch.org/'
+description = """Tensors and Dynamic neural networks in Python with strong GPU acceleration.
+PyTorch is a deep learning framework that puts Python first."""
+
+toolchain = {'name': 'foss', 'version': '2025b'}
+
+local_six_version = '1.11.0'
+source_urls = [GITHUB_RELEASE]
+sources = [
+    '%(namelower)s-v%(version)s.tar.gz',
+    {
+        # Avoid downloading this during the build, see third_party/NNPACK/cmake/DownloadSix.cmake for the version
+        'filename': f'six-{local_six_version}.tar.gz',
+        'source_urls': [
+            'https://pypi.python.org/packages/16/d8/bc6316cf98419719bd59c91742194c111b6f2e85abac88e496adefaf7afe'],
+    }
+]
+patches = [
+    'PyTorch-1.12.1_add-hypothesis-suppression.patch',
+    'PyTorch-1.7.0_disable-dev-shm-test.patch',
+    'PyTorch-2.0.1_skip-tests-skipped-in-subprocess.patch',
+    'PyTorch-2.1.0_remove-test-requiring-online-access.patch',
+    'PyTorch-2.6.0_show-test-duration.patch',
+    'PyTorch-2.6.0_skip-test_segfault.patch',
+    'PyTorch-2.7.0_avoid_caffe2_test_cpp_jit.patch',
+    'PyTorch-2.7.1_avoid-caffe2-sandcastle-test-lib.patch',
+    'PyTorch-2.7.1_skip-test_data_parallel_rnn.patch',
+    'PyTorch-2.7.1_skip-test_gds_fails_in_ci.patch',
+    'PyTorch-2.7.1_skip-test_mixed_mm_exhaustive_dtypes.patch',
+    'PyTorch-2.7.1_skip-tests-requiring-SM90.patch',
+    'PyTorch-2.7.1_suport-64bit-BARs.patch',
+    'PyTorch-2.7.1_tolerance-test_partial_flat_weights.patch',
+    'PyTorch-2.9.0_disable-test_nan_assert.patch',
+    'PyTorch-2.9.0_enable-symbolizer-in-test_workspace_allocation_error.patch',
+    'PyTorch-2.9.0_fix-attention-squeeze.patch',
+    'PyTorch-2.9.0_fix-FP16-CPU-tests-in-test_torchinductor_opinfo.patch',
+    'PyTorch-2.9.0_fix-nccl-test-env.patch',
+    'PyTorch-2.9.0_fix-test_exclude_padding.patch',
+    'PyTorch-2.9.0_fix-test_version_error.patch',
+    'PyTorch-2.9.0_honor-XDG_CACHE_HOME.patch',
+    'PyTorch-2.9.0_increase-tolerance-in-test_transformers.patch',
+    'PyTorch-2.9.0_remove-faulty-close.patch',
+    'PyTorch-2.9.0_revert-pybind11-3-change.patch',
+    'PyTorch-2.9.0_skip-test_benchmark_on_non_zero_device.patch',
+    'PyTorch-2.9.0_skip-test_convolution1-on-H100.patch',
+    'PyTorch-2.9.0_skip-test_inductor_all_gather_into_tensor_coalesced.patch',
+    'PyTorch-2.9.0_skip-test_original_aten_preserved_pad_mm.patch',
+    'PyTorch-2.9.0_skip-test_override-without-CUDA.patch',
+    'PyTorch-2.9.0_skip-test_unbacked_reduction.patch',
+    'PyTorch-2.9.0_skip-tests-requiring-CUDA-12.8.patch',
+    'PyTorch-2.9.0_skip-unexpected-success-in-test_fake_export.patch',
+    'PyTorch-2.9.1_skip-RingFlexAttentionTest.patch',
+]
+checksums = [
+    {'pytorch-v2.9.1.tar.gz': 'e17504700ebc4c87f9b57059df1c4d790b769458c04db144c7a92aea90f2c92b'},
+    {'six-1.11.0.tar.gz': '70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9'},
+    {'PyTorch-1.12.1_add-hypothesis-suppression.patch':
+     'e71ffb94ebe69f580fa70e0de84017058325fdff944866d6bd03463626edc32c'},
+    {'PyTorch-1.7.0_disable-dev-shm-test.patch': '622cb1eaeadc06e13128a862d9946bcc1f1edd3d02b259c56a9aecc4d5406b8a'},
+    {'PyTorch-2.0.1_skip-tests-skipped-in-subprocess.patch':
+     '166c134573a95230e39b9ea09ece3ad8072f39d370c9a88fb2a1e24f6aaac2b5'},
+    {'PyTorch-2.1.0_remove-test-requiring-online-access.patch':
+     '35184b8c5a1b10f79e511cc25db3b8a5585a5d58b5d1aa25dd3d250200b14fd7'},
+    {'PyTorch-2.6.0_show-test-duration.patch': '5508f2f9619204d9f3c356dbd4000a00d58f452ab2d64ae920eb8bc8b5484d75'},
+    {'PyTorch-2.6.0_skip-test_segfault.patch': '26806bd62e6b61b56ebaa52d68ca44c415a28124f684bd2fb373557ada68ef52'},
+    {'PyTorch-2.7.0_avoid_caffe2_test_cpp_jit.patch':
+     '2f3255e067f5c6f0d78b4fbce94784c41bddf3d01bab9673856b0d0bbc4e3fec'},
+    {'PyTorch-2.7.1_avoid-caffe2-sandcastle-test-lib.patch':
+     'aaf22cb431357dc78e4db895d64febf1c7ee187e8ad27bd13544d011127354d4'},
+    {'PyTorch-2.7.1_skip-test_data_parallel_rnn.patch':
+     'aa85b678e89db4bb41d2c5f4990f0d05959be92e61918291cb5609685b7f1841'},
+    {'PyTorch-2.7.1_skip-test_gds_fails_in_ci.patch':
+     '503030c3591196510a3c2d95db30b28a0b396adb8b50ff0d221f6bdb1f939935'},
+    {'PyTorch-2.7.1_skip-test_mixed_mm_exhaustive_dtypes.patch':
+     '709288abc802c9eb687c15f2677ebaf408d8325a4cb470d23cb72447ee0b8e13'},
+    {'PyTorch-2.7.1_skip-tests-requiring-SM90.patch':
+     '7b5891a96b58d1d404c130233ec5ddbb0ad52afdb9c334bbe4e1f27f6c78ffd8'},
+    {'PyTorch-2.7.1_suport-64bit-BARs.patch': '317c3d220aa87426d86e137a6c1a8f910adf9580ca0848371e0f6800c05dbde1'},
+    {'PyTorch-2.7.1_tolerance-test_partial_flat_weights.patch':
+     'f304440a57e00b8052a5ffbf285adad8d0fdc5a812a659420b59a20deb5a9942'},
+    {'PyTorch-2.9.0_disable-test_nan_assert.patch': '98e9f98ce8fb89ae368739bc039be69040ed446a1c74ee5c2a1ef8ba60986c7d'},
+    {'PyTorch-2.9.0_enable-symbolizer-in-test_workspace_allocation_error.patch':
+     'ba4032b967c0393c916a26fb2b117ba40670ae8e809cb34399a6379b4e523d72'},
+    {'PyTorch-2.9.0_fix-attention-squeeze.patch': '8f040e74780cab391bb4c84f86390a13230e1a309ddf65db9900d9a1c66e1288'},
+    {'PyTorch-2.9.0_fix-FP16-CPU-tests-in-test_torchinductor_opinfo.patch':
+     'b696d7be8c55ff1ccf8731dccf119b8792cd9593eaff457f37e76114e52346d2'},
+    {'PyTorch-2.9.0_fix-nccl-test-env.patch': '9326223c400262788734ec608f6134c5d240f4d5315a8d294179a28f885d6845'},
+    {'PyTorch-2.9.0_fix-test_exclude_padding.patch':
+     '349850874fb75d57a24437d871a4994a773e501632ce66a2adca613380a152dc'},
+    {'PyTorch-2.9.0_fix-test_version_error.patch': 'b10bb10d0a353e4ba7dbef28ca5fef03a8ba552896e1982708aa90ab6f24f34f'},
+    {'PyTorch-2.9.0_honor-XDG_CACHE_HOME.patch': '239631258431174e4aed8947ae6096e003a3213bfbfa112cd0cdebae89469164'},
+    {'PyTorch-2.9.0_increase-tolerance-in-test_transformers.patch':
+     'c27ab34900835c2a15edc26d481343a16433bfa52f635a80cbab252c1320a545'},
+    {'PyTorch-2.9.0_remove-faulty-close.patch': '32ca744d68dcfa669e46ced9d2776af3dcc380dd9c3458ba7c1c432e5c5295b3'},
+    {'PyTorch-2.9.0_revert-pybind11-3-change.patch':
+     '5289894011fefc67482b1e19c9d1c502e94a943fc7a2d5ed5a6a1eaf444570a0'},
+    {'PyTorch-2.9.0_skip-test_benchmark_on_non_zero_device.patch':
+     '85e236431d1a5da3fb7fccc2554640898c29f5fab46a41d15b3ab61dd1f924fc'},
+    {'PyTorch-2.9.0_skip-test_convolution1-on-H100.patch':
+     '704750c7cc08b58779907d608cd4b7505043e394fb27530b16d72a0dc27c277e'},
+    {'PyTorch-2.9.0_skip-test_inductor_all_gather_into_tensor_coalesced.patch':
+     '644153d4c1d8267c0631df2902a6dfe8ec2a197f3374f2a2f5654e6bd0edc05e'},
+    {'PyTorch-2.9.0_skip-test_original_aten_preserved_pad_mm.patch':
+     'ac9e05d296cd5ff938a44662cd022efcc8133c744ca82b045c6a15bc64f67cf4'},
+    {'PyTorch-2.9.0_skip-test_override-without-CUDA.patch':
+     '967512d1487bf1ad06982cc5b976c0b38ba062c3f3473cb4542c4b9ac0740662'},
+    {'PyTorch-2.9.0_skip-test_unbacked_reduction.patch':
+     'b51dd5d7c9cfeed946cbc5c7fc22f2e78e1fa52dda55569b957c20ca4ed01fe8'},
+    {'PyTorch-2.9.0_skip-tests-requiring-CUDA-12.8.patch':
+     '6d79aff5291627b86d8fea025bf2379e4065c7d9cbef5cf83452c35922848728'},
+    {'PyTorch-2.9.0_skip-unexpected-success-in-test_fake_export.patch':
+     '2e73f71ea0f09e613cc4a108893e7948b6daf239e3fe42fd2d3ae5d43c3cf9de'},
+    {'PyTorch-2.9.1_skip-RingFlexAttentionTest.patch':
+     '3cf0b11136fb18c45072687eafd3024d91b504d231a4fa40e04bc62d8d6019c7'},
+]
+
+osdependencies = [OS_PKG_IBVERBS_DEV]
+
+builddependencies = [
+    ('CMake', '3.31.8'),
+    ('hypothesis', '6.136.6'),
+    # For tests
+    ('parameterized', '0.9.0'),
+    ('pytest-flakefinder', '1.1.0'),
+    ('pytest-rerunfailures', '16.1'),
+    ('pytest-shard', '0.1.2'),
+    ('pytest-subtests', '0.15.0'),
+    ('tlparse', '0.4.3'),
+    ('optree', '0.18.0'),
+    ('unittest-xml-reporting', '3.2.0'),
+]
+
+dependencies = [
+    ('CUDA', '12.9.1', '', SYSTEM),
+    # PyTorch is very sensitive to the NCCL & cuDNN versions. (Maybe the same for cuSPARSELt)
+    # Prefer those (listed per CUDA version) in
+    # https://github.com/pytorch/pytorch/blob/main/.github/scripts/generate_binary_build_matrix.py
+    # or https://github.com/pytorch/pytorch/blob/main/.ci/docker/common/install_cuda.sh
+    ('NCCL', '2.27.7', '-CUDA-%(cudaver)s'),
+    ('cuDNN', '9.15.0.57', '-CUDA-%(cudaver)s', SYSTEM),
+    ('magma', '2.9.0', '-CUDA-%(cudaver)s'),
+    ('cuSPARSELt', '0.8.0.4', '-CUDA-%(cudaver)s', SYSTEM),
+    ('Triton', '3.5.0', '-CUDA-%(cudaver)s'),
+    ('Ninja', '1.13.0'),
+    ('Python', '3.13.5'),
+    ('Python-bundle-PyPI', '2025.07'),
+    ('expecttest', '0.3.0'),
+    ('GMP', '6.3.0'),
+    ('MPFR', '4.2.2'),
+    ('networkx', '3.5'),
+    ('numactl', '2.0.19'),
+    ('Pillow', '11.3.0'),
+    ('protobuf-python', '6.31.1'),
+    ('protobuf', '31.1'),
+    ('pybind11', '3.0.0'),
+    ('PuLP', '3.3.0'),
+    ('PyYAML', '6.0.2'),
+    ('pyzstd', '0.19.0'),
+    ('SciPy-bundle', '2025.07'),
+    ('sympy', '1.14.0'),
+    ('Z3', '4.15.1'),
+]
+
+prebuildopts = (f"""sed -i '1i set(PYTHON_SIX_SOURCE_DIR "%(builddir)s/six-{local_six_version}")' """
+                "cmake/Dependencies.cmake && ")
+buildcmd = '%(python)s setup.py build'  # Run the (long) build in the build step
+
+excluded_tests = {
+    '': [
+        # This test seems to take too long on NVIDIA Ampere at least.
+        'distributed/test_distributed_spawn',
+        # no xdoctest
+        'doctests',
+        # intermittent failures on various systems
+        # See https://github.com/easybuilders/easybuild-easyconfigs/issues/17712
+        'distributed/rpc/test_tensorpipe_agent',
+        # This test is expected to fail when run in their CI, but won't in our case.
+        # It just checks for a "CI" env variable
+        'test_ci_sanity_check_fail',
+        # Requires pwlf Python package
+        'distributed/_tools/test_sac_ilp', 'distributed/_tools/test_sac_estimator',
+        # 9 failures in H100, 7 are present in PYPI package, 2 are related to GC in Python < 3.12.4
+        'dynamo/test_dynamic_shapes',
+        # Broken test: https://github.com/pytorch/pytorch/issues/162179
+        'distributed/_composable/fsdp/test_fully_shard_logging',
+        # Broken: https://github.com/pytorch/pytorch/issues/137027
+        'inductor/test_extension_backend',
+        # Requires optional Python packages
+        'test_public_bindings',
+        # 1 Failure and not important
+        'dynamo/test_utils',
+        # Packaging test only, not important for us
+        'test_license',
+    ]
+}
+
+runtest = (
+    ' TORCH_DISABLE_ADDR2LINE=1'
+    ' TORCHINDUCTOR_CUTLASS_DIR=%(start_dir)s/third_party/cutlass'
+    ' PYTEST_ADDOPTS=--full-trace'
+    ' PYTHONUNBUFFERED=1'
+    ' %(python)s test/run_test.py'
+    ' --continue-through-error --pipe-logs --verbose'
+    ' %(excluded_tests)s'
+)
+
+postinstallcmds = [
+    "mkdir %(installdir)s/extra",
+    "cp -r third_party/cutlass %(installdir)s/extra/",
+]
+
+modextrapaths = {'TORCHINDUCTOR_CUTLASS_DIR': 'extra/cutlass'}
+
+tests = ['PyTorch-check-cpp-extension.py', 'PyTorch-check-cutlass.py']
+
+moduleclass = 'ai'
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-RingFlexAttentionTest.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-RingFlexAttentionTest.patch
new file mode 100644
index 000000000000..7855d55ddafd
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-RingFlexAttentionTest.patch
@@ -0,0 +1,23 @@
+test_ring_flex_attention and test_ring_flex_attention_mask both fail in similar ways:
+
+> torch._dynamo.exc.Unsupported: Attempted to call function marked as skipped
+> ...
+>  Developer debug context: module: _warnings, qualname: warn, skip reason: <missing reason>
+
+See https://github.com/pytorch/pytorch/pull/161667#issuecomment-3298676991 
+    & https://github.com/pytorch/pytorch/issues/162843
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/test/distributed/tensor/test_attention.py b/test/distributed/tensor/test_attention.py
+index a2543d443e4..a28fb45e992 100644
+--- a/test/distributed/tensor/test_attention.py
++++ b/test/distributed/tensor/test_attention.py
+@@ -531,6 +531,7 @@ def generate_doc_mask_mod(
+     return doc_mask_mod
+ 
+ 
++@unittest.skip("FAILS")
+ class RingFlexAttentionTest(DTensorTestBase):
+     @property
+     def world_size(self) -> int:
diff --git a/easybuild/easyconfigs/p/parameterized/parameterized-0.9.0-GCCcore-14.3.0.eb b/easybuild/easyconfigs/p/parameterized/parameterized-0.9.0-GCCcore-14.3.0.eb
new file mode 100644
index 000000000000..c9f5dbfe09f4
--- /dev/null
+++ b/easybuild/easyconfigs/p/parameterized/parameterized-0.9.0-GCCcore-14.3.0.eb
@@ -0,0 +1,18 @@
+easyblock = 'PythonPackage'
+
+name = 'parameterized'
+version = '0.9.0'
+
+homepage = 'https://github.com/wolever/parameterized'
+description = " Parameterized testing with any Python test framework "
+
+toolchain = {'name': 'GCCcore', 'version': '14.3.0'}
+
+sources = [SOURCE_TAR_GZ]
+checksums = ['7fc905272cefa4f364c1a3429cbbe9c0f98b793988efb5bf90aac80f08db09b1']
+
+builddependencies = [('binutils', '2.44')]
+
+dependencies = [('Python', '3.13.5')]
+
+moduleclass = 'tools'
diff --git a/easybuild/easyconfigs/p/pytest-subtests/pytest-subtests-0.15.0-GCCcore-14.3.0.eb b/easybuild/easyconfigs/p/pytest-subtests/pytest-subtests-0.15.0-GCCcore-14.3.0.eb
new file mode 100644
index 000000000000..8e3dac2e1a4d
--- /dev/null
+++ b/easybuild/easyconfigs/p/pytest-subtests/pytest-subtests-0.15.0-GCCcore-14.3.0.eb
@@ -0,0 +1,22 @@
+easyblock = 'PythonPackage'
+
+name = 'pytest-subtests'
+version = '0.15.0'
+
+homepage = 'https://github.com/pytest-dev/pytest-subtests'
+description = "unittest subTest() support and subtests fixture."
+
+toolchain = {'name': 'GCCcore', 'version': '14.3.0'}
+
+sources = ['pytest_subtests-%(version)s.tar.gz']
+checksums = ['cb495bde05551b784b8f0b8adfaa27edb4131469a27c339b80fd8d6ba33f887c']
+
+builddependencies = [
+    ('binutils', '2.44'),
+]
+dependencies = [
+    ('Python', '3.13.5'),
+    ('Python-bundle-PyPI', '2025.07'),
+]
+
+moduleclass = 'tools'
diff --git a/easybuild/easyconfigs/u/unittest-xml-reporting/unittest-xml-reporting-3.2.0-GCCcore-14.3.0.eb b/easybuild/easyconfigs/u/unittest-xml-reporting/unittest-xml-reporting-3.2.0-GCCcore-14.3.0.eb
new file mode 100644
index 000000000000..bb0f2e3510e9
--- /dev/null
+++ b/easybuild/easyconfigs/u/unittest-xml-reporting/unittest-xml-reporting-3.2.0-GCCcore-14.3.0.eb
@@ -0,0 +1,23 @@
+easyblock = 'PythonPackage'
+
+name = 'unittest-xml-reporting'
+version = '3.2.0'
+
+homepage = 'http://github.com/xmlrunner/unittest-xml-reporting'
+description = """A unittest test runner that can save test results to XML files in xUnit format.
+The files can be consumed by a wide range of tools, such as build systems, IDEs and continuous integration servers."""
+
+toolchain = {'name': 'GCCcore', 'version': '14.3.0'}
+
+sources = [SOURCE_TAR_GZ]
+checksums = ['edd8d3170b40c3a81b8cf910f46c6a304ae2847ec01036d02e9c0f9b85762d28']
+
+builddependencies = [('binutils', '2.44')]
+dependencies = [
+    ('Python', '3.13.5'),
+    ('lxml', '6.0.0'),
+]
+
+options = {'modulename': 'xmlrunner'}
+
+moduleclass = 'tools'

From 5e96b2513a2f048acc8edff9d3338226f4079b48 Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Fri, 19 Dec 2025 09:57:57 +0100
Subject: [PATCH 02/30] Add testcase

---
 .../p/PyTorch/PyTorch-check-cutlass.py        | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)
 create mode 100755 easybuild/easyconfigs/p/PyTorch/PyTorch-check-cutlass.py

diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-check-cutlass.py b/easybuild/easyconfigs/p/PyTorch/PyTorch-check-cutlass.py
new file mode 100755
index 000000000000..73d9951b78ac
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-check-cutlass.py
@@ -0,0 +1,19 @@
+#!/usr/bin/env python
+
+# Verify that PyTorch can load CUTLASS, required for the CUTLASS inductor backend
+# Author: Alexander Grund (TU Dresden)
+
+import os
+import tempfile
+from torch._inductor.codegen.cuda.cutlass_utils import try_import_cutlass, config
+
+# Isolate from default path used
+os.environ['TORCHINDUCTOR_CACHE_DIR'] = tempfile.mkdtemp(suffix='inductor_cache')
+# Use empty working directory
+os.chdir(tempfile.mkdtemp(suffix='cwd'))
+
+
+if try_import_cutlass():
+    print(f"CUTLASS is set up using {config.cuda.cutlass_dir}")
+else:
+    raise RuntimeError("CUTLASS is NOT working")

From 9f728f67d23a4971c5ad3472cc65436a056f37cc Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Fri, 19 Dec 2025 13:50:12 +0100
Subject: [PATCH 03/30] Add patch for GCC 14 ARM builds

---
 .../PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb   |  3 +++
 ...e-warning-incompatible-pointer-types.patch | 19 +++++++++++++++++++
 2 files changed, 22 insertions(+)
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_ignore-warning-incompatible-pointer-types.patch

diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
index 7e00bf62fe7e..5fdbdb910c18 100644
--- a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
@@ -53,6 +53,7 @@ patches = [
     'PyTorch-2.9.0_skip-test_unbacked_reduction.patch',
     'PyTorch-2.9.0_skip-tests-requiring-CUDA-12.8.patch',
     'PyTorch-2.9.0_skip-unexpected-success-in-test_fake_export.patch',
+    'PyTorch-2.9.1_ignore-warning-incompatible-pointer-types.patch',
     'PyTorch-2.9.1_skip-RingFlexAttentionTest.patch',
 ]
 checksums = [
@@ -114,6 +115,8 @@ checksums = [
      '6d79aff5291627b86d8fea025bf2379e4065c7d9cbef5cf83452c35922848728'},
     {'PyTorch-2.9.0_skip-unexpected-success-in-test_fake_export.patch':
      '2e73f71ea0f09e613cc4a108893e7948b6daf239e3fe42fd2d3ae5d43c3cf9de'},
+    {'PyTorch-2.9.1_ignore-warning-incompatible-pointer-types.patch':
+     '7e62576f7f2b4b7c023ad9d59ec5aef09e9bf5a7b78a0e5990956567eed85f73'},
     {'PyTorch-2.9.1_skip-RingFlexAttentionTest.patch':
      '3cf0b11136fb18c45072687eafd3024d91b504d231a4fa40e04bc62d8d6019c7'},
 ]
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_ignore-warning-incompatible-pointer-types.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_ignore-warning-incompatible-pointer-types.patch
new file mode 100644
index 000000000000..a9340f035361
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_ignore-warning-incompatible-pointer-types.patch
@@ -0,0 +1,19 @@
+Silence a warning that fails builds with GCC 14, especially in XNNPACK.
+See https://github.com/pytorch/pytorch/pull/166873
+
+Applied more broadly as we don't care about warnings anyway.
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index ce7890f002d..eb9e7a682c6 100644
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -1040,6 +1040,7 @@ if(NOT MSVC)
+   append_cxx_flag_if_supported("-Wno-strict-overflow" CMAKE_CXX_FLAGS)
+   append_cxx_flag_if_supported("-Wno-strict-aliasing" CMAKE_CXX_FLAGS)
+   append_cxx_flag_if_supported("-Wno-stringop-overflow" CMAKE_CXX_FLAGS)
++  append_cxx_flag_if_supported("-Wno-incompatible-pointer-types" CMAKE_CXX_FLAGS)
+   append_cxx_flag_if_supported("-Wvla-extension" CMAKE_CXX_FLAGS)
+   append_cxx_flag_if_supported("-Wsuggest-override" CMAKE_CXX_FLAGS)
+   append_cxx_flag_if_supported("-Wnewline-eof" CMAKE_CXX_FLAGS)

From 54f0441482ebe4492534a30304cf9b2c7d5545cd Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Fri, 19 Dec 2025 14:27:09 +0100
Subject: [PATCH 04/30] Also ignore warning for C files

---
 .../PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb   |  2 +-
 ...e-warning-incompatible-pointer-types.patch | 22 +++++++++++--------
 2 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
index 5fdbdb910c18..803a79e1ec37 100644
--- a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
@@ -116,7 +116,7 @@ checksums = [
     {'PyTorch-2.9.0_skip-unexpected-success-in-test_fake_export.patch':
      '2e73f71ea0f09e613cc4a108893e7948b6daf239e3fe42fd2d3ae5d43c3cf9de'},
     {'PyTorch-2.9.1_ignore-warning-incompatible-pointer-types.patch':
-     '7e62576f7f2b4b7c023ad9d59ec5aef09e9bf5a7b78a0e5990956567eed85f73'},
+     '59c84af01a76afd5462f4286de3898630f23645ee813a4da366ca7fbf5d8065d'},
     {'PyTorch-2.9.1_skip-RingFlexAttentionTest.patch':
      '3cf0b11136fb18c45072687eafd3024d91b504d231a4fa40e04bc62d8d6019c7'},
 ]
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_ignore-warning-incompatible-pointer-types.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_ignore-warning-incompatible-pointer-types.patch
index a9340f035361..e2ced872720e 100644
--- a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_ignore-warning-incompatible-pointer-types.patch
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_ignore-warning-incompatible-pointer-types.patch
@@ -6,14 +6,18 @@ Applied more broadly as we don't care about warnings anyway.
 Author: Alexander Grund (TU Dresden)
 
 diff --git a/CMakeLists.txt b/CMakeLists.txt
-index ce7890f002d..eb9e7a682c6 100644
 --- a/CMakeLists.txt
 +++ b/CMakeLists.txt
-@@ -1040,6 +1040,7 @@ if(NOT MSVC)
-   append_cxx_flag_if_supported("-Wno-strict-overflow" CMAKE_CXX_FLAGS)
-   append_cxx_flag_if_supported("-Wno-strict-aliasing" CMAKE_CXX_FLAGS)
-   append_cxx_flag_if_supported("-Wno-stringop-overflow" CMAKE_CXX_FLAGS)
-+  append_cxx_flag_if_supported("-Wno-incompatible-pointer-types" CMAKE_CXX_FLAGS)
-   append_cxx_flag_if_supported("-Wvla-extension" CMAKE_CXX_FLAGS)
-   append_cxx_flag_if_supported("-Wsuggest-override" CMAKE_CXX_FLAGS)
-   append_cxx_flag_if_supported("-Wnewline-eof" CMAKE_CXX_FLAGS)
+@@ -1056,6 +1056,12 @@ if(NOT MSVC)
+     string(APPEND CMAKE_CXX_FLAGS " -Wno-psabi")
+   endif()
+ 
++  if(CMAKE_C_COMPILER_ID STREQUAL "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL "14")
++    foreach(xnn_tgt IN ITEMS XNNPACK microkernels-prod microkernels-all)
++      string(APPEND CMAKE_CXX_FLAGS " -Wno-incompatible-pointer-types")
++      string(APPEND CMAKE_C_FLAGS " -Wno-incompatible-pointer-types")
++    endforeach()
++  endif()
+   # Use ld.gold if available, fall back to ld.bfd (the default ld) if not
+   if(USE_GOLD_LINKER)
+     if(USE_DISTRIBUTED AND USE_MPI)

From b9213995ae201e7411d58d92befb56473d49b994 Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Fri, 19 Dec 2025 15:19:35 +0100
Subject: [PATCH 05/30] Move flags setting before including dependencies

---
 .../PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb   |  2 +-
 ...e-warning-incompatible-pointer-types.patch | 24 +++++++++----------
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
index 803a79e1ec37..8eb5345d5145 100644
--- a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
@@ -116,7 +116,7 @@ checksums = [
     {'PyTorch-2.9.0_skip-unexpected-success-in-test_fake_export.patch':
      '2e73f71ea0f09e613cc4a108893e7948b6daf239e3fe42fd2d3ae5d43c3cf9de'},
     {'PyTorch-2.9.1_ignore-warning-incompatible-pointer-types.patch':
-     '59c84af01a76afd5462f4286de3898630f23645ee813a4da366ca7fbf5d8065d'},
+     'd6a3fc21de154f54ba6504f8bd4b2eca5d05bc3d1ef3fd8eb57a9e167f852eed'},
     {'PyTorch-2.9.1_skip-RingFlexAttentionTest.patch':
      '3cf0b11136fb18c45072687eafd3024d91b504d231a4fa40e04bc62d8d6019c7'},
 ]
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_ignore-warning-incompatible-pointer-types.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_ignore-warning-incompatible-pointer-types.patch
index e2ced872720e..b9231002baee 100644
--- a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_ignore-warning-incompatible-pointer-types.patch
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_ignore-warning-incompatible-pointer-types.patch
@@ -8,16 +8,16 @@ Author: Alexander Grund (TU Dresden)
 diff --git a/CMakeLists.txt b/CMakeLists.txt
 --- a/CMakeLists.txt
 +++ b/CMakeLists.txt
-@@ -1056,6 +1056,12 @@ if(NOT MSVC)
-     string(APPEND CMAKE_CXX_FLAGS " -Wno-psabi")
-   endif()
+@@ -852,6 +852,12 @@ if(MSVC)
+   append_cxx_flag_if_supported("/utf-8" CMAKE_CXX_FLAGS)
+ endif()
  
-+  if(CMAKE_C_COMPILER_ID STREQUAL "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL "14")
-+    foreach(xnn_tgt IN ITEMS XNNPACK microkernels-prod microkernels-all)
-+      string(APPEND CMAKE_CXX_FLAGS " -Wno-incompatible-pointer-types")
-+      string(APPEND CMAKE_C_FLAGS " -Wno-incompatible-pointer-types")
-+    endforeach()
-+  endif()
-   # Use ld.gold if available, fall back to ld.bfd (the default ld) if not
-   if(USE_GOLD_LINKER)
-     if(USE_DISTRIBUTED AND USE_MPI)
++if(CMAKE_C_COMPILER_ID STREQUAL "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL "14")
++  string(APPEND CMAKE_CXX_FLAGS " -Wno-incompatible-pointer-types")
++  string(APPEND CMAKE_C_FLAGS " -Wno-incompatible-pointer-types")
++endif()
++
++
+ # Note for ROCM platform: 1. USE_ROCM is always ON until
+ # include(cmake/Dependencies.cmake) 2. USE_CUDA will become OFF during
+ # re-configuration Truth Table: CUDA 1st pass: USE_CUDA=True;USE_ROCM=True,

From dd7464f0903ebdcf215664e337d32d4a0b1ab61b Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Fri, 19 Dec 2025 16:35:37 +0100
Subject: [PATCH 06/30] Use flag only for C

---
 .../p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb          | 2 +-
 ...Torch-2.9.1_ignore-warning-incompatible-pointer-types.patch | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
index 8eb5345d5145..40d8eb7029e4 100644
--- a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
@@ -116,7 +116,7 @@ checksums = [
     {'PyTorch-2.9.0_skip-unexpected-success-in-test_fake_export.patch':
      '2e73f71ea0f09e613cc4a108893e7948b6daf239e3fe42fd2d3ae5d43c3cf9de'},
     {'PyTorch-2.9.1_ignore-warning-incompatible-pointer-types.patch':
-     'd6a3fc21de154f54ba6504f8bd4b2eca5d05bc3d1ef3fd8eb57a9e167f852eed'},
+     'c4dad43a5d76e292bb0cada56ea05e8cbd522e3e83749cf3b2c15cd1e4ff6d7b'},
     {'PyTorch-2.9.1_skip-RingFlexAttentionTest.patch':
      '3cf0b11136fb18c45072687eafd3024d91b504d231a4fa40e04bc62d8d6019c7'},
 ]
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_ignore-warning-incompatible-pointer-types.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_ignore-warning-incompatible-pointer-types.patch
index b9231002baee..cebc1478b59f 100644
--- a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_ignore-warning-incompatible-pointer-types.patch
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_ignore-warning-incompatible-pointer-types.patch
@@ -8,12 +8,11 @@ Author: Alexander Grund (TU Dresden)
 diff --git a/CMakeLists.txt b/CMakeLists.txt
 --- a/CMakeLists.txt
 +++ b/CMakeLists.txt
-@@ -852,6 +852,12 @@ if(MSVC)
+@@ -852,6 +852,11 @@ if(MSVC)
    append_cxx_flag_if_supported("/utf-8" CMAKE_CXX_FLAGS)
  endif()
  
 +if(CMAKE_C_COMPILER_ID STREQUAL "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL "14")
-+  string(APPEND CMAKE_CXX_FLAGS " -Wno-incompatible-pointer-types")
 +  string(APPEND CMAKE_C_FLAGS " -Wno-incompatible-pointer-types")
 +endif()
 +

From 54b64effc3e916572da5e11812c19462a3281241 Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Fri, 19 Dec 2025 17:05:38 +0100
Subject: [PATCH 07/30] Add workaround for GCC 14 ICE

---
 .../PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb   |   3 +
 ...yTorch-2.9.1_workaround-GCC-14-error.patch | 136 ++++++++++++++++++
 2 files changed, 139 insertions(+)
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_workaround-GCC-14-error.patch

diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
index 40d8eb7029e4..8ceb58c33711 100644
--- a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
@@ -55,6 +55,7 @@ patches = [
     'PyTorch-2.9.0_skip-unexpected-success-in-test_fake_export.patch',
     'PyTorch-2.9.1_ignore-warning-incompatible-pointer-types.patch',
     'PyTorch-2.9.1_skip-RingFlexAttentionTest.patch',
+    'PyTorch-2.9.1_workaround-GCC-14-error.patch',
 ]
 checksums = [
     {'pytorch-v2.9.1.tar.gz': 'e17504700ebc4c87f9b57059df1c4d790b769458c04db144c7a92aea90f2c92b'},
@@ -119,6 +120,8 @@ checksums = [
      'c4dad43a5d76e292bb0cada56ea05e8cbd522e3e83749cf3b2c15cd1e4ff6d7b'},
     {'PyTorch-2.9.1_skip-RingFlexAttentionTest.patch':
      '3cf0b11136fb18c45072687eafd3024d91b504d231a4fa40e04bc62d8d6019c7'},
+    {'PyTorch-2.9.1_workaround-GCC-14-error.patch':
+     '27f5ccee07cdb5ffe134a7a50de0608a6eb8723684eb0fa5dbdba6590137bcbb'},
 ]
 
 osdependencies = [OS_PKG_IBVERBS_DEV]
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_workaround-GCC-14-error.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_workaround-GCC-14-error.patch
new file mode 100644
index 000000000000..d0f7cc80f15b
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_workaround-GCC-14-error.patch
@@ -0,0 +1,136 @@
+From d6237721c074484ea5e72fc05614587886e57fd6 Mon Sep 17 00:00:00 2001
+From: Nikita Shulga <nshulga@meta.com>
+Date: Tue, 8 Jul 2025 18:47:20 -0700
+Subject: [PATCH] [Build] Make PyTorch compilable with gcc-14 on ARM (#157867)
+
+Fixes numerous ICEs in vreg allocations for SVE+BF16
+```
+/pytorch/aten/src/ATen/ParallelOpenMP.h:25:9: error: unrecognizable insn:
+   25 | #pragma omp parallel
+      |         ^~~
+(insn 257 256 258 30 (set (reg:VNx8BF 449 [ bf16_vec1_217 ])
+        (unspec:VNx8BF [
+                (reg:VNx8BF 455)
+                (reg:VNx8BF 456)
+            ] UNSPEC_IORF)) "/pytorch/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h":228:31 discrim 1 -1
+     (nil))
+during RTL pass: vregs
+/pytorch/aten/src/ATen/ParallelOpenMP.h:25:9: internal compiler error: in extract_insn, at recog.cc:2812
+0xd73c33 internal_error(char const*, ...)
+	???:0
+0xd73d1f fancy_abort(char const*, int, char const*)
+	???:0
+0x890053 _fatal_insn(char const*, rtx_def const*, char const*, int, char const*)
+	???:0
+0x890087 _fatal_insn_not_found(rtx_def const*, char const*, int, char const*)
+	???:0
+0x1379093 extract_insn(rtx_insn*)
+	???:0
+
+```
+And one in RTL-expand pass while compiling Activation.cpp
+```
+during RTL pass: expand
+In file included from /pytorch/aten/src/ATen/native/cpu/Activation.cpp:12,
+                 from /pytorch/build/aten/src/ATen/native/cpu/Activation.cpp.DEFAULT.cpp:1:
+/pytorch/aten/src/ATen/native/cpu/Activation.cpp: In lambda function:
+/pytorch/aten/src/ATen/native/cpu/Activation.cpp:94:7: internal compiler error: Segmentation fault
+   94 |       });
+      |       ^
+/pytorch/aten/src/ATen/Dispatch.h:201:7: note: in definition of macro 'AT_DISPATCH_SWITCH'
+  201 |       __VA_ARGS__                                                           \
+      |       ^~~~~~~~~~~
+/pytorch/aten/src/ATen/Dispatch.h:72:3: note: in expansion of macro 'AT_PRIVATE_CASE_TYPE_USING_HINT'
+   72 |   AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, scalar_t, __VA_ARGS__)
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+/pytorch/aten/src/ATen/Dispatch.h:214:3: note: in expansion of macro 'AT_DISPATCH_CASE'
+  214 |   AT_DISPATCH_CASE(at::ScalarType::Double, __VA_ARGS__) \
+      |   ^~~~~~~~~~~~~~~~
+/pytorch/aten/src/ATen/Dispatch.h:218:34: note: in expansion of macro 'AT_DISPATCH_CASE_FLOATING_TYPES'
+  218 |   AT_DISPATCH_SWITCH(TYPE, NAME, AT_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
+      |                                  ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+/pytorch/aten/src/ATen/native/cpu/Activation.cpp:70:5: note: in expansion of macro 'AT_DISPATCH_FLOATING_TYPES'
+   70 |     AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "log_sigmoid_cpu", [&] {
+      |     ^~~~~~~~~~~~~~~~~~~~~~~~~~
+0xd73c33 internal_error(char const*, ...)
+	???:0
+0x134f987 rebuild_jump_labels(rtx_insn*)
+	???:0
+```
+
+Interestingly enough, attempt to compile `Unfold2d.cpp` for `-march=armv8-a+sve` (i.e. without sve+bf16) support also causes ICE
+```
+/pytorch/aten/src/ATen/native/cpu/Unfold2d.cpp:221:1: error: unrecognizable insn:
+  221 | }
+      | ^
+(insn 2918 2917 2919 296 (set (reg:VNx8BI 5917)
+        (unspec:VNx16BI [
+                (reg:VNx8BI 5920)
+                (reg:VNx8BI 5922)
+                (const_vector:VNx4BI [
+                        (const_int 0 [0]) repeated x8
+                    ])
+            ] UNSPEC_TRN1_CONV)) "/usr/include/aarch64-linux-gnu/bits/string_fortified.h":29:33 discrim 1 -1
+     (expr_list:REG_EQUAL (const_vector:VNx8BI [
+                (const_int 1 [0x1]) repeated x9
+                (const_int 0 [0])
+                (const_int 1 [0x1]) repeated x2
+                (const_int 0 [0]) repeated x4
+            ])
+        (nil)))
+during RTL pass: vregs
+```
+
+Fixes https://github.com/pytorch/pytorch/issues/157842
+
+Pull Request resolved: https://github.com/pytorch/pytorch/pull/157867
+
+diff --git a/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h b/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h
+index 7f05c2ad166f6..1632b595c4c22 100644
+--- a/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h
++++ b/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h
+@@ -220,8 +220,12 @@ class Vectorized<BFloat16> {
+   Vectorized<BFloat16> le(const Vectorized<BFloat16>& other) const;
+ };
+ 
+-inline std::tuple<Vectorized<float>, Vectorized<float>> convert_bfloat16_float(
+-    const Vectorized<c10::BFloat16>& a) {
++#if defined(__GNUC__) && __GNUC__ == 14
++// Workaround for gcc-14.2.0 ICE during RTL pass: vregs when compiling for SVE
++__attribute__((optimize("no-tree-vectorize")))
++#endif
++inline std::tuple<Vectorized<float>, Vectorized<float>>
++convert_bfloat16_float(const Vectorized<c10::BFloat16>& a) {
+   static_assert(
+       Vectorized<c10::BFloat16>::size() == 2 * Vectorized<float>::size());
+   auto zero = svreinterpret_bf16_f32(svdup_n_f32(0.0f));
+diff --git a/aten/src/ATen/native/cpu/Activation.cpp b/aten/src/ATen/native/cpu/Activation.cpp
+index 52d5383e60f32..00c9f4eb25348 100644
+--- a/aten/src/ATen/native/cpu/Activation.cpp
++++ b/aten/src/ATen/native/cpu/Activation.cpp
+@@ -26,6 +26,10 @@ namespace at::native {
+ 
+ namespace {
+ 
++#if defined(__GNUC__) && __GNUC__ == 14 && defined(__aarch64__) && !defined(__ARM_FEATURE_SVE)
++// Workaround for gcc-14.2.0 ICE during RTL pass: expand when compiling for NEON
++__attribute__((optimize("no-tree-vectorize")))
++#endif
+ static void log_sigmoid_cpu_kernel(TensorBase &output, TensorBase &buffer, const TensorBase &input) {
+   if (at::isReducedFloatingType(input.scalar_type())) {
+     AT_DISPATCH_REDUCED_FLOATING_TYPES(input.scalar_type(), "log_sigmoid_cpu", [&]() {
+diff --git a/aten/src/ATen/native/cpu/Unfold2d.cpp b/aten/src/ATen/native/cpu/Unfold2d.cpp
+index 8ef0741e77af0..8c94decfff023 100644
+--- a/aten/src/ATen/native/cpu/Unfold2d.cpp
++++ b/aten/src/ATen/native/cpu/Unfold2d.cpp
+@@ -169,6 +169,10 @@ static void unfolded2d_acc_channels_last(
+ 
+ /* note: due to write issues, this one cannot be parallelized as well as
+  * unfolded2d_copy */
++#if defined(__GNUC__) && __GNUC__ == 14 && defined(__ARM_FEATURE_SVE) && !defined(__ARM_FEATURE_BF16)
++// Workaround for gcc-14.2.0 ICE during RTL pass: vregs when compiling for SVE without BF16
++__attribute__((optimize("no-tree-vectorize")))
++#endif
+ void unfolded2d_acc_kernel(
+     ScalarType dtype,
+     void *finput_data,

From bf271b172bac43f93fa4204ace50d95c3de6de5f Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Mon, 5 Jan 2026 13:38:43 +0100
Subject: [PATCH 08/30] Remove already included patch

---
 .../PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb   |   6 +-
 ...yTorch-2.9.1_workaround-GCC-14-error.patch | 136 ------------------
 2 files changed, 3 insertions(+), 139 deletions(-)
 delete mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_workaround-GCC-14-error.patch

diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
index 8ceb58c33711..874c6dce2d85 100644
--- a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
@@ -31,6 +31,7 @@ patches = [
     'PyTorch-2.7.1_skip-test_data_parallel_rnn.patch',
     'PyTorch-2.7.1_skip-test_gds_fails_in_ci.patch',
     'PyTorch-2.7.1_skip-test_mixed_mm_exhaustive_dtypes.patch',
+    'PyTorch-2.7.1_skip-test_outside_linear_module_free.patch',
     'PyTorch-2.7.1_skip-tests-requiring-SM90.patch',
     'PyTorch-2.7.1_suport-64bit-BARs.patch',
     'PyTorch-2.7.1_tolerance-test_partial_flat_weights.patch',
@@ -55,7 +56,6 @@ patches = [
     'PyTorch-2.9.0_skip-unexpected-success-in-test_fake_export.patch',
     'PyTorch-2.9.1_ignore-warning-incompatible-pointer-types.patch',
     'PyTorch-2.9.1_skip-RingFlexAttentionTest.patch',
-    'PyTorch-2.9.1_workaround-GCC-14-error.patch',
 ]
 checksums = [
     {'pytorch-v2.9.1.tar.gz': 'e17504700ebc4c87f9b57059df1c4d790b769458c04db144c7a92aea90f2c92b'},
@@ -79,6 +79,8 @@ checksums = [
      '503030c3591196510a3c2d95db30b28a0b396adb8b50ff0d221f6bdb1f939935'},
     {'PyTorch-2.7.1_skip-test_mixed_mm_exhaustive_dtypes.patch':
      '709288abc802c9eb687c15f2677ebaf408d8325a4cb470d23cb72447ee0b8e13'},
+    {'PyTorch-2.7.1_skip-test_outside_linear_module_free.patch':
+     '4916a256b2b9914e4fdb930681b80df93ea561ddee2fc9978c4973a5650be5e9'},
     {'PyTorch-2.7.1_skip-tests-requiring-SM90.patch':
      '7b5891a96b58d1d404c130233ec5ddbb0ad52afdb9c334bbe4e1f27f6c78ffd8'},
     {'PyTorch-2.7.1_suport-64bit-BARs.patch': '317c3d220aa87426d86e137a6c1a8f910adf9580ca0848371e0f6800c05dbde1'},
@@ -120,8 +122,6 @@ checksums = [
      'c4dad43a5d76e292bb0cada56ea05e8cbd522e3e83749cf3b2c15cd1e4ff6d7b'},
     {'PyTorch-2.9.1_skip-RingFlexAttentionTest.patch':
      '3cf0b11136fb18c45072687eafd3024d91b504d231a4fa40e04bc62d8d6019c7'},
-    {'PyTorch-2.9.1_workaround-GCC-14-error.patch':
-     '27f5ccee07cdb5ffe134a7a50de0608a6eb8723684eb0fa5dbdba6590137bcbb'},
 ]
 
 osdependencies = [OS_PKG_IBVERBS_DEV]
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_workaround-GCC-14-error.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_workaround-GCC-14-error.patch
deleted file mode 100644
index d0f7cc80f15b..000000000000
--- a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_workaround-GCC-14-error.patch
+++ /dev/null
@@ -1,136 +0,0 @@
-From d6237721c074484ea5e72fc05614587886e57fd6 Mon Sep 17 00:00:00 2001
-From: Nikita Shulga <nshulga@meta.com>
-Date: Tue, 8 Jul 2025 18:47:20 -0700
-Subject: [PATCH] [Build] Make PyTorch compilable with gcc-14 on ARM (#157867)
-
-Fixes numerous ICEs in vreg allocations for SVE+BF16
-```
-/pytorch/aten/src/ATen/ParallelOpenMP.h:25:9: error: unrecognizable insn:
-   25 | #pragma omp parallel
-      |         ^~~
-(insn 257 256 258 30 (set (reg:VNx8BF 449 [ bf16_vec1_217 ])
-        (unspec:VNx8BF [
-                (reg:VNx8BF 455)
-                (reg:VNx8BF 456)
-            ] UNSPEC_IORF)) "/pytorch/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h":228:31 discrim 1 -1
-     (nil))
-during RTL pass: vregs
-/pytorch/aten/src/ATen/ParallelOpenMP.h:25:9: internal compiler error: in extract_insn, at recog.cc:2812
-0xd73c33 internal_error(char const*, ...)
-	???:0
-0xd73d1f fancy_abort(char const*, int, char const*)
-	???:0
-0x890053 _fatal_insn(char const*, rtx_def const*, char const*, int, char const*)
-	???:0
-0x890087 _fatal_insn_not_found(rtx_def const*, char const*, int, char const*)
-	???:0
-0x1379093 extract_insn(rtx_insn*)
-	???:0
-
-```
-And one in RTL-expand pass while compiling Activation.cpp
-```
-during RTL pass: expand
-In file included from /pytorch/aten/src/ATen/native/cpu/Activation.cpp:12,
-                 from /pytorch/build/aten/src/ATen/native/cpu/Activation.cpp.DEFAULT.cpp:1:
-/pytorch/aten/src/ATen/native/cpu/Activation.cpp: In lambda function:
-/pytorch/aten/src/ATen/native/cpu/Activation.cpp:94:7: internal compiler error: Segmentation fault
-   94 |       });
-      |       ^
-/pytorch/aten/src/ATen/Dispatch.h:201:7: note: in definition of macro 'AT_DISPATCH_SWITCH'
-  201 |       __VA_ARGS__                                                           \
-      |       ^~~~~~~~~~~
-/pytorch/aten/src/ATen/Dispatch.h:72:3: note: in expansion of macro 'AT_PRIVATE_CASE_TYPE_USING_HINT'
-   72 |   AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, scalar_t, __VA_ARGS__)
-      |   ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-/pytorch/aten/src/ATen/Dispatch.h:214:3: note: in expansion of macro 'AT_DISPATCH_CASE'
-  214 |   AT_DISPATCH_CASE(at::ScalarType::Double, __VA_ARGS__) \
-      |   ^~~~~~~~~~~~~~~~
-/pytorch/aten/src/ATen/Dispatch.h:218:34: note: in expansion of macro 'AT_DISPATCH_CASE_FLOATING_TYPES'
-  218 |   AT_DISPATCH_SWITCH(TYPE, NAME, AT_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
-      |                                  ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-/pytorch/aten/src/ATen/native/cpu/Activation.cpp:70:5: note: in expansion of macro 'AT_DISPATCH_FLOATING_TYPES'
-   70 |     AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "log_sigmoid_cpu", [&] {
-      |     ^~~~~~~~~~~~~~~~~~~~~~~~~~
-0xd73c33 internal_error(char const*, ...)
-	???:0
-0x134f987 rebuild_jump_labels(rtx_insn*)
-	???:0
-```
-
-Interestingly enough, attempt to compile `Unfold2d.cpp` for `-march=armv8-a+sve` (i.e. without sve+bf16) support also causes ICE
-```
-/pytorch/aten/src/ATen/native/cpu/Unfold2d.cpp:221:1: error: unrecognizable insn:
-  221 | }
-      | ^
-(insn 2918 2917 2919 296 (set (reg:VNx8BI 5917)
-        (unspec:VNx16BI [
-                (reg:VNx8BI 5920)
-                (reg:VNx8BI 5922)
-                (const_vector:VNx4BI [
-                        (const_int 0 [0]) repeated x8
-                    ])
-            ] UNSPEC_TRN1_CONV)) "/usr/include/aarch64-linux-gnu/bits/string_fortified.h":29:33 discrim 1 -1
-     (expr_list:REG_EQUAL (const_vector:VNx8BI [
-                (const_int 1 [0x1]) repeated x9
-                (const_int 0 [0])
-                (const_int 1 [0x1]) repeated x2
-                (const_int 0 [0]) repeated x4
-            ])
-        (nil)))
-during RTL pass: vregs
-```
-
-Fixes https://github.com/pytorch/pytorch/issues/157842
-
-Pull Request resolved: https://github.com/pytorch/pytorch/pull/157867
-
-diff --git a/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h b/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h
-index 7f05c2ad166f6..1632b595c4c22 100644
---- a/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h
-+++ b/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h
-@@ -220,8 +220,12 @@ class Vectorized<BFloat16> {
-   Vectorized<BFloat16> le(const Vectorized<BFloat16>& other) const;
- };
- 
--inline std::tuple<Vectorized<float>, Vectorized<float>> convert_bfloat16_float(
--    const Vectorized<c10::BFloat16>& a) {
-+#if defined(__GNUC__) && __GNUC__ == 14
-+// Workaround for gcc-14.2.0 ICE during RTL pass: vregs when compiling for SVE
-+__attribute__((optimize("no-tree-vectorize")))
-+#endif
-+inline std::tuple<Vectorized<float>, Vectorized<float>>
-+convert_bfloat16_float(const Vectorized<c10::BFloat16>& a) {
-   static_assert(
-       Vectorized<c10::BFloat16>::size() == 2 * Vectorized<float>::size());
-   auto zero = svreinterpret_bf16_f32(svdup_n_f32(0.0f));
-diff --git a/aten/src/ATen/native/cpu/Activation.cpp b/aten/src/ATen/native/cpu/Activation.cpp
-index 52d5383e60f32..00c9f4eb25348 100644
---- a/aten/src/ATen/native/cpu/Activation.cpp
-+++ b/aten/src/ATen/native/cpu/Activation.cpp
-@@ -26,6 +26,10 @@ namespace at::native {
- 
- namespace {
- 
-+#if defined(__GNUC__) && __GNUC__ == 14 && defined(__aarch64__) && !defined(__ARM_FEATURE_SVE)
-+// Workaround for gcc-14.2.0 ICE during RTL pass: expand when compiling for NEON
-+__attribute__((optimize("no-tree-vectorize")))
-+#endif
- static void log_sigmoid_cpu_kernel(TensorBase &output, TensorBase &buffer, const TensorBase &input) {
-   if (at::isReducedFloatingType(input.scalar_type())) {
-     AT_DISPATCH_REDUCED_FLOATING_TYPES(input.scalar_type(), "log_sigmoid_cpu", [&]() {
-diff --git a/aten/src/ATen/native/cpu/Unfold2d.cpp b/aten/src/ATen/native/cpu/Unfold2d.cpp
-index 8ef0741e77af0..8c94decfff023 100644
---- a/aten/src/ATen/native/cpu/Unfold2d.cpp
-+++ b/aten/src/ATen/native/cpu/Unfold2d.cpp
-@@ -169,6 +169,10 @@ static void unfolded2d_acc_channels_last(
- 
- /* note: due to write issues, this one cannot be parallelized as well as
-  * unfolded2d_copy */
-+#if defined(__GNUC__) && __GNUC__ == 14 && defined(__ARM_FEATURE_SVE) && !defined(__ARM_FEATURE_BF16)
-+// Workaround for gcc-14.2.0 ICE during RTL pass: vregs when compiling for SVE without BF16
-+__attribute__((optimize("no-tree-vectorize")))
-+#endif
- void unfolded2d_acc_kernel(
-     ScalarType dtype,
-     void *finput_data,

From 5e4e0336d4b99bb86954351f28c3bcd3902e90a4 Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Mon, 5 Jan 2026 15:05:53 +0100
Subject: [PATCH 09/30] Add missing patch

---
 ...skip-test_outside_linear_module_free.patch | 26 +++++++++++++++++++
 1 file changed, 26 insertions(+)
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_outside_linear_module_free.patch

diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_outside_linear_module_free.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_outside_linear_module_free.patch
new file mode 100644
index 000000000000..79bdea43a4d1
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_outside_linear_module_free.patch
@@ -0,0 +1,26 @@
+Test failing with PYPI package too:
+>    self.assertTrue(cleared)
+> AssertionError: False is not true
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/test/dynamo/test_misc.py b/test/dynamo/test_misc.py
+index 7541bd3b9d8..d0cb310bec6 100644
+--- a/test/dynamo/test_misc.py
++++ b/test/dynamo/test_misc.py
+@@ -10992,6 +10992,7 @@ fn
+             lambda mod: mod,
+         )
+ 
++    @unittest.skip("Unreliable")
+     def test_outside_linear_module_free(self):
+         # Compared to test_linear_module_free, the linear
+         # layer is not the code object that is directly compiled.
+@@ -11026,6 +11026,7 @@ fn
+         gc.collect()
+         self.assertTrue(cleared)
+ 
++    @unittest.skip("Unreliable")
+     def test_parameter_free(self):
+         def model_inp_ctr():
+             param = torch.nn.Parameter(torch.randn(100, 100))

From 20c68d3272c7a7e93bd567565c472c9cd4c50d23 Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Wed, 7 Jan 2026 17:49:17 +0100
Subject: [PATCH 10/30] Skip tests requiring CUDA SM 9.0

---
 .../PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb   |  6 +-
 ...orch-2.9.1_skip-tests-requiring-SM90.patch | 85 +++++++++++++++++++
 2 files changed, 88 insertions(+), 3 deletions(-)
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-tests-requiring-SM90.patch

diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
index 874c6dce2d85..dc399aaa34ca 100644
--- a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
@@ -32,7 +32,6 @@ patches = [
     'PyTorch-2.7.1_skip-test_gds_fails_in_ci.patch',
     'PyTorch-2.7.1_skip-test_mixed_mm_exhaustive_dtypes.patch',
     'PyTorch-2.7.1_skip-test_outside_linear_module_free.patch',
-    'PyTorch-2.7.1_skip-tests-requiring-SM90.patch',
     'PyTorch-2.7.1_suport-64bit-BARs.patch',
     'PyTorch-2.7.1_tolerance-test_partial_flat_weights.patch',
     'PyTorch-2.9.0_disable-test_nan_assert.patch',
@@ -55,6 +54,7 @@ patches = [
     'PyTorch-2.9.0_skip-tests-requiring-CUDA-12.8.patch',
     'PyTorch-2.9.0_skip-unexpected-success-in-test_fake_export.patch',
     'PyTorch-2.9.1_ignore-warning-incompatible-pointer-types.patch',
+    'PyTorch-2.9.1_skip-tests-requiring-SM90.patch',
     'PyTorch-2.9.1_skip-RingFlexAttentionTest.patch',
 ]
 checksums = [
@@ -81,8 +81,6 @@ checksums = [
      '709288abc802c9eb687c15f2677ebaf408d8325a4cb470d23cb72447ee0b8e13'},
     {'PyTorch-2.7.1_skip-test_outside_linear_module_free.patch':
      '4916a256b2b9914e4fdb930681b80df93ea561ddee2fc9978c4973a5650be5e9'},
-    {'PyTorch-2.7.1_skip-tests-requiring-SM90.patch':
-     '7b5891a96b58d1d404c130233ec5ddbb0ad52afdb9c334bbe4e1f27f6c78ffd8'},
     {'PyTorch-2.7.1_suport-64bit-BARs.patch': '317c3d220aa87426d86e137a6c1a8f910adf9580ca0848371e0f6800c05dbde1'},
     {'PyTorch-2.7.1_tolerance-test_partial_flat_weights.patch':
      'f304440a57e00b8052a5ffbf285adad8d0fdc5a812a659420b59a20deb5a9942'},
@@ -120,6 +118,8 @@ checksums = [
      '2e73f71ea0f09e613cc4a108893e7948b6daf239e3fe42fd2d3ae5d43c3cf9de'},
     {'PyTorch-2.9.1_ignore-warning-incompatible-pointer-types.patch':
      'c4dad43a5d76e292bb0cada56ea05e8cbd522e3e83749cf3b2c15cd1e4ff6d7b'},
+    {'PyTorch-2.9.1_skip-tests-requiring-SM90.patch':
+     '7db02152db2ae70c0fd4c4602fe381e26a74b8e4f7b16b1a3554b2353d761b10'},
     {'PyTorch-2.9.1_skip-RingFlexAttentionTest.patch':
      '3cf0b11136fb18c45072687eafd3024d91b504d231a4fa40e04bc62d8d6019c7'},
 ]
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-tests-requiring-SM90.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-tests-requiring-SM90.patch
new file mode 100644
index 000000000000..4dea63b7e5fd
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-tests-requiring-SM90.patch
@@ -0,0 +1,85 @@
+Avoid test_intra_node_comm_all_reduce failing on e.g. A100:
+
+> [rank1]:E1022 09:55:08.823000 3580472 torch/testing/_internal/common_distributed.py:721] RuntimeError: CUDA error: device-side assert triggered...
+> [rank1]:E1022 09:55:08.823000 3580472 torch/testing/_internal/common_distributed.py:721]  exiting process 1 with exit code: 10
+> ...
+> :318: st_vec: block: [0,0,0], thread: [87,0,0] Assertion `false` failed.
+> /pytorch-v2.7.1/torch/csrc/distributed/c10d/CUDASymmetricMemory-inl.h:318: st_vec: block: [0,0,0], thread: [88,0,0] Assertion `false` failed.
+
+test_fused_all_gather_scaled_matmul fails with a NCCL error due to FP8 usage and hangs forever.
+See https://github.com/pytorch/pytorch/issues/171796
+
+test_fused_scaled_matmul_reduce_scatter fails with
+> RuntimeError: torch._scaled_mm is only supported on CUDA devices with compute capability >= 9.0 or 8.9, or ROCm MI300+
+
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py
+index 0a0f3ee4ca2..07702566fd8 100644
+--- a/test/distributed/test_c10d_nccl.py
++++ b/test/distributed/test_c10d_nccl.py
+@@ -3350,7 +3350,7 @@ class CommTest(test_c10d_common.AbstractCommTest, MultiProcessTestCase):
+     @runOnRocmArch(MI300_ARCH)
+     def test_intra_node_comm_all_reduce(self):
+         from torch._C._distributed_c10d import _get_intra_node_comm_usage_counter
+-        from torch.testing._internal.common_cuda import SM80OrLater
++        from torch.testing._internal.common_cuda import SM90OrLater
+ 
+         for peer in range(self.world_size):
+             if peer == self.rank:
+@@ -3358,8 +3358,8 @@ class CommTest(test_c10d_common.AbstractCommTest, MultiProcessTestCase):
+             if not torch._C._cuda_canDeviceAccessPeer(self.rank, peer):
+                 raise SkipTest("Test requires p2p access")
+ 
+-        if not SM80OrLater:
+-            raise SkipTest("Test requires sm>=80")
++        if not SM90OrLater:
++            raise SkipTest("Test requires sm>=90")
+ 
+         store = c10d.FileStore(self.file_name, self.world_size)
+         os.environ["ENABLE_INTRA_NODE_COMM"] = "1"
+diff --git a/test/distributed/test_symmetric_memory.py b/test/distributed/test_symmetric_memory.py
+index eeeb24bec30..9d55b620840 100644
+--- a/test/distributed/test_symmetric_memory.py
++++ b/test/distributed/test_symmetric_memory.py
+@@ -4,7 +4,7 @@ import itertools
+ import os
+ import random
+ from contextlib import nullcontext
+-from unittest import skip, skipIf
++from unittest import skip, skipIf, skipUnless
+ 
+ import torch
+ import torch.distributed as dist
+@@ -22,7 +22,7 @@ from torch.distributed._symmetric_memory import (
+     restride_A_for_fused_matmul_reduce_scatter,
+     restride_A_shard_for_fused_all_gather_matmul,
+ )
+-from torch.testing._internal.common_cuda import _get_torch_cuda_version, SM90OrLater
++from torch.testing._internal.common_cuda import _get_torch_cuda_version, SM90OrLater, IS_SM89
+ from torch.testing._internal.common_device_type import e4m3_type
+ from torch.testing._internal.common_distributed import (
+     MultiProcContinuousTest,
+@@ -399,6 +399,10 @@ class AsyncTPTest(MultiProcContinuousTest):
+ 
+     @runOnRocmArch(MI300_ARCH)
+     @skip_if_lt_x_gpu(2)
++    @skipIf(
++        not SM90OrLater,
++        "_fused_all_gather_scaled_matmul_fallback w/ FP8 only supports sm>=90",
++    )
+     @parametrize("gather_dim", [0, 1])
+     @parametrize(
+         "scale_mode", ["tensor-wise", "row-wise-replicated", "row-wise-sharded"]
+@@ -512,6 +516,10 @@ class AsyncTPTest(MultiProcContinuousTest):
+ 
+     @skipIfRocm  # AsyncTP support changed _fused_scaled_matmul_reduce_scatter_fallback API, need more changes
+     @skip_if_lt_x_gpu(2)
++    @skipUnless(
++        SM90OrLater or IS_SM89,
++        "torch._scaled_mm (from fused_scaled_matmul_reduce_scatter) only supports sm>=90 or 8.9",
++    )
+     @parametrize("scatter_dim", [0, 1])
+     @parametrize("rowwise", [True, False])
+     def test_fused_scaled_matmul_reduce_scatter(

From dc3a09e18a919e595aae89447e77b3c67aace8a0 Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Wed, 7 Jan 2026 17:53:42 +0100
Subject: [PATCH 11/30] Remove old patch

---
 ...orch-2.7.1_skip-tests-requiring-SM90.patch | 34 -------------------
 1 file changed, 34 deletions(-)
 delete mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-tests-requiring-SM90.patch

diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-tests-requiring-SM90.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-tests-requiring-SM90.patch
deleted file mode 100644
index ee60c76ddbcf..000000000000
--- a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-tests-requiring-SM90.patch
+++ /dev/null
@@ -1,34 +0,0 @@
-Avoid it failing on e.g. A100:
-
-> [rank1]:E1022 09:55:08.823000 3580472 torch/testing/_internal/common_distributed.py:721] RuntimeError: CUDA error: device-side assert triggered...
-> [rank1]:E1022 09:55:08.823000 3580472 torch/testing/_internal/common_distributed.py:721]  exiting process 1 with exit code: 10
-> ...
-> :318: st_vec: block: [0,0,0], thread: [87,0,0] Assertion `false` failed.
-> /pytorch-v2.7.1/torch/csrc/distributed/c10d/CUDASymmetricMemory-inl.h:318: st_vec: block: [0,0,0], thread: [88,0,0] Assertion `false` failed.
-
-Author: Alexander Grund (TU Dresden)
-
-diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py
-index 7410255d27a..603ea0b375b 100644
---- a/test/distributed/test_c10d_nccl.py
-+++ b/test/distributed/test_c10d_nccl.py
-@@ -3367,7 +3367,7 @@ class CommTest(test_c10d_common.AbstractCommTest, MultiProcessTestCase):
-     @skip_if_rocm_multiprocess
-     def test_intra_node_comm_all_reduce(self):
-         from torch._C._distributed_c10d import _get_intra_node_comm_usage_counter
--        from torch.testing._internal.common_cuda import SM80OrLater
-+        from torch.testing._internal.common_cuda import SM90OrLater
- 
-         for peer in range(self.world_size):
-             if peer == self.rank:
-@@ -3375,8 +3375,8 @@ class CommTest(test_c10d_common.AbstractCommTest, MultiProcessTestCase):
-             if not torch._C._cuda_canDeviceAccessPeer(self.rank, peer):
-                 raise SkipTest("Test requires p2p access")
- 
--        if not SM80OrLater:
--            raise SkipTest("Test requires sm>=80")
-+        if not SM90OrLater:
-+            raise SkipTest("Test requires sm>=90")
- 
-         store = c10d.FileStore(self.file_name, self.world_size)
-         os.environ["ENABLE_INTRA_NODE_COMM"] = "1"

From 39cf857b523cd86acf32437219fec97ae9638fd6 Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Thu, 8 Jan 2026 14:15:35 +0100
Subject: [PATCH 12/30] Add patch avoiding infinite test hang

---
 .../PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb   |  3 ++
 ...d-multiprocess-tests-hanging-forever.patch | 32 +++++++++++++++++++
 2 files changed, 35 insertions(+)
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_avoid-multiprocess-tests-hanging-forever.patch

diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
index dc399aaa34ca..7a3774727a9b 100644
--- a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
@@ -53,6 +53,7 @@ patches = [
     'PyTorch-2.9.0_skip-test_unbacked_reduction.patch',
     'PyTorch-2.9.0_skip-tests-requiring-CUDA-12.8.patch',
     'PyTorch-2.9.0_skip-unexpected-success-in-test_fake_export.patch',
+    'PyTorch-2.9.1_avoid-multiprocess-tests-hanging-forever.patch',
     'PyTorch-2.9.1_ignore-warning-incompatible-pointer-types.patch',
     'PyTorch-2.9.1_skip-tests-requiring-SM90.patch',
     'PyTorch-2.9.1_skip-RingFlexAttentionTest.patch',
@@ -116,6 +117,8 @@ checksums = [
      '6d79aff5291627b86d8fea025bf2379e4065c7d9cbef5cf83452c35922848728'},
     {'PyTorch-2.9.0_skip-unexpected-success-in-test_fake_export.patch':
      '2e73f71ea0f09e613cc4a108893e7948b6daf239e3fe42fd2d3ae5d43c3cf9de'},
+    {'PyTorch-2.9.1_avoid-multiprocess-tests-hanging-forever.patch':
+     '2ce000ce59ad4157c10382ecceac263de7debab9a4db6cda5bf95038e84d0215'},
     {'PyTorch-2.9.1_ignore-warning-incompatible-pointer-types.patch':
      'c4dad43a5d76e292bb0cada56ea05e8cbd522e3e83749cf3b2c15cd1e4ff6d7b'},
     {'PyTorch-2.9.1_skip-tests-requiring-SM90.patch':
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_avoid-multiprocess-tests-hanging-forever.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_avoid-multiprocess-tests-hanging-forever.patch
new file mode 100644
index 000000000000..468bc2f088e9
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_avoid-multiprocess-tests-hanging-forever.patch
@@ -0,0 +1,32 @@
+A crashed child process in a test might cause the parent to never complete.
+Use a timeout to avoid that.
+See https://github.com/pytorch/pytorch/pull/171972
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py
+index c1f75697fe8..47661c7a1fa 100644
+--- a/torch/testing/_internal/common_distributed.py
++++ b/torch/testing/_internal/common_distributed.py
+@@ -1786,8 +1786,19 @@ class MultiProcContinuousTest(TestCase):
+             if self.rank == self.MAIN_PROCESS_RANK:
+                 logger.debug(f"Waiting for workers to finish {self.id()}")  # noqa: G004
+                 # Wait for the workers to finish the test
+-                for i, completion_queue in enumerate(self.completion_queues):
+-                    rv = completion_queue.get()
++                for i, (p, completion_queue) in enumerate(
++                    zip(self.processes, self.completion_queues)
++                ):
++                    # When the process died before filling the completion queue `get` will never return.
++                    # Hence periodically check the process for liveness
++                    while True:
++                        try:
++                            rv = completion_queue.get(timeout=120)
++                        except queue.Empty:
++                            # If not alive do a last check because the timeout might have happened just before completion
++                            if not p.is_alive() and completion_queue.empty():
++                                rv = RuntimeError(f"Exited with {p.exitcode}")
++                                break
+                     if isinstance(rv, BaseException):
+                         # Hit an exception, re-raise it in the main process.
+                         logger.warning(

From caa6bf062a9c2244d8e647896eaf5043638f9648 Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Thu, 15 Jan 2026 13:30:25 +0100
Subject: [PATCH 13/30] Add patch avoiding infinite test hang

---
 .../PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb   |  2 +-
 ...d-multiprocess-tests-hanging-forever.patch | 47 ++++++++++++++-----
 2 files changed, 37 insertions(+), 12 deletions(-)

diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
index 7a3774727a9b..125b236f078e 100644
--- a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
@@ -118,7 +118,7 @@ checksums = [
     {'PyTorch-2.9.0_skip-unexpected-success-in-test_fake_export.patch':
      '2e73f71ea0f09e613cc4a108893e7948b6daf239e3fe42fd2d3ae5d43c3cf9de'},
     {'PyTorch-2.9.1_avoid-multiprocess-tests-hanging-forever.patch':
-     '2ce000ce59ad4157c10382ecceac263de7debab9a4db6cda5bf95038e84d0215'},
+     '16994db6586f213cc627b9ef141fa8a03877e3975f4aa0b87931f46ed8d03c87'},
     {'PyTorch-2.9.1_ignore-warning-incompatible-pointer-types.patch':
      'c4dad43a5d76e292bb0cada56ea05e8cbd522e3e83749cf3b2c15cd1e4ff6d7b'},
     {'PyTorch-2.9.1_skip-tests-requiring-SM90.patch':
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_avoid-multiprocess-tests-hanging-forever.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_avoid-multiprocess-tests-hanging-forever.patch
index 468bc2f088e9..71fc46e7ddbe 100644
--- a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_avoid-multiprocess-tests-hanging-forever.patch
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_avoid-multiprocess-tests-hanging-forever.patch
@@ -8,7 +8,41 @@ diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_inte
 index c1f75697fe8..47661c7a1fa 100644
 --- a/torch/testing/_internal/common_distributed.py
 +++ b/torch/testing/_internal/common_distributed.py
-@@ -1786,8 +1786,19 @@ class MultiProcContinuousTest(TestCase):
+@@ -621,6 +621,33 @@ def cleanup_temp_dir() -> None:
+         tmp_dir.cleanup()
+ 
+ 
++def retrieve_result_from_process_queue(
++    process: torch.multiprocessing.Process,
++    queue: torch.multiprocessing.Queue,
++    timeout: Optional[int] = None,
++) -> Any:
++    """Get result from queue associated with process.
++
++    When the process finished without putting a result or the timeout expired an exception instance will be returned"""
++    queue_timeout = 120 if timeout is None else max(10, min(120, timeout // 4))
++    start_time = time.time()
++    # Periodically check the process for liveness
++    while True:
++        try:
++            return queue.get(timeout=queue_timeout)
++        except queue.Empty:
++            # If not alive do a last check because the timeout might have happened just before completion
++            if not process.is_alive() and queue.empty():
++                # Clean up process to avoid keeping a zombie process
++                process.terminate()  # Just to be sure
++                process.join(600)  # Usually completes immediately
++                return RuntimeError(f"Exited with {process.exitcode}")
++        if timeout is not None:
++            elapsed = time.time() - start_time
++            if elapsed > timeout:
++                return RuntimeError(f"Process timeout out after {elapsed}s")
++
++
+ # Most tests operate with this worldsize
+ DEFAULT_WORLD_SIZE = 4
+ 
+@@ -1786,8 +1813,10 @@ class MultiProcContinuousTest(TestCase):
              if self.rank == self.MAIN_PROCESS_RANK:
                  logger.debug(f"Waiting for workers to finish {self.id()}")  # noqa: G004
                  # Wait for the workers to finish the test
@@ -17,16 +51,7 @@ index c1f75697fe8..47661c7a1fa 100644
 +                for i, (p, completion_queue) in enumerate(
 +                    zip(self.processes, self.completion_queues)
 +                ):
-+                    # When the process died before filling the completion queue `get` will never return.
-+                    # Hence periodically check the process for liveness
-+                    while True:
-+                        try:
-+                            rv = completion_queue.get(timeout=120)
-+                        except queue.Empty:
-+                            # If not alive do a last check because the timeout might have happened just before completion
-+                            if not p.is_alive() and completion_queue.empty():
-+                                rv = RuntimeError(f"Exited with {p.exitcode}")
-+                                break
++                    rv = retrieve_result_from_process_queue(p, completion_queue)
                      if isinstance(rv, BaseException):
                          # Hit an exception, re-raise it in the main process.
                          logger.warning(

From 0a6dde086e990670d3bbbcd316aaf7d3a781e0e3 Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Thu, 22 Jan 2026 16:16:33 +0100
Subject: [PATCH 14/30] Add patch avoiding infinite test hang

---
 .../p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb       | 2 +-
 ...rch-2.9.1_avoid-multiprocess-tests-hanging-forever.patch | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
index 125b236f078e..ce385c49753a 100644
--- a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
@@ -118,7 +118,7 @@ checksums = [
     {'PyTorch-2.9.0_skip-unexpected-success-in-test_fake_export.patch':
      '2e73f71ea0f09e613cc4a108893e7948b6daf239e3fe42fd2d3ae5d43c3cf9de'},
     {'PyTorch-2.9.1_avoid-multiprocess-tests-hanging-forever.patch':
-     '16994db6586f213cc627b9ef141fa8a03877e3975f4aa0b87931f46ed8d03c87'},
+     '86ce380e69b3b20e010d817889cb1b825b05b4054a045b00f2ac12161b77d7e4'},
     {'PyTorch-2.9.1_ignore-warning-incompatible-pointer-types.patch':
      'c4dad43a5d76e292bb0cada56ea05e8cbd522e3e83749cf3b2c15cd1e4ff6d7b'},
     {'PyTorch-2.9.1_skip-tests-requiring-SM90.patch':
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_avoid-multiprocess-tests-hanging-forever.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_avoid-multiprocess-tests-hanging-forever.patch
index 71fc46e7ddbe..75e8fa00ca00 100644
--- a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_avoid-multiprocess-tests-hanging-forever.patch
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_avoid-multiprocess-tests-hanging-forever.patch
@@ -14,7 +14,7 @@ index c1f75697fe8..47661c7a1fa 100644
  
 +def retrieve_result_from_process_queue(
 +    process: torch.multiprocessing.Process,
-+    queue: torch.multiprocessing.Queue,
++    completion_queue: torch.multiprocessing.Queue,
 +    timeout: Optional[int] = None,
 +) -> Any:
 +    """Get result from queue associated with process.
@@ -25,10 +25,10 @@ index c1f75697fe8..47661c7a1fa 100644
 +    # Periodically check the process for liveness
 +    while True:
 +        try:
-+            return queue.get(timeout=queue_timeout)
++            return completion_queue.get(timeout=queue_timeout)
 +        except queue.Empty:
 +            # If not alive do a last check because the timeout might have happened just before completion
-+            if not process.is_alive() and queue.empty():
++            if not process.is_alive() and completion_queue.empty():
 +                # Clean up process to avoid keeping a zombie process
 +                process.terminate()  # Just to be sure
 +                process.join(600)  # Usually completes immediately

From f093e09e2997b9f8a42772413dafd9ea2fd206a7 Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Thu, 22 Jan 2026 17:26:44 +0100
Subject: [PATCH 15/30] More patches

---
 .../PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb   |  9 +++
 ...yTorch-2.9.1_fix-hypothesis-deadline.patch | 67 +++++++++++++++++++
 ....9.1_fix-iteration-in-fligh-reporter.patch | 17 +++++
 ...orch-2.9.1_fix-test_dist2-decorators.patch | 64 ++++++++++++++++++
 4 files changed, 157 insertions(+)
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-hypothesis-deadline.patch
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-iteration-in-fligh-reporter.patch
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-test_dist2-decorators.patch

diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
index ce385c49753a..4f1aac40e4d9 100644
--- a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
@@ -54,6 +54,9 @@ patches = [
     'PyTorch-2.9.0_skip-tests-requiring-CUDA-12.8.patch',
     'PyTorch-2.9.0_skip-unexpected-success-in-test_fake_export.patch',
     'PyTorch-2.9.1_avoid-multiprocess-tests-hanging-forever.patch',
+    'PyTorch-2.9.1_fix-hypothesis-deadline.patch',
+    'PyTorch-2.9.1_fix-iteration-in-fligh-reporter.patch',
+    'PyTorch-2.9.1_fix-test_dist2-decorators.patch',
     'PyTorch-2.9.1_ignore-warning-incompatible-pointer-types.patch',
     'PyTorch-2.9.1_skip-tests-requiring-SM90.patch',
     'PyTorch-2.9.1_skip-RingFlexAttentionTest.patch',
@@ -119,6 +122,12 @@ checksums = [
      '2e73f71ea0f09e613cc4a108893e7948b6daf239e3fe42fd2d3ae5d43c3cf9de'},
     {'PyTorch-2.9.1_avoid-multiprocess-tests-hanging-forever.patch':
      '86ce380e69b3b20e010d817889cb1b825b05b4054a045b00f2ac12161b77d7e4'},
+    {'PyTorch-2.9.1_fix-hypothesis-deadline.patch':
+     'f7a130669eee9924a303df9e2bd5743ff023a7d994b7a3e43c86dcccf0206c49'},
+    {'PyTorch-2.9.1_fix-iteration-in-fligh-reporter.patch':
+     'ab408275ec66e836112a50054acc4e789ef38196efeb6137c6061d60d9ac9ead'},
+    {'PyTorch-2.9.1_fix-test_dist2-decorators.patch':
+     '118e4b275aaa27c9b51d533cb2a83d74d8fc2754fed22fb30c23ba8227c03608'},
     {'PyTorch-2.9.1_ignore-warning-incompatible-pointer-types.patch':
      'c4dad43a5d76e292bb0cada56ea05e8cbd522e3e83749cf3b2c15cd1e4ff6d7b'},
     {'PyTorch-2.9.1_skip-tests-requiring-SM90.patch':
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-hypothesis-deadline.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-hypothesis-deadline.patch
new file mode 100644
index 000000000000..c526ea336c1d
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-hypothesis-deadline.patch
@@ -0,0 +1,67 @@
+The assertion at the bottom sometimes fails.
+
+From c4312b443fed1fd8e0e28dfe049ce61226936e99 Mon Sep 17 00:00:00 2001
+From: FFFrog <ljw1101.vip@gmail.com>
+Date: Thu, 25 Sep 2025 16:32:19 +0800
+Subject: [PATCH] [Tools] Adapting the Hypothesis library (version 5.x) for use
+ with the PyTorch framework (#163748)
+
+Starting from version 5.x, the Hypothesis library removed the timeout setting and only retained the deadline.
+Pull Request resolved: https://github.com/pytorch/pytorch/pull/163748
+Approved by: https://github.com/albanD, https://github.com/Skylion007
+---
+ torch/testing/_internal/hypothesis_utils.py | 24 +++++++++++++++------
+ 1 file changed, 18 insertions(+), 6 deletions(-)
+
+diff --git a/torch/testing/_internal/hypothesis_utils.py b/torch/testing/_internal/hypothesis_utils.py
+index f02ef4c9e04b0..a00e1e1a048a0 100644
+--- a/torch/testing/_internal/hypothesis_utils.py
++++ b/torch/testing/_internal/hypothesis_utils.py
+@@ -7,6 +7,7 @@
+ 
+ import hypothesis
+ from functools import reduce
++from importlib.metadata import version
+ from hypothesis import assume
+ from hypothesis import settings
+ from hypothesis import strategies as st
+@@ -346,22 +347,33 @@ def tensor_conv(
+ 
+     return X, W, b, groups, tr
+ 
++
+ # We set the deadline in the currently loaded profile.
+ # Creating (and loading) a separate profile overrides any settings the user
+ # already specified.
+-hypothesis_version = hypothesis.version.__version_info__
+-current_settings = settings._profiles[settings._current_profile].__dict__
+-current_settings['deadline'] = None
+-if hypothesis_version >= (3, 16, 0) and hypothesis_version < (5, 0, 0):
+-    current_settings['timeout'] = hypothesis.unlimited
++hypothesis_version = tuple(map(int, version("hypothesis").split(".")[:3]))
++
++if (3, 16, 0) <= hypothesis_version < (3, 27, 0):
++    # Hypothesis 3.16 → 3.26: use `timeout` instead of `deadline`
++    settings.register_profile("no_deadline", timeout=hypothesis.unlimited)
++else:
++    # Hypothesis >=3.27: use `deadline=None`
++    settings.register_profile("no_deadline", deadline=None)
++
++# Activate the profile
++settings.load_profile("no_deadline")
++
++
+ def assert_deadline_disabled():
++    """Check that deadlines are effectively disabled across Hypothesis versions."""
+     if hypothesis_version < (3, 27, 0):
+         import warnings
++
+         warning_message = (
+             "Your version of hypothesis is outdated. "
+             "To avoid `DeadlineExceeded` errors, please update. "
+             f"Current hypothesis version: {hypothesis.__version__}"
+         )
+-        warnings.warn(warning_message)
++        warnings.warn(warning_message, stacklevel=2)
+     else:
+         assert settings().deadline is None
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-iteration-in-fligh-reporter.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-iteration-in-fligh-reporter.patch
new file mode 100644
index 000000000000..3ff313cbe125
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-iteration-in-fligh-reporter.patch
@@ -0,0 +1,17 @@
+Avoid an error caused by modifying dict while iterating it.
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/tools/flight_recorder/components/types.py b/tools/flight_recorder/components/types.py
+index 20e093688ba..98192aeb92c 100644
+--- a/tools/flight_recorder/components/types.py
++++ b/tools/flight_recorder/components/types.py
+@@ -164,7 +164,7 @@ class Database(NamedTuple):
+ # TODO: We need to add a schema for the following
+ types = [
+     TypeInfo.from_type(t)  # type: ignore[type-var]
+-    for t in globals().values()
++    for t in list(globals().values())
+     if (
+         isinstance(t, type)
+         and issubclass(t, tuple)
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-test_dist2-decorators.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-test_dist2-decorators.patch
new file mode 100644
index 000000000000..9d98770ae698
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-test_dist2-decorators.patch
@@ -0,0 +1,64 @@
+The requires_gloo/requires_nccl decorator cause the function to just return.
+In the way they are used this skips the initialization done by a helper function.
+So the test is not skipped and then fails due to missing variables.
+
+Decorate the class instead.
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/test/distributed/test_dist2.py b/test/distributed/test_dist2.py
+index b335eff1c21..ff5a1e8c028 100644
+--- a/test/distributed/test_dist2.py
++++ b/test/distributed/test_dist2.py
+@@ -256,10 +256,10 @@ class Dist2MultiProcessTestCase(MultiProcessTestCase):
+             self.assertEqual(merged_pg.group_name, "merged_pg")
+ 
+ 
++@requires_gloo()
+ class ProcessGroupGlooTest(Dist2MultiProcessTestCase):
+     device = torch.device("cpu")
+ 
+-    @requires_gloo()
+     def new_group(self) -> torch.distributed.ProcessGroup:
+         os.environ["RANK"] = str(self.rank)
+         os.environ["WORLD_SIZE"] = str(self.world_size)
+@@ -273,8 +273,8 @@ class ProcessGroupGlooTest(Dist2MultiProcessTestCase):
+         )
+ 
+ 
++@requires_nccl()
+ class ProcessGroupNCCLTest(Dist2MultiProcessTestCase):
+-    @requires_nccl()
+     @skip_if_lt_x_gpu(2)
+     def new_group(self) -> torch.distributed.ProcessGroup:
+         os.environ["RANK"] = str(self.rank)
+diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py
+index c1f75697fe8..d513510d955 100644
+--- a/torch/testing/_internal/common_distributed.py
++++ b/torch/testing/_internal/common_distributed.py
+@@ -330,11 +330,7 @@ def with_dist_debug_levels(levels):
+     return decorator
+ 
+ 
+-def requires_gloo():
+-    return skip_but_pass_in_sandcastle_if(
+-        not c10d.is_gloo_available(),
+-        "c10d was not compiled with the Gloo backend",
+-    )
++requires_gloo = unittest.skipUnless(c10d.is_gloo_available(), "c10d was not compiled with the Gloo backend")
+ 
+ 
+ def requires_nccl_version(version, msg):
+@@ -360,11 +356,7 @@ def requires_nccl_version(version, msg):
+         return decorator
+ 
+ 
+-def requires_nccl():
+-    return skip_but_pass_in_sandcastle_if(
+-        not c10d.is_nccl_available(),
+-        "c10d was not compiled with the NCCL backend",
+-    )
++requires_nccl = unittest.skipUnless(c10d.is_nccl_available(), "c10d was not compiled with the NCCL backend")
+ 
+ 
+ def requires_ucc():

From 0ecb16a372ad5ef0922fb11d912e5d63af2ff06c Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Thu, 5 Feb 2026 14:21:56 +0100
Subject: [PATCH 16/30] Fix patched skip markers

---
 .../PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb        |  2 +-
 .../PyTorch-2.9.1_fix-test_dist2-decorators.patch  | 14 ++++++--------
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
index 4f1aac40e4d9..da13a6d52ccf 100644
--- a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
@@ -127,7 +127,7 @@ checksums = [
     {'PyTorch-2.9.1_fix-iteration-in-fligh-reporter.patch':
      'ab408275ec66e836112a50054acc4e789ef38196efeb6137c6061d60d9ac9ead'},
     {'PyTorch-2.9.1_fix-test_dist2-decorators.patch':
-     '118e4b275aaa27c9b51d533cb2a83d74d8fc2754fed22fb30c23ba8227c03608'},
+     'bf4ed805f00775ed33351de7bce40ebf4eac16aff6c61d2e91790982bc43d73b'},
     {'PyTorch-2.9.1_ignore-warning-incompatible-pointer-types.patch':
      'c4dad43a5d76e292bb0cada56ea05e8cbd522e3e83749cf3b2c15cd1e4ff6d7b'},
     {'PyTorch-2.9.1_skip-tests-requiring-SM90.patch':
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-test_dist2-decorators.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-test_dist2-decorators.patch
index 9d98770ae698..fffd633b4511 100644
--- a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-test_dist2-decorators.patch
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-test_dist2-decorators.patch
@@ -36,29 +36,27 @@ diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_inte
 index c1f75697fe8..d513510d955 100644
 --- a/torch/testing/_internal/common_distributed.py
 +++ b/torch/testing/_internal/common_distributed.py
-@@ -330,11 +330,7 @@ def with_dist_debug_levels(levels):
-     return decorator
+@@ -331,10 +331,7 @@ def with_dist_debug_levels(levels):
  
  
--def requires_gloo():
+ def requires_gloo():
 -    return skip_but_pass_in_sandcastle_if(
 -        not c10d.is_gloo_available(),
 -        "c10d was not compiled with the Gloo backend",
 -    )
-+requires_gloo = unittest.skipUnless(c10d.is_gloo_available(), "c10d was not compiled with the Gloo backend")
++    return unittest.skipUnless(c10d.is_gloo_available(), "c10d was not compiled with the Gloo backend")
  
  
  def requires_nccl_version(version, msg):
-@@ -360,11 +356,7 @@ def requires_nccl_version(version, msg):
-         return decorator
+@@ -361,10 +358,7 @@ def requires_nccl_version(version, msg):
  
  
--def requires_nccl():
+ def requires_nccl():
 -    return skip_but_pass_in_sandcastle_if(
 -        not c10d.is_nccl_available(),
 -        "c10d was not compiled with the NCCL backend",
 -    )
-+requires_nccl = unittest.skipUnless(c10d.is_nccl_available(), "c10d was not compiled with the NCCL backend")
++    return unittest.skipUnless(c10d.is_nccl_available(), "c10d was not compiled with the NCCL backend")
  
  
  def requires_ucc():

From 3d7005bad65022bc12664809df9ce7b652832269 Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Thu, 5 Feb 2026 14:24:39 +0100
Subject: [PATCH 17/30] Add comment for DISABLE_ADDR2LINE

---
 .../p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb            | 1 +
 1 file changed, 1 insertion(+)

diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
index da13a6d52ccf..b4f5b6369c1b 100644
--- a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
@@ -217,6 +217,7 @@ excluded_tests = {
 }
 
 runtest = (
+    # Disable symbol resolution in stack traces that can cause hangs and slowdowns
     ' TORCH_DISABLE_ADDR2LINE=1'
     ' TORCHINDUCTOR_CUTLASS_DIR=%(start_dir)s/third_party/cutlass'
     ' PYTEST_ADDOPTS=--full-trace'

From 26ab819296606d1fec755b02d37c96631e9c6aa5 Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Mon, 9 Feb 2026 15:52:11 +0100
Subject: [PATCH 18/30] Set test timeout

---
 .../PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb   |  4 ++++
 .../PyTorch-2.9.1_set-test-timeout.patch      | 19 +++++++++++++++++++
 2 files changed, 23 insertions(+)
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_set-test-timeout.patch

diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
index b4f5b6369c1b..e908a1984eec 100644
--- a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
@@ -58,6 +58,8 @@ patches = [
     'PyTorch-2.9.1_fix-iteration-in-fligh-reporter.patch',
     'PyTorch-2.9.1_fix-test_dist2-decorators.patch',
     'PyTorch-2.9.1_ignore-warning-incompatible-pointer-types.patch',
+    'PyTorch-2.9.1_set-test-timeout.patch',
+    'PyTorch-2.9.1_skip-bool-bessel-tests.patch',
     'PyTorch-2.9.1_skip-tests-requiring-SM90.patch',
     'PyTorch-2.9.1_skip-RingFlexAttentionTest.patch',
 ]
@@ -130,6 +132,8 @@ checksums = [
      'bf4ed805f00775ed33351de7bce40ebf4eac16aff6c61d2e91790982bc43d73b'},
     {'PyTorch-2.9.1_ignore-warning-incompatible-pointer-types.patch':
      'c4dad43a5d76e292bb0cada56ea05e8cbd522e3e83749cf3b2c15cd1e4ff6d7b'},
+    {'PyTorch-2.9.1_set-test-timeout.patch': '15fa1149c250b1333b0bc491f659aaf89d5d6eaf6df5ebc81eea545478c1239c'},
+    {'PyTorch-2.9.1_skip-bool-bessel-tests.patch': '9c07cddaf4c1b17fe9a54874f10b8edbfb85785c40ac1e3aea11c7f4b5abca69'},
     {'PyTorch-2.9.1_skip-tests-requiring-SM90.patch':
      '7db02152db2ae70c0fd4c4602fe381e26a74b8e4f7b16b1a3554b2353d761b10'},
     {'PyTorch-2.9.1_skip-RingFlexAttentionTest.patch':
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_set-test-timeout.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_set-test-timeout.patch
new file mode 100644
index 000000000000..6bfff62d3d15
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_set-test-timeout.patch
@@ -0,0 +1,19 @@
+Some tests might hang forever and the default timeout will only be set when
+a) --enable-timeout is passed, and
+b) a `.additional_ci_files/test-times.json` exists at the root
+
+Manually set a timeout of 120min which should be enough for any single test.
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/test/run_test.py b/test/run_test.py
+--- a/test/run_test.py
++++ b/test/run_test.py
+@@ -604,6 +604,7 @@ def run_test(
+         if is_cpp_test
+         else None
+     )
++    timeout = 60 * 120
+     print_to_stderr(f"Executing {command} ... [{datetime.now()}]")
+ 
+     with ExitStack() as stack:

From dae9b55850761fdaf9bfb93d3a262accba67998c Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Tue, 10 Feb 2026 09:02:35 +0100
Subject: [PATCH 19/30] Add GCC 14 patch

---
 .../PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb   |  5 ++
 .../PyTorch-2.9.1_GCC14-ARM-workaround.patch  | 53 +++++++++++++++++++
 ...PyTorch-2.9.1_skip-bool-bessel-tests.patch | 50 +++++++++++++++++
 3 files changed, 108 insertions(+)
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_GCC14-ARM-workaround.patch
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-bool-bessel-tests.patch

diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
index e908a1984eec..1e24641cb1fe 100644
--- a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
@@ -57,9 +57,11 @@ patches = [
     'PyTorch-2.9.1_fix-hypothesis-deadline.patch',
     'PyTorch-2.9.1_fix-iteration-in-fligh-reporter.patch',
     'PyTorch-2.9.1_fix-test_dist2-decorators.patch',
+    'PyTorch-2.9.1_GCC14-ARM-workaround.patch',
     'PyTorch-2.9.1_ignore-warning-incompatible-pointer-types.patch',
     'PyTorch-2.9.1_set-test-timeout.patch',
     'PyTorch-2.9.1_skip-bool-bessel-tests.patch',
+    'PyTorch-2.9.1_skip-test_norm_matrix_degenerate_shapes.patch',
     'PyTorch-2.9.1_skip-tests-requiring-SM90.patch',
     'PyTorch-2.9.1_skip-RingFlexAttentionTest.patch',
 ]
@@ -130,10 +132,13 @@ checksums = [
      'ab408275ec66e836112a50054acc4e789ef38196efeb6137c6061d60d9ac9ead'},
     {'PyTorch-2.9.1_fix-test_dist2-decorators.patch':
      'bf4ed805f00775ed33351de7bce40ebf4eac16aff6c61d2e91790982bc43d73b'},
+    {'PyTorch-2.9.1_GCC14-ARM-workaround.patch': 'ea8a8662e20fae2fb3a74c7f8bf390aba80a598ab37f9131c720d25ebb14965d'},
     {'PyTorch-2.9.1_ignore-warning-incompatible-pointer-types.patch':
      'c4dad43a5d76e292bb0cada56ea05e8cbd522e3e83749cf3b2c15cd1e4ff6d7b'},
     {'PyTorch-2.9.1_set-test-timeout.patch': '15fa1149c250b1333b0bc491f659aaf89d5d6eaf6df5ebc81eea545478c1239c'},
     {'PyTorch-2.9.1_skip-bool-bessel-tests.patch': '9c07cddaf4c1b17fe9a54874f10b8edbfb85785c40ac1e3aea11c7f4b5abca69'},
+    {'PyTorch-2.9.1_skip-test_norm_matrix_degenerate_shapes.patch':
+     'd6082e62696a38dbfbc87c228f7ccb54dba4cfc615ce158f1f3bf77e6e30ff4f'},
     {'PyTorch-2.9.1_skip-tests-requiring-SM90.patch':
      '7db02152db2ae70c0fd4c4602fe381e26a74b8e4f7b16b1a3554b2353d761b10'},
     {'PyTorch-2.9.1_skip-RingFlexAttentionTest.patch':
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_GCC14-ARM-workaround.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_GCC14-ARM-workaround.patch
new file mode 100644
index 000000000000..e0504c90d06b
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_GCC14-ARM-workaround.patch
@@ -0,0 +1,53 @@
+From 8fd509399e25cb4b265dff663d3f777406001f2e Mon Sep 17 00:00:00 2001
+From: Nikita Shulga <2453524+malfet@users.noreply.github.com>
+Date: Tue, 10 Feb 2026 04:35:39 +0000
+Subject: [PATCH] Blunter GCC 14.2.0 workaround for SVE compilation (#174647)
+
+Updated preprocessor directive for GCC version check and removed BF16 condition. I.e. right now SVE256 compilation with gcc-14.2 on Debian13 for ` -march=armv8-a+sve+bf16`
+
+Without the fix, compilation fails with
+```
+In file included from /home/dev/git/pytorch/pytorch/build/aten/src/ATen/native/cpu/Unfold2d.cpp.SVE256.cpp:1:
+/home/dev/git/pytorch/pytorch/aten/src/ATen/native/cpu/Unfold2d.cpp: In function 'void at::native::{anonymous}::unfolded2d_acc_kernel(c10::ScalarType, void*, void*, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, bool)':
+/home/dev/git/pytorch/pytorch/aten/src/ATen/native/cpu/Unfold2d.cpp:225:1: error: unrecognizable insn:
+  225 | }
+      | ^
+(insn 1371 1370 1372 101 (set (reg:VNx16BI 3235)
+        (unspec:VNx16BI [
+                (reg:VNx16BI 3232)
+                (reg:VNx8BI 3234)
+                (const_vector:VNx4BI [
+                        (const_int 0 [0]) repeated x8
+                    ])
+            ] UNSPEC_TRN1_CONV)) "/home/dev/git/pytorch/pytorch/torch/headeronly/util/bit_cast.h":40:14 -1
+     (nil))
+during RTL pass: vregs
+/home/dev/git/pytorch/pytorch/aten/src/ATen/native/cpu/Unfold2d.cpp:225:1: internal compiler error: in extract_insn, at recog.cc:2812
+```
+
+Not sure what compelled me to put such a narrow restriction in https://github.com/pytorch/pytorch/pull/157867
+
+Fixes https://github.com/pytorch/pytorch/issues/172630
+
+Pull Request resolved: https://github.com/pytorch/pytorch/pull/174647
+Approved by: https://github.com/seemethere
+---
+ aten/src/ATen/native/cpu/Unfold2d.cpp | 5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+diff --git a/aten/src/ATen/native/cpu/Unfold2d.cpp b/aten/src/ATen/native/cpu/Unfold2d.cpp
+index ed69998e99f79..9ae1391e2603e 100644
+--- a/aten/src/ATen/native/cpu/Unfold2d.cpp
++++ b/aten/src/ATen/native/cpu/Unfold2d.cpp
+@@ -169,8 +169,9 @@ void unfolded2d_acc_channels_last(
+ 
+ /* note: due to write issues, this one cannot be parallelized as well as
+  * unfolded2d_copy */
+-#if defined(__GNUC__) && __GNUC__ == 14 && defined(__ARM_FEATURE_SVE) && !defined(__ARM_FEATURE_BF16)
+-// Workaround for gcc-14.2.0 ICE during RTL pass: vregs when compiling for SVE without BF16
++#if defined(__GNUC__) && __GNUC__ == 14 && defined(__ARM_FEATURE_SVE)
++// Workaround for gcc-14.2.0 ICE during RTL pass: vregs when compiling for SVE
++// NS: With or without BF16, see https://github.com/pytorch/pytorch/issues/172630
+ __attribute__((optimize("no-tree-vectorize")))
+ #endif
+ void unfolded2d_acc_kernel(
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-bool-bessel-tests.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-bool-bessel-tests.patch
new file mode 100644
index 000000000000..827601fa079e
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-bool-bessel-tests.patch
@@ -0,0 +1,50 @@
+From 08de54f1ea954a6da3b45d794972d3df3d72df02 Mon Sep 17 00:00:00 2001
+From: Rob Timpe <rtimpe@openteams.com>
+Date: Thu, 13 Nov 2025 02:23:06 +0000
+Subject: [PATCH] [3.14] Skip failing spherical_bessel_j0 tests (#167691)
+
+Starting with scipy 1.15, bool inputs error out.
+Pull Request resolved: https://github.com/pytorch/pytorch/pull/167691
+Approved by: https://github.com/williamwen42
+---
+ .../_internal/opinfo/definitions/special.py   | 20 +++++++++++++++++++
+ 1 file changed, 20 insertions(+)
+
+diff --git a/torch/testing/_internal/opinfo/definitions/special.py b/torch/testing/_internal/opinfo/definitions/special.py
+index f9dc471ca98aa..47cbcb1fb4268 100644
+--- a/torch/testing/_internal/opinfo/definitions/special.py
++++ b/torch/testing/_internal/opinfo/definitions/special.py
+@@ -648,6 +648,16 @@ def sample_inputs_erfcx(op_info, device, dtype, requires_grad, **kwargs):
+         dtypes=all_types_and(torch.bool),
+         ref=lambda x: scipy.special.spherical_jn(0, x) if TEST_SCIPY else None,
+         supports_autograd=False,
++        skips=(
++            DecorateInfo(
++                unittest.skip(
++                    "Scipy doesn't support bool inputs to spherical_bessel_j0"
++                ),
++                "TestUnaryUfuncs",
++                "test_reference_numerics_normal",
++                dtypes=(torch.bool,),
++            ),
++        ),
+     ),
+ ]
+ 
+@@ -768,6 +778,16 @@ def sample_inputs_erfcx(op_info, device, dtype, requires_grad, **kwargs):
+                 }
+             ),
+         ),
++        skips=(
++            DecorateInfo(
++                unittest.skip(
++                    "Scipy doesn't support bool inputs to spherical_bessel_j0"
++                ),
++                "TestUnaryUfuncs",
++                "test_reference_numerics_normal",
++                dtypes=(torch.bool,),
++            ),
++        ),
+     ),
+     #
+     # Elementwise Binary Special OpInfos

From 0bb1f1a4d1ab1c6618e1e45b0945804f0351da3d Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Tue, 10 Feb 2026 09:14:03 +0100
Subject: [PATCH 20/30] Add missing patch

---
 ..._skip-test_norm_matrix_degenerate_shapes.patch | 15 +++++++++++++++
 1 file changed, 15 insertions(+)
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-test_norm_matrix_degenerate_shapes.patch

diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-test_norm_matrix_degenerate_shapes.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-test_norm_matrix_degenerate_shapes.patch
new file mode 100644
index 000000000000..b8437e2b5bbd
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-test_norm_matrix_degenerate_shapes.patch
@@ -0,0 +1,15 @@
+This test no longer works with numpy >= 2.3.0
+See https://github.com/pytorch/pytorch/commit/a4a5d03779d876043b0a1f0c565659fc2298afd2
+
+Author: Alexander Grund (TU Dresden)
+diff --git a/test/test_linalg.py b/test/test_linalg.py
+--- a/test/test_linalg.py
++++ b/test/test_linalg.py
+@@ -2040,6 +2040,7 @@ class TestLinalg(TestCase):
+                     run_test_case(input, ord, dim, keepdim)
+ 
+     # Test degenerate shape results match numpy for linalg.norm matrix norms
++    @unittest.skipIf(np.lib.NumpyVersion(np.__version__) >= '2.3.0', 'Numpy changed handling of degenerate inputs in 2.3.0')
+     @skipCUDAIfNoMagma
+     @skipCPUIfNoLapack
+     @dtypes(torch.float, torch.double, torch.cfloat, torch.cdouble)

From c22bceea7a9d99adb103e76f26244ff0a79e518d Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Thu, 12 Feb 2026 16:35:25 +0100
Subject: [PATCH 21/30] Add patches for test fixes and skip slow&disabled tests

---
 .../PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb   | 41 +++++++-
 ...9.1_check-device-avail-test_schedule.patch | 19 ++++
 .../PyTorch-2.9.1_disable-slow-tests.patch    | 40 ++++++++
 ...-TestExportOpInfoCPU-with-single-GPU.patch | 20 ++++
 ...fix-test_recursion_in_except_handler.patch | 34 +++++++
 .../PyTorch-2.9.1_normalize_tree_output.patch | 24 +++++
 ...ention-test_block_mask_non_divisible.patch | 17 ++++
 ...-attention-tests-on-unsupported-cpus.patch | 97 +++++++++++++++++++
 ...point_save_failure_continues_serving.patch | 28 ++++++
 ...i_head_attention_forward_cpu_float32.patch | 12 +++
 10 files changed, 329 insertions(+), 3 deletions(-)
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_check-device-avail-test_schedule.patch
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_disable-slow-tests.patch
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-TestExportOpInfoCPU-with-single-GPU.patch
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-test_recursion_in_except_handler.patch
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_normalize_tree_output.patch
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-flex-attention-test_block_mask_non_divisible.patch
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-flex-attention-tests-on-unsupported-cpus.patch
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-test_checkpoint_save_failure_continues_serving.patch
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-test_dtensor_op_db_nn_functional_multi_head_attention_forward_cpu_float32.patch

diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
index 1e24641cb1fe..06569e192f03 100644
--- a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
@@ -9,15 +9,24 @@ PyTorch is a deep learning framework that puts Python first."""
 toolchain = {'name': 'foss', 'version': '2025b'}
 
 local_six_version = '1.11.0'
+# This is specific to a (tagged) release.
+# Extract from `get_disabled_tests` in tools/stats/import_test_stats.py
+local_disabled_tests_S3_ID = 'UsscdNP.2GMOzUxAvqIx8GAj4MuhX1Xi'
 source_urls = [GITHUB_RELEASE]
 sources = [
     '%(namelower)s-v%(version)s.tar.gz',
+    {
+        'filename': '%(name)s-%(version)s-disabled-tests.json',
+        'download_filename': f'disabled-tests-condensed.json?versionId={local_disabled_tests_S3_ID}',
+        'source_urls': ['https://ossci-metrics.s3.amazonaws.com'],
+        # See `DEFAULT_DISABLED_TESTS_FILE` in torch/testing/_internal/common_utils.py
+        'extract_cmd': 'cp %s %(builddir)s/pytorch-v%(version)s/test/.pytorch-disabled-tests.json',
+    },
     {
         # Avoid downloading this during the build, see third_party/NNPACK/cmake/DownloadSix.cmake for the version
         'filename': f'six-{local_six_version}.tar.gz',
-        'source_urls': [
-            'https://pypi.python.org/packages/16/d8/bc6316cf98419719bd59c91742194c111b6f2e85abac88e496adefaf7afe'],
-    }
+        'source_urls': ['https://pypi.python.org/packages/source/s/six'],
+    },
 ]
 patches = [
     'PyTorch-1.12.1_add-hypothesis-suppression.patch',
@@ -47,6 +56,7 @@ patches = [
     'PyTorch-2.9.0_revert-pybind11-3-change.patch',
     'PyTorch-2.9.0_skip-test_benchmark_on_non_zero_device.patch',
     'PyTorch-2.9.0_skip-test_convolution1-on-H100.patch',
+    'PyTorch-2.9.1_skip-test_dtensor_op_db_nn_functional_multi_head_attention_forward_cpu_float32.patch',
     'PyTorch-2.9.0_skip-test_inductor_all_gather_into_tensor_coalesced.patch',
     'PyTorch-2.9.0_skip-test_original_aten_preserved_pad_mm.patch',
     'PyTorch-2.9.0_skip-test_override-without-CUDA.patch',
@@ -54,19 +64,28 @@ patches = [
     'PyTorch-2.9.0_skip-tests-requiring-CUDA-12.8.patch',
     'PyTorch-2.9.0_skip-unexpected-success-in-test_fake_export.patch',
     'PyTorch-2.9.1_avoid-multiprocess-tests-hanging-forever.patch',
+    'PyTorch-2.9.1_check-device-avail-test_schedule.patch',
+    'PyTorch-2.9.1_disable-slow-tests.patch',
     'PyTorch-2.9.1_fix-hypothesis-deadline.patch',
     'PyTorch-2.9.1_fix-iteration-in-fligh-reporter.patch',
+    'PyTorch-2.9.1_fix-TestExportOpInfoCPU-with-single-GPU.patch',
     'PyTorch-2.9.1_fix-test_dist2-decorators.patch',
+    'PyTorch-2.9.1_fix-test_recursion_in_except_handler.patch',
     'PyTorch-2.9.1_GCC14-ARM-workaround.patch',
     'PyTorch-2.9.1_ignore-warning-incompatible-pointer-types.patch',
+    'PyTorch-2.9.1_normalize_tree_output.patch',
     'PyTorch-2.9.1_set-test-timeout.patch',
     'PyTorch-2.9.1_skip-bool-bessel-tests.patch',
+    'PyTorch-2.9.1_skip-flex-attention-tests-on-unsupported-cpus.patch',
+    'PyTorch-2.9.1_skip-flex-attention-test_block_mask_non_divisible.patch',
+    'PyTorch-2.9.1_skip-test_checkpoint_save_failure_continues_serving.patch',
     'PyTorch-2.9.1_skip-test_norm_matrix_degenerate_shapes.patch',
     'PyTorch-2.9.1_skip-tests-requiring-SM90.patch',
     'PyTorch-2.9.1_skip-RingFlexAttentionTest.patch',
 ]
 checksums = [
     {'pytorch-v2.9.1.tar.gz': 'e17504700ebc4c87f9b57059df1c4d790b769458c04db144c7a92aea90f2c92b'},
+    {'PyTorch-2.9.1-disabled-tests.json': '471f8aa36e056173d09ffd421ead45539a8d35fec6e61a8a0050d92a5fcd9f04'},
     {'six-1.11.0.tar.gz': '70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9'},
     {'PyTorch-1.12.1_add-hypothesis-suppression.patch':
      'e71ffb94ebe69f580fa70e0de84017058325fdff944866d6bd03463626edc32c'},
@@ -112,6 +131,8 @@ checksums = [
      '85e236431d1a5da3fb7fccc2554640898c29f5fab46a41d15b3ab61dd1f924fc'},
     {'PyTorch-2.9.0_skip-test_convolution1-on-H100.patch':
      '704750c7cc08b58779907d608cd4b7505043e394fb27530b16d72a0dc27c277e'},
+    {'PyTorch-2.9.1_skip-test_dtensor_op_db_nn_functional_multi_head_attention_forward_cpu_float32.patch':
+     'e57486cc42f3dbcae29753168febc251d070a283229e2d76ccbdf19fee53f06e'},
     {'PyTorch-2.9.0_skip-test_inductor_all_gather_into_tensor_coalesced.patch':
      '644153d4c1d8267c0631df2902a6dfe8ec2a197f3374f2a2f5654e6bd0edc05e'},
     {'PyTorch-2.9.0_skip-test_original_aten_preserved_pad_mm.patch':
@@ -126,17 +147,31 @@ checksums = [
      '2e73f71ea0f09e613cc4a108893e7948b6daf239e3fe42fd2d3ae5d43c3cf9de'},
     {'PyTorch-2.9.1_avoid-multiprocess-tests-hanging-forever.patch':
      '86ce380e69b3b20e010d817889cb1b825b05b4054a045b00f2ac12161b77d7e4'},
+    {'PyTorch-2.9.1_check-device-avail-test_schedule.patch':
+     '64c28d38ce69147565509add36d310473ce46f14a0a876d38b5049cb7fce9817'},
+    {'PyTorch-2.9.1_disable-slow-tests.patch': '76e6d8f7366b91a0ddc65f73685f2b09988bb5537d10d294f9bb6a48c7fec3d0'},
     {'PyTorch-2.9.1_fix-hypothesis-deadline.patch':
      'f7a130669eee9924a303df9e2bd5743ff023a7d994b7a3e43c86dcccf0206c49'},
     {'PyTorch-2.9.1_fix-iteration-in-fligh-reporter.patch':
      'ab408275ec66e836112a50054acc4e789ef38196efeb6137c6061d60d9ac9ead'},
+    {'PyTorch-2.9.1_fix-TestExportOpInfoCPU-with-single-GPU.patch':
+     'bdddf5a9ba47d57ec96f4bbefc3b85c4904e44de93dc5c7a65bc03e343035ae9'},
     {'PyTorch-2.9.1_fix-test_dist2-decorators.patch':
      'bf4ed805f00775ed33351de7bce40ebf4eac16aff6c61d2e91790982bc43d73b'},
+    {'PyTorch-2.9.1_fix-test_recursion_in_except_handler.patch':
+     'e7a64dbdc202151c5bff6aac86d77b0f6e7c52dc3117e3bfe9b57ec1371f87ad'},
     {'PyTorch-2.9.1_GCC14-ARM-workaround.patch': 'ea8a8662e20fae2fb3a74c7f8bf390aba80a598ab37f9131c720d25ebb14965d'},
     {'PyTorch-2.9.1_ignore-warning-incompatible-pointer-types.patch':
      'c4dad43a5d76e292bb0cada56ea05e8cbd522e3e83749cf3b2c15cd1e4ff6d7b'},
+    {'PyTorch-2.9.1_normalize_tree_output.patch': '7d5994580339b73c28de595d9e5a0448db97b7d284f17efd18909e4613d170df'},
     {'PyTorch-2.9.1_set-test-timeout.patch': '15fa1149c250b1333b0bc491f659aaf89d5d6eaf6df5ebc81eea545478c1239c'},
     {'PyTorch-2.9.1_skip-bool-bessel-tests.patch': '9c07cddaf4c1b17fe9a54874f10b8edbfb85785c40ac1e3aea11c7f4b5abca69'},
+    {'PyTorch-2.9.1_skip-flex-attention-tests-on-unsupported-cpus.patch':
+     'e544f765beac7bdb3fc0ada98a3f92fd7e511ed8874de085aa2f213cca769d40'},
+    {'PyTorch-2.9.1_skip-flex-attention-test_block_mask_non_divisible.patch':
+     'd8489c192da549083569e09e5f94d2a83c9e41e111b1322f86512a9c5a58c0d9'},
+    {'PyTorch-2.9.1_skip-test_checkpoint_save_failure_continues_serving.patch':
+     'fa22d7ed5bf20afa4798c8af3ec732b1a3f530ecc4be5c223b3796e839b0b812'},
     {'PyTorch-2.9.1_skip-test_norm_matrix_degenerate_shapes.patch':
      'd6082e62696a38dbfbc87c228f7ccb54dba4cfc615ce158f1f3bf77e6e30ff4f'},
     {'PyTorch-2.9.1_skip-tests-requiring-SM90.patch':
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_check-device-avail-test_schedule.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_check-device-avail-test_schedule.patch
new file mode 100644
index 000000000000..202d1e4a1fc7
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_check-device-avail-test_schedule.patch
@@ -0,0 +1,19 @@
+Some tests fail if no accelerator is available.
+> RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU [...]
+
+Check for availability to trigger CPU fallback.
+
+Author: Alexander Grund (TU Dresden)
+diff --git a/test/distributed/pipelining/test_schedule.py b/test/distributed/pipelining/test_schedule.py
+index dabf3d78a6f..d3b8bf13168 100644
+--- a/test/distributed/pipelining/test_schedule.py
++++ b/test/distributed/pipelining/test_schedule.py
+@@ -53,7 +53,7 @@ from torch.testing._internal.distributed.fake_pg import FakeStore
+ 
+ ARTIFACTS_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "artifacts")
+ 
+-device = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"
++device = acc.type if (acc := torch.accelerator.current_accelerator(check_available=True)) else "cpu"
+ logger = logging.getLogger(__name__)
+ torch.manual_seed(0)
+ 
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_disable-slow-tests.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_disable-slow-tests.patch
new file mode 100644
index 000000000000..8f6d6e0c7677
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_disable-slow-tests.patch
@@ -0,0 +1,40 @@
+On CI it defaults to importing JSON files with slow and disabled tests.
+Those are then skipped upon execution.
+
+Enable the default for non-CI environments to cut down testing time.
+Don't check for SANDCASTLE when determining whether to skip disabled tests.
+However, the disabled-tests JSON file needs to be downloaded from S3 and placed at "tests/.pytorch-disabled-tests.json".
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/test/run_test.py b/test/run_test.py
+index 44a15d4ab2c..269d4206f3e 100755
+--- a/test/run_test.py
++++ b/test/run_test.py
+@@ -531,7 +531,7 @@ def run_test(
+ 
+     # NB: These features are not available for C++ tests, but there is little incentive
+     # to implement it because we have never seen a flaky C++ test before.
+-    if IS_CI and not is_cpp_test:
++    if not is_cpp_test:
+         ci_args = ["--import-slow-tests", "--import-disabled-tests"]
+         if RERUN_DISABLED_TESTS:
+             ci_args.append("--rerun-disabled-tests")
+diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
+index bfc568bc146..7ef37cccccb 100644
+--- a/torch/testing/_internal/common_utils.py
++++ b/torch/testing/_internal/common_utils.py
+@@ -2722,11 +2722,11 @@ def check_if_enable(test: unittest.TestCase):
+         if not TEST_WITH_SLOW:
+             raise unittest.SkipTest("test is slow; run with PYTORCH_TEST_WITH_SLOW to enable test")
+ 
+-    if not IS_SANDCASTLE:
++    if True:
+         should_skip = False
+         skip_msg = ""
+ 
+-        for disabled_test, (issue_url, platforms) in disabled_tests_dict.items():
++        for disabled_test, (pr_num, issue_url, platforms) in disabled_tests_dict.items():
+             if matches_test(disabled_test):
+                 platform_to_conditional: dict = {
+                     "mac": IS_MACOS,
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-TestExportOpInfoCPU-with-single-GPU.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-TestExportOpInfoCPU-with-single-GPU.patch
new file mode 100644
index 000000000000..ebdfb00e0a34
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-TestExportOpInfoCPU-with-single-GPU.patch
@@ -0,0 +1,20 @@
+Fixes a failure on systems with a single GPU.
+Error in `init_gpu_context` (fake_tensor.py:744):
+> E           torch.AcceleratorError: CUDA error: invalid device ordinal
+
+See: https://github.com/pytorch/pytorch/pull/164184
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/test/export/test_export_opinfo.py b/test/export/test_export_opinfo.py
+--- a/test/export/test_export_opinfo.py
++++ b/test/export/test_export_opinfo.py
+@@ -79,7 +79,7 @@ def _test_export_helper(self, dtype, op):
+     mode = FakeTensorMode(allow_non_fake_inputs=True)
+     converter = mode.fake_tensor_converter
+     # intentionally avoid cuda:0 to flush out some bugs
+-    target_device = "cuda:1"
++    target_device = "cuda:0"
+ 
+     def to_fake_device(x):
+         x = converter.from_real_tensor(mode, x)
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-test_recursion_in_except_handler.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-test_recursion_in_except_handler.patch
new file mode 100644
index 000000000000..3e807729cc56
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-test_recursion_in_except_handler.patch
@@ -0,0 +1,34 @@
+Fix a RecursionError inside pytest when running this test.
+See https://github.com/pytorch/pytorch/pull/174693
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/test/dynamo/cpython/3_13/test_exceptions.py b/test/dynamo/cpython/3_13/test_exceptions.py
+index 0ded70db3c7..bc8120a2d19 100644
+--- a/test/dynamo/cpython/3_13/test_exceptions.py
++++ b/test/dynamo/cpython/3_13/test_exceptions.py
+@@ -1573,18 +1573,18 @@ class ExceptionTests(__TestCase):
+                 recurse_in_body_and_except()
+ 
+         recursionlimit = sys.getrecursionlimit()
+-        try:
+-            set_relative_recursion_limit(10)
+-            for func in (recurse_in_except, recurse_after_except, recurse_in_body_and_except):
+-                with self.subTest(func=func):
++        for func in (recurse_in_except, recurse_after_except, recurse_in_body_and_except):
++            with self.subTest(func=func):
++                try:
++                    set_relative_recursion_limit(10)
+                     try:
+                         func()
+                     except RecursionError:
+                         pass
+                     else:
+                         self.fail("Should have raised a RecursionError")
+-        finally:
+-            sys.setrecursionlimit(recursionlimit)
++                finally:
++                    sys.setrecursionlimit(recursionlimit)
+ 
+ 
+     @cpython_only
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_normalize_tree_output.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_normalize_tree_output.patch
new file mode 100644
index 000000000000..4c708a216cbd
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_normalize_tree_output.patch
@@ -0,0 +1,24 @@
+Avoid failure in TestProfilerTree.test_profiler_experimental_tree_with_stack_and_modules
+with diff:
+> - <built-in method _get_tracing_state of PyCapsule object at 0xXXXXXXXXXXXX>
+> + <built-in method _get_tracing_state of pybind11_detail_function_record_v1_system_libstdcpp_gxx_abi_1xxx_use_cxx11_abi_1 object at 0xXXXXXXXXXXXX>
+
+See https://github.com/pytorch/pytorch/pull/174768
+
+Author: Alexander Grund (TU Dresden)
+diff --git a/test/profiler/test_profiler_tree.py b/test/profiler/test_profiler_tree.py
+index 670e639c98e..e53fd93b273 100644
+--- a/test/profiler/test_profiler_tree.py
++++ b/test/profiler/test_profiler_tree.py
+@@ -240,6 +240,11 @@ class TestProfilerTree(TestCase):
+         # simply coerce them into a platform independent form. If you made a
+         # change in the codebase which changes the trace produced, simply use
+         # EXPECTTEST_ACCEPT=1 to update the tests to reflect the new structure.
++        def normalize(tree):
++            return re.sub(r'of pybind11\w+ object at', 'of PyCapsule object at', tree)
++
++        actual = normalize(actual)
++        expected = normalize(expected)
+ 
+         # expecttest will not show the diff view if `len(actual) < len(expected)`
+         if not expecttest.ACCEPT:
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-flex-attention-test_block_mask_non_divisible.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-flex-attention-test_block_mask_non_divisible.patch
new file mode 100644
index 000000000000..5e26591c68cc
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-flex-attention-test_block_mask_non_divisible.patch
@@ -0,0 +1,17 @@
+This test shows segfaults, at least on some system.
+PyTorch CI HUD indicates some failures with it are known.
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/test/inductor/test_flex_attention.py b/test/inductor/test_flex_attention.py
+index 740faa0b375..ea5e311b7cd 100644
+--- a/test/inductor/test_flex_attention.py
++++ b/test/inductor/test_flex_attention.py
+@@ -3474,6 +3474,7 @@ def forward(self, arg0_1, arg1_1, arg2_1, arg3_1, arg4_1):
+         )
+         FileCheck().check("BLOCK_M : tl.constexpr = 16").run(code[0])
+ 
++    @unittest.skip("Segfaults on CPU")
+     @supported_platform
+     def test_block_mask_non_divisible(self, device):
+         seq = torch.arange(1023, device=device) // 128
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-flex-attention-tests-on-unsupported-cpus.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-flex-attention-tests-on-unsupported-cpus.patch
new file mode 100644
index 000000000000..a6ec831fb1c3
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-flex-attention-tests-on-unsupported-cpus.patch
@@ -0,0 +1,97 @@
+FlexAttention is only supported on AVX2 CPUs.
+However the tests are run on CPU unconditionally when CUDA devices are available leading to:
+> torch._inductor.exc.InductorError: LoweringException: NotImplementedError: torch.compile on current platform is not supported for CPU.
+
+Add a condition to possibly only add CUDA tests.
+See https://github.com/pytorch/pytorch/pull/174881
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/test/inductor/test_flex_attention.py b/test/inductor/test_flex_attention.py
+index 740faa0b375..e698939d326 100644
+--- a/test/inductor/test_flex_attention.py
++++ b/test/inductor/test_flex_attention.py
+@@ -48,6 +48,9 @@ from torch.testing._internal.common_device_type import (
+     dtypesIfXPU,
+     flex_attention_supported_platform as supported_platform,
+     instantiate_device_type_tests,
++    IS_FLEX_ATTENTION_CPU_PLATFORM_SUPPORTED as TEST_ON_CPU,
++    IS_FLEX_ATTENTION_CUDA_PLATFORM_SUPPORTED as TEST_ON_CUDA,
++    IS_FLEX_ATTENTION_XPU_PLATFORM_SUPPORTED as TEST_ON_XPU,
+     largeTensorTest,
+     skipCPUIf,
+     skipCUDAIf,
+@@ -177,25 +180,21 @@ class DeviceConfig:
+     dtypes_fast: list[torch.dtype]
+ 
+ 
+-TEST_ON_CUDA = (
+-    torch.cuda.is_available()
+-    and torch.utils._triton.has_triton()
+-    and torch.cuda.get_device_capability() >= (8, 0)
+-)
+-TEST_ON_XPU = torch.xpu.is_available() and torch.utils._triton.has_triton()
+-
+ device_configs = {}
++# Tests are skipped when no device is supported, so CPU as default is safe
++test_device = ("cpu",)
+ if HAS_GPU:
+     if TEST_ON_CUDA:
+-        test_device = (
+-            "cuda",
+-            "cpu",
+-        )
++        if TEST_ON_CPU:
++            test_device = (
++                "cuda",
++                "cpu",
++            )
++        else:
++            test_device = ("cuda",)
+     elif TEST_ON_XPU:
+         torch._C._set_onednn_allow_tf32(True)
+         test_device = ("xpu",)
+-else:
+-    test_device = ("cpu",)
+ 
+ 
+ class SubstringSet:
+diff --git a/torch/testing/_internal/common_device_type.py b/torch/testing/_internal/common_device_type.py
+index 8971eca1bb2..6b14f9db105 100644
+--- a/torch/testing/_internal/common_device_type.py
++++ b/torch/testing/_internal/common_device_type.py
+@@ -1972,23 +1972,25 @@ def get_all_device_types() -> list[str]:
+ 
+ # skip since currently flex attention requires at least `avx2` support on CPU.
+ IS_FLEX_ATTENTION_CPU_PLATFORM_SUPPORTED = (
+-    not torch.xpu.is_available()
+-    and not torch.cuda.is_available()
+-    and not IS_MACOS
++    not IS_MACOS
+     and torch.cpu._is_avx2_supported()
+     and os.getenv("ATEN_CPU_CAPABILITY") != "default"
+ )
+ IS_FLEX_ATTENTION_XPU_PLATFORM_SUPPORTED = (
+     torch.xpu.is_available() and torch.utils._triton.has_triton()
+ )
++IS_FLEX_ATTENTION_CUDA_PLATFORM_SUPPORTED = (
++    torch.cuda.is_available()
++    and torch.utils._triton.has_triton()
++    and torch.cuda.get_device_capability() >= (8, 0)
++)
+ flex_attention_supported_platform = unittest.skipUnless(
+     IS_FLEX_ATTENTION_XPU_PLATFORM_SUPPORTED
+-    or IS_FLEX_ATTENTION_CPU_PLATFORM_SUPPORTED
+-    or (
+-        torch.cuda.is_available()
+-        and torch.utils._triton.has_triton()
+-        and torch.cuda.get_device_capability() >= (8, 0)
+-    ),
++    or (IS_FLEX_ATTENTION_CPU_PLATFORM_SUPPORTED
++        and not torch.xpu.is_available()
++        and not torch.cuda.is_available()
++        )
++    or IS_FLEX_ATTENTION_CUDA_PLATFORM_SUPPORTED,
+     "Requires CUDA and Triton, Intel GPU and triton, or CPU with avx2 and later",
+ )
+ if torch.version.hip and "gfx94" in torch.cuda.get_device_properties(0).gcnArchName:
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-test_checkpoint_save_failure_continues_serving.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-test_checkpoint_save_failure_continues_serving.patch
new file mode 100644
index 000000000000..31e2baaf9160
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-test_checkpoint_save_failure_continues_serving.patch
@@ -0,0 +1,28 @@
+The test fails with
+> AssertionError: 'fail_once policy triggered failure' not found in 'cannot pickle code objects'
+
+This is caused by a change in Python 3.13 although it only worked by accident in earlier versions.
+See https://github.com/pytorch/pytorch/issues/174669
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/test/distributed/checkpoint/test_async_process_executor.py b/test/distributed/checkpoint/test_async_process_executor.py
+index 9dc7095b0d6..36e639803b2 100644
+--- a/test/distributed/checkpoint/test_async_process_executor.py
++++ b/test/distributed/checkpoint/test_async_process_executor.py
+@@ -1,6 +1,7 @@
+ # Owner(s): ["oncall: distributed checkpointing"]
+ 
+ import sys
++import unittest
+ from unittest.mock import patch
+ 
+ import torch
+@@ -100,6 +101,7 @@ class TestStorageWriter(StorageWriter):
+ class TestAsyncProcessExecutor(DTensorTestBase):
+     """Test suite for async checkpoint process executor error handling using public APIs."""
+ 
++    @unittest.skipIf(sys.version_info >= (3, 13), "Can't pickle tracebacks")
+     @with_comms
+     def test_checkpoint_save_failure_continues_serving(self) -> None:
+         """Test that checkpoint save failure doesn't exit process, continues serving."""
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-test_dtensor_op_db_nn_functional_multi_head_attention_forward_cpu_float32.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-test_dtensor_op_db_nn_functional_multi_head_attention_forward_cpu_float32.patch
new file mode 100644
index 000000000000..3c5dd5523dc5
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-test_dtensor_op_db_nn_functional_multi_head_attention_forward_cpu_float32.patch
@@ -0,0 +1,12 @@
+diff --git a/test/distributed/tensor/test_dtensor_ops.py b/test/distributed/tensor/test_dtensor_ops.py
+index 8c650f6b0ce..04cfa7d4cc2 100644
+--- a/test/distributed/tensor/test_dtensor_ops.py
++++ b/test/distributed/tensor/test_dtensor_ops.py
+@@ -463,6 +463,7 @@ dtensor_fails = {
+     skip("nn.functional.feature_alpha_dropout", "without_train"),
+     skip("nn.functional.hinge_embedding_loss"),
+     skip("nn.functional.cosine_embedding_loss"),
++    skip("nn.functional.multi_head_attention_forward"),  # randomness
+     skip("fft.hfft"),
+     skip("fft.hfft2"),
+     skip("fft.hfft2"),

From 03217d9ce1fa49ba62be3ce7e11a2e1de95e9662 Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Fri, 13 Feb 2026 11:07:09 +0100
Subject: [PATCH 22/30] Add PyTorch-2.6.0_fix-server-in-test_control_plane

---
 .../p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb          | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
index 06569e192f03..7379ad747b9e 100644
--- a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
@@ -33,6 +33,7 @@ patches = [
     'PyTorch-1.7.0_disable-dev-shm-test.patch',
     'PyTorch-2.0.1_skip-tests-skipped-in-subprocess.patch',
     'PyTorch-2.1.0_remove-test-requiring-online-access.patch',
+    'PyTorch-2.6.0_fix-server-in-test_control_plane.patch',
     'PyTorch-2.6.0_show-test-duration.patch',
     'PyTorch-2.6.0_skip-test_segfault.patch',
     'PyTorch-2.7.0_avoid_caffe2_test_cpp_jit.patch',
@@ -94,6 +95,8 @@ checksums = [
      '166c134573a95230e39b9ea09ece3ad8072f39d370c9a88fb2a1e24f6aaac2b5'},
     {'PyTorch-2.1.0_remove-test-requiring-online-access.patch':
      '35184b8c5a1b10f79e511cc25db3b8a5585a5d58b5d1aa25dd3d250200b14fd7'},
+    {'PyTorch-2.6.0_fix-server-in-test_control_plane.patch':
+     '1337689ff28ecaa8d1d0edf60d322bcdd7846fec040925325d357b19eb6e4342'},
     {'PyTorch-2.6.0_show-test-duration.patch': '5508f2f9619204d9f3c356dbd4000a00d58f452ab2d64ae920eb8bc8b5484d75'},
     {'PyTorch-2.6.0_skip-test_segfault.patch': '26806bd62e6b61b56ebaa52d68ca44c415a28124f684bd2fb373557ada68ef52'},
     {'PyTorch-2.7.0_avoid_caffe2_test_cpp_jit.patch':

From db7aefc04c480f8887cd61d5dfc91d06a3ad34f3 Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Tue, 3 Mar 2026 16:29:53 +0100
Subject: [PATCH 23/30] Fix race condition in checking for disabled tests

---
 .../PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb   |  2 +-
 .../PyTorch-2.9.1_disable-slow-tests.patch    | 24 +++++++++++++++----
 2 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
index 7379ad747b9e..ee2f5307c06d 100644
--- a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
@@ -152,7 +152,7 @@ checksums = [
      '86ce380e69b3b20e010d817889cb1b825b05b4054a045b00f2ac12161b77d7e4'},
     {'PyTorch-2.9.1_check-device-avail-test_schedule.patch':
      '64c28d38ce69147565509add36d310473ce46f14a0a876d38b5049cb7fce9817'},
-    {'PyTorch-2.9.1_disable-slow-tests.patch': '76e6d8f7366b91a0ddc65f73685f2b09988bb5537d10d294f9bb6a48c7fec3d0'},
+    {'PyTorch-2.9.1_disable-slow-tests.patch': '6b365a3531b0ac5446b5f0e8ab924b5e5742cd0331e6d9ec979118a3ef0ffc09'},
     {'PyTorch-2.9.1_fix-hypothesis-deadline.patch':
      'f7a130669eee9924a303df9e2bd5743ff023a7d994b7a3e43c86dcccf0206c49'},
     {'PyTorch-2.9.1_fix-iteration-in-fligh-reporter.patch':
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_disable-slow-tests.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_disable-slow-tests.patch
index 8f6d6e0c7677..9db987094fff 100644
--- a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_disable-slow-tests.patch
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_disable-slow-tests.patch
@@ -5,10 +5,14 @@ Enable the default for non-CI environments to cut down testing time.
 Don't check for SANDCASTLE when determining whether to skip disabled tests.
 However, the disabled-tests JSON file needs to be downloaded from S3 and placed at "tests/.pytorch-disabled-tests.json".
 
+This file may be modified and/or redownloaded in import_test_stats.py
+Disable this by just returning it's content as-if it is always up to date.
+If it doesn't exist the failure will be handled by the calling function.
+This modification removes the PR number field, so make it optional in the tuple expansion to allow either format.
+
 Author: Alexander Grund (TU Dresden)
 
 diff --git a/test/run_test.py b/test/run_test.py
-index 44a15d4ab2c..269d4206f3e 100755
 --- a/test/run_test.py
 +++ b/test/run_test.py
 @@ -531,7 +531,7 @@ def run_test(
@@ -20,8 +24,19 @@ index 44a15d4ab2c..269d4206f3e 100755
          ci_args = ["--import-slow-tests", "--import-disabled-tests"]
          if RERUN_DISABLED_TESTS:
              ci_args.append("--rerun-disabled-tests")
+diff --git a/tools/stats/import_test_stats.py b/tools/stats/import_test_stats.py
+--- a/tools/stats/import_test_stats.py
++++ b/tools/stats/import_test_stats.py
+@@ -47,6 +47,8 @@ def fetch_and_cache(
+     Path(dirpath).mkdir(exist_ok=True)
+ 
+     path = os.path.join(dirpath, name)
++    with open(path) as f:
++        return cast(dict[str, Any], json.load(f))
+     print(f"Downloading {url} to {path}")
+ 
+     def is_cached_file_valid() -> bool:
 diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
-index bfc568bc146..7ef37cccccb 100644
 --- a/torch/testing/_internal/common_utils.py
 +++ b/torch/testing/_internal/common_utils.py
 @@ -2722,11 +2722,11 @@ def check_if_enable(test: unittest.TestCase):
@@ -32,9 +47,10 @@ index bfc568bc146..7ef37cccccb 100644
 +    if True:
          should_skip = False
          skip_msg = ""
- 
+-
 -        for disabled_test, (issue_url, platforms) in disabled_tests_dict.items():
-+        for disabled_test, (pr_num, issue_url, platforms) in disabled_tests_dict.items():
++        # Allow for a potentially existing PR number
++        for disabled_test, (*pr_num, issue_url, platforms) in disabled_tests_dict.items():
              if matches_test(disabled_test):
                  platform_to_conditional: dict = {
                      "mac": IS_MACOS,

From 9fe965ec5cc5a2e6c584caecbbe4f04a7841070b Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Wed, 4 Mar 2026 15:27:44 +0100
Subject: [PATCH 24/30] Remove pytest-shard

---
 .../p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb            | 1 -
 1 file changed, 1 deletion(-)

diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
index ee2f5307c06d..039f2543278e 100644
--- a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
@@ -192,7 +192,6 @@ builddependencies = [
     ('parameterized', '0.9.0'),
     ('pytest-flakefinder', '1.1.0'),
     ('pytest-rerunfailures', '16.1'),
-    ('pytest-shard', '0.1.2'),
     ('pytest-subtests', '0.15.0'),
     ('tlparse', '0.4.3'),
     ('optree', '0.18.0'),

From c931bb736f6d3b8553b7dea408aba1280c28761c Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Thu, 5 Mar 2026 16:26:22 +0100
Subject: [PATCH 25/30] Add more patches

---
 .../PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb   |   8 ++
 .../PyTorch-2.9.1_dont-print-test-items.patch |  24 ++++
 ....9.1_fix-DDPCommHookType-python-3.13.patch | 120 ++++++++++++++++++
 ...yTorch-2.9.1_skip-cutlass-addmm-test.patch |  16 +++
 4 files changed, 168 insertions(+)
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_dont-print-test-items.patch
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-DDPCommHookType-python-3.13.patch
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-cutlass-addmm-test.patch

diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
index 039f2543278e..b5f59f393d67 100644
--- a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
@@ -67,6 +67,8 @@ patches = [
     'PyTorch-2.9.1_avoid-multiprocess-tests-hanging-forever.patch',
     'PyTorch-2.9.1_check-device-avail-test_schedule.patch',
     'PyTorch-2.9.1_disable-slow-tests.patch',
+    'PyTorch-2.9.1_dont-print-test-items.patch',
+    'PyTorch-2.9.1_fix-DDPCommHookType-python-3.13.patch',
     'PyTorch-2.9.1_fix-hypothesis-deadline.patch',
     'PyTorch-2.9.1_fix-iteration-in-fligh-reporter.patch',
     'PyTorch-2.9.1_fix-TestExportOpInfoCPU-with-single-GPU.patch',
@@ -77,6 +79,7 @@ patches = [
     'PyTorch-2.9.1_normalize_tree_output.patch',
     'PyTorch-2.9.1_set-test-timeout.patch',
     'PyTorch-2.9.1_skip-bool-bessel-tests.patch',
+    'PyTorch-2.9.1_skip-cutlass-addmm-test.patch',
     'PyTorch-2.9.1_skip-flex-attention-tests-on-unsupported-cpus.patch',
     'PyTorch-2.9.1_skip-flex-attention-test_block_mask_non_divisible.patch',
     'PyTorch-2.9.1_skip-test_checkpoint_save_failure_continues_serving.patch',
@@ -153,6 +156,9 @@ checksums = [
     {'PyTorch-2.9.1_check-device-avail-test_schedule.patch':
      '64c28d38ce69147565509add36d310473ce46f14a0a876d38b5049cb7fce9817'},
     {'PyTorch-2.9.1_disable-slow-tests.patch': '6b365a3531b0ac5446b5f0e8ab924b5e5742cd0331e6d9ec979118a3ef0ffc09'},
+    {'PyTorch-2.9.1_dont-print-test-items.patch': '2b524cf3d557c0672feefc3a7165e5555e549b0720647a84d546f769cea0be07'},
+    {'PyTorch-2.9.1_fix-DDPCommHookType-python-3.13.patch':
+     'd7bafe8340bba9dd909475fc62b739b0ce3f95d3409479ef8c5929351dd2a05d'},
     {'PyTorch-2.9.1_fix-hypothesis-deadline.patch':
      'f7a130669eee9924a303df9e2bd5743ff023a7d994b7a3e43c86dcccf0206c49'},
     {'PyTorch-2.9.1_fix-iteration-in-fligh-reporter.patch':
@@ -169,6 +175,8 @@ checksums = [
     {'PyTorch-2.9.1_normalize_tree_output.patch': '7d5994580339b73c28de595d9e5a0448db97b7d284f17efd18909e4613d170df'},
     {'PyTorch-2.9.1_set-test-timeout.patch': '15fa1149c250b1333b0bc491f659aaf89d5d6eaf6df5ebc81eea545478c1239c'},
     {'PyTorch-2.9.1_skip-bool-bessel-tests.patch': '9c07cddaf4c1b17fe9a54874f10b8edbfb85785c40ac1e3aea11c7f4b5abca69'},
+    {'PyTorch-2.9.1_skip-cutlass-addmm-test.patch':
+     '1f81a8a9eea8eda51fc93dff84cd994772febf4fd05d77efbf21b8440dadfd4e'},
     {'PyTorch-2.9.1_skip-flex-attention-tests-on-unsupported-cpus.patch':
      'e544f765beac7bdb3fc0ada98a3f92fd7e511ed8874de085aa2f213cca769d40'},
     {'PyTorch-2.9.1_skip-flex-attention-test_block_mask_non_divisible.patch':
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_dont-print-test-items.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_dont-print-test-items.patch
new file mode 100644
index 000000000000..b029f0a8a5a8
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_dont-print-test-items.patch
@@ -0,0 +1,24 @@
+Reduce verbosity of the test output by not showing all ~270k test names.
+
+Author: Alexander Grund (TU Dresden)
+diff --git a/test/run_test.py b/test/run_test.py
+--- a/test/run_test.py
++++ b/test/run_test.py
+@@ -623,7 +623,7 @@ def run_test(
+                 test_file,
+             )
+         else:
+-            command.extend([f"--sc={stepcurrent_key}", "--print-items"])
++            command.extend([f"--sc={stepcurrent_key}"])
+             ret_code, was_rerun = retry_shell(
+                 command,
+                 test_directory,
+@@ -725,7 +725,7 @@ def run_test_retries(
+ 
+     num_failures = defaultdict(int)
+ 
+-    print_items = ["--print-items"]
++    print_items = []
+     sc_command = f"--sc={stepcurrent_key}"
+     while True:
+         ret_code, _ = retry_shell(
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-DDPCommHookType-python-3.13.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-DDPCommHookType-python-3.13.patch
new file mode 100644
index 000000000000..85bc2949aa95
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-DDPCommHookType-python-3.13.patch
@@ -0,0 +1,120 @@
+Test on Python 3.13 fails with
+> AttributeError: 'functools.partial' object has no attribute 'value'
+
+Fix using https://github.com/pytorch/pytorch/pull/163939
+
+diff --git a/torch/distributed/algorithms/ddp_comm_hooks/__init__.py b/torch/distributed/algorithms/ddp_comm_hooks/__init__.py
+--- a/torch/distributed/algorithms/ddp_comm_hooks/__init__.py
++++ b/torch/distributed/algorithms/ddp_comm_hooks/__init__.py
+@@ -1,7 +1,21 @@
+ # mypy: allow-untyped-defs
++import sys
+ from enum import Enum
+ from functools import partial
+ 
++
++# To suppress FutureWarning from partial since 3.13
++if sys.version_info >= (3, 13):
++    from enum import member
++
++    def _enum_member(x):
++        return member(x)
++else:
++
++    def _enum_member(x):
++        return x
++
++
+ import torch.distributed as dist
+ 
+ from . import (
+@@ -51,45 +65,61 @@ class DDPCommHookType(Enum):
+     ``DDPCommHookType.ALLREDUCE.value(model=model, state=process_group)``.
+     """
+ 
+-    ALLREDUCE = partial(_ddp_comm_hook_wrapper, comm_hook=default.allreduce_hook)
+-    FP16_COMPRESS = partial(
+-        _ddp_comm_hook_wrapper, comm_hook=default.fp16_compress_hook
++    ALLREDUCE = _enum_member(
++        partial(_ddp_comm_hook_wrapper, comm_hook=default.allreduce_hook)
++    )
++    FP16_COMPRESS = _enum_member(
++        partial(_ddp_comm_hook_wrapper, comm_hook=default.fp16_compress_hook)
+     )
+-    BF16_COMPRESS = partial(
+-        _ddp_comm_hook_wrapper, comm_hook=default.bf16_compress_hook
++    BF16_COMPRESS = _enum_member(
++        partial(_ddp_comm_hook_wrapper, comm_hook=default.bf16_compress_hook)
+     )
+-    QUANTIZE_PER_TENSOR = partial(
+-        _ddp_comm_hook_wrapper, comm_hook=quantization.quantization_pertensor_hook
++    QUANTIZE_PER_TENSOR = _enum_member(
++        partial(
++            _ddp_comm_hook_wrapper, comm_hook=quantization.quantization_pertensor_hook
++        )
+     )
+-    QUANTIZE_PER_CHANNEL = partial(
+-        _ddp_comm_hook_wrapper, comm_hook=quantization.quantization_perchannel_hook
++    QUANTIZE_PER_CHANNEL = _enum_member(
++        partial(
++            _ddp_comm_hook_wrapper, comm_hook=quantization.quantization_perchannel_hook
++        )
+     )
+-    POWER_SGD = partial(
+-        _powerSGD_comm_hook_wrapper,
+-        comm_hook=powerSGD.powerSGD_hook,
+-        matrix_approximation_rank=1,
++    POWER_SGD = _enum_member(
++        partial(
++            _powerSGD_comm_hook_wrapper,
++            comm_hook=powerSGD.powerSGD_hook,
++            matrix_approximation_rank=1,
++        )
+     )
+     # Rank-2 PowerSGD can give a higher accuracy than the default rank-1 version,
+     # but it runs slower and consumes more memory.
+-    POWER_SGD_RANK2 = partial(
+-        _powerSGD_comm_hook_wrapper,
+-        comm_hook=powerSGD.powerSGD_hook,
+-        matrix_approximation_rank=2,
++    POWER_SGD_RANK2 = _enum_member(
++        partial(
++            _powerSGD_comm_hook_wrapper,
++            comm_hook=powerSGD.powerSGD_hook,
++            matrix_approximation_rank=2,
++        )
+     )
+     # Batching can lead to a faster training at the cost of accuracy.
+-    BATCHED_POWER_SGD = partial(
+-        _powerSGD_comm_hook_wrapper,
+-        comm_hook=powerSGD.batched_powerSGD_hook,
+-        matrix_approximation_rank=1,
++    BATCHED_POWER_SGD = _enum_member(
++        partial(
++            _powerSGD_comm_hook_wrapper,
++            comm_hook=powerSGD.batched_powerSGD_hook,
++            matrix_approximation_rank=1,
++        )
+     )
+-    BATCHED_POWER_SGD_RANK2 = partial(
+-        _powerSGD_comm_hook_wrapper,
+-        comm_hook=powerSGD.batched_powerSGD_hook,
+-        matrix_approximation_rank=2,
++    BATCHED_POWER_SGD_RANK2 = _enum_member(
++        partial(
++            _powerSGD_comm_hook_wrapper,
++            comm_hook=powerSGD.batched_powerSGD_hook,
++            matrix_approximation_rank=2,
++        )
+     )
+-    NOOP = partial(
+-        _ddp_comm_hook_wrapper,
+-        comm_hook=debugging.noop_hook,
++    NOOP = _enum_member(
++        partial(
++            _ddp_comm_hook_wrapper,
++            comm_hook=debugging.noop_hook,
++        )
+     )
+ 
+ 
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-cutlass-addmm-test.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-cutlass-addmm-test.patch
new file mode 100644
index 000000000000..aa7e88a859dc
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-cutlass-addmm-test.patch
@@ -0,0 +1,16 @@
+The test fails with accuracy issues in at least H100, possibly on CUDA 12.8 in general.
+See https://github.com/pytorch/pytorch/pull/156626
+
+Author: Alexander Grund (TU Dresden)
+diff --git a/test/inductor/test_cutlass_backend.py b/test/inductor/test_cutlass_backend.py
+--- a/test/inductor/test_cutlass_backend.py
++++ b/test/inductor/test_cutlass_backend.py
+@@ -613,7 +613,7 @@ class TestCutlassBackend(TestCase):
+ 
+             torch.testing.assert_close(actual, expected, rtol=1e-2, atol=0.05)
+ 
+-    @unittest.skipIf(not SM90OrLater, "need sm_90")
++    @unittest.skip("Fails on CUDA 12.8+")
+     @parametrize("dynamic", (False, True))
+     @parametrize("use_aoti", (False, True))
+     @parametrize("dtype", (torch.float16, torch.bfloat16))

From c30732ff0dac84c8011b256ff6904eb99f53c589 Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Wed, 11 Mar 2026 12:39:51 +0100
Subject: [PATCH 26/30] Fix using wrong OpenMP library

---
 .../PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb   |  3 ++
 ...Torch-2.9.1_avoid-using-wrong-libomp.patch | 34 +++++++++++++++++++
 2 files changed, 37 insertions(+)
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_avoid-using-wrong-libomp.patch

diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
index b5f59f393d67..b715adb4d5b1 100644
--- a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
@@ -65,6 +65,7 @@ patches = [
     'PyTorch-2.9.0_skip-tests-requiring-CUDA-12.8.patch',
     'PyTorch-2.9.0_skip-unexpected-success-in-test_fake_export.patch',
     'PyTorch-2.9.1_avoid-multiprocess-tests-hanging-forever.patch',
+    'PyTorch-2.9.1_avoid-using-wrong-libomp.patch',
     'PyTorch-2.9.1_check-device-avail-test_schedule.patch',
     'PyTorch-2.9.1_disable-slow-tests.patch',
     'PyTorch-2.9.1_dont-print-test-items.patch',
@@ -153,6 +154,8 @@ checksums = [
      '2e73f71ea0f09e613cc4a108893e7948b6daf239e3fe42fd2d3ae5d43c3cf9de'},
     {'PyTorch-2.9.1_avoid-multiprocess-tests-hanging-forever.patch':
      '86ce380e69b3b20e010d817889cb1b825b05b4054a045b00f2ac12161b77d7e4'},
+    {'PyTorch-2.9.1_avoid-using-wrong-libomp.patch':
+     '2fc2bb82cce87ba0ce73718b0502735ecdf360ca6bfac4482396f7f1c51c1866'},
     {'PyTorch-2.9.1_check-device-avail-test_schedule.patch':
      '64c28d38ce69147565509add36d310473ce46f14a0a876d38b5049cb7fce9817'},
     {'PyTorch-2.9.1_disable-slow-tests.patch': '6b365a3531b0ac5446b5f0e8ab924b5e5742cd0331e6d9ec979118a3ef0ffc09'},
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_avoid-using-wrong-libomp.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_avoid-using-wrong-libomp.patch
new file mode 100644
index 000000000000..a9b58a7d8e5c
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_avoid-using-wrong-libomp.patch
@@ -0,0 +1,34 @@
+When using GCC `libgomp.so` should be used which will be automatically done with `-fopenmp`.
+However the custom FindOpenMP searches for `libomp.so` first which ends up being used if found
+e.g. on the system in /lib64
+
+See https://github.com/pytorch/pytorch/pull/177126
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/cmake/Modules/FindOpenMP.cmake b/cmake/Modules/FindOpenMP.cmake
+--- a/cmake/Modules/FindOpenMP.cmake
++++ b/cmake/Modules/FindOpenMP.cmake
+@@ -289,21 +289,13 @@ function(_OPENMP_GET_FLAGS LANG FLAG_MODE OPENMP_FLAG_VAR OPENMP_LIB_NAMES_VAR)
+       mark_as_advanced(OpenMP_libomp_LIBRARY)
+     endif()
+ 
+-    if (NOT OpenMP_libomp_LIBRARY)
+-      find_library(OpenMP_libomp_LIBRARY
+-        NAMES omp gomp iomp5
+-        HINTS ${CMAKE_${LANG}_IMPLICIT_LINK_DIRECTORIES}
+-        DOC "libomp location for OpenMP"
+-      )
+-      mark_as_advanced(OpenMP_libomp_LIBRARY)
+-    endif()
+-
+     # Use OpenMP_PREFIX if defined
+     if (NOT OpenMP_libomp_LIBRARY AND NOT "${OpenMP_PREFIX}" STREQUAL "")
+       find_library(OpenMP_libomp_LIBRARY
+         NAMES omp gomp iomp5
+         HINTS "${OpenMP_PREFIX}/lib"
+         DOC "libomp location for OpenMP"
++        NO_DEFAULT_PATH
+       )
+       mark_as_advanced(OpenMP_libomp_LIBRARY)
+     endif()

From 3b6de18ae1969e6d23ca0b4e7a167811afc6da9f Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Wed, 11 Mar 2026 16:29:45 +0100
Subject: [PATCH 27/30] Skip segfaulting flex_attention suite

---
 .../p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb          | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
index b715adb4d5b1..e453b7ec39fd 100644
--- a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
@@ -270,6 +270,9 @@ excluded_tests = {
         'dynamo/test_utils',
         # Packaging test only, not important for us
         'test_license',
+        # Occasional segfaults on CPU
+        'inductor/test_flex_attention',
+        'inductor/test_flex_decoding ',
     ]
 }
 

From c2dde094b8b4a6ace7f7d6033f958672b71a5194 Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Fri, 13 Mar 2026 17:15:21 +0100
Subject: [PATCH 28/30] Skip some tests failing on ARM

---
 .../PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb   | 15 ++++
 ...increase-tolerance-TestDecomp-matmul.patch | 32 +++++++++
 ..._skip-cpu_repro-tests-failing-on-ARM.patch | 72 +++++++++++++++++++
 ....1_skip-svd-pca-lowrank-tests-on-cpu.patch | 26 +++++++
 ...skip-test_optree_graph_break_message.patch | 24 +++++++
 ...ch-2.9.1_skip-tests-requiring-MKLDNN.patch | 38 ++++++++++
 6 files changed, 207 insertions(+)
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_increase-tolerance-TestDecomp-matmul.patch
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-cpu_repro-tests-failing-on-ARM.patch
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-svd-pca-lowrank-tests-on-cpu.patch
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-test_optree_graph_break_message.patch
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-tests-requiring-MKLDNN.patch

diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
index e453b7ec39fd..665358d5cd94 100644
--- a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
@@ -87,6 +87,11 @@ patches = [
     'PyTorch-2.9.1_skip-test_norm_matrix_degenerate_shapes.patch',
     'PyTorch-2.9.1_skip-tests-requiring-SM90.patch',
     'PyTorch-2.9.1_skip-RingFlexAttentionTest.patch',
+    'PyTorch-2.9.1_increase-tolerance-TestDecomp-matmul.patch',
+    'PyTorch-2.9.1_skip-cpu_repro-tests-failing-on-ARM.patch',
+    'PyTorch-2.9.1_skip-svd-pca-lowrank-tests-on-cpu.patch',
+    'PyTorch-2.9.1_skip-test_optree_graph_break_message.patch',
+    'PyTorch-2.9.1_skip-tests-requiring-MKLDNN.patch',
 ]
 checksums = [
     {'pytorch-v2.9.1.tar.gz': 'e17504700ebc4c87f9b57059df1c4d790b769458c04db144c7a92aea90f2c92b'},
@@ -192,6 +197,16 @@ checksums = [
      '7db02152db2ae70c0fd4c4602fe381e26a74b8e4f7b16b1a3554b2353d761b10'},
     {'PyTorch-2.9.1_skip-RingFlexAttentionTest.patch':
      '3cf0b11136fb18c45072687eafd3024d91b504d231a4fa40e04bc62d8d6019c7'},
+    {'PyTorch-2.9.1_increase-tolerance-TestDecomp-matmul.patch':
+     'dd82203ce3b6262255aba6b59fb3b547c4c17875d5711f6d3d489aa8f0f59f32'},
+    {'PyTorch-2.9.1_skip-cpu_repro-tests-failing-on-ARM.patch':
+     '99055fde02ca17c1db1cd72f41821387a50901d6cd947161cafa12257b3a1c5a'},
+    {'PyTorch-2.9.1_skip-svd-pca-lowrank-tests-on-cpu.patch':
+     '4fc772293047dc737b99e232b8a8db904aa8e88e3c8b2bcc3602fb723941fb89'},
+    {'PyTorch-2.9.1_skip-test_optree_graph_break_message.patch':
+     '2ef1ad424d5f12a4d0ae06938da623819596cee7c0fb4616008f27583c29494d'},
+    {'PyTorch-2.9.1_skip-tests-requiring-MKLDNN.patch':
+     '03756a8069bad01018f422f41aa24c7c543519fd857db88a0c6de661976c8859'},
 ]
 
 osdependencies = [OS_PKG_IBVERBS_DEV]
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_increase-tolerance-TestDecomp-matmul.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_increase-tolerance-TestDecomp-matmul.patch
new file mode 100644
index 000000000000..9bd54ea4d8da
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_increase-tolerance-TestDecomp-matmul.patch
@@ -0,0 +1,32 @@
+
+TestDecompCPU.test_comprehensive___rmatmul___cpu_float32, TestDecompCPU.test_comprehensive_matmul_cpu_float32 fail with small tolerance issues:
+>           Expected 12.534862518310547 but got 12.534895896911621.
+>           Absolute difference: 3.337860107421875e-05 (up to 1e-05 allowed)
+>           Relative difference: 2.6628613616990456e-06 (up to 1.3e-06 allowed)
+
+Increase the tolerances slightly to make them pass.
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
+--- a/torch/testing/_internal/common_methods_invocations.py
++++ b/torch/testing/_internal/common_methods_invocations.py
+@@ -14286,6 +14286,9 @@ op_db: list[OpInfo] = [
+                DecorateInfo(toleranceOverride({torch.float32: tol(atol=0, rtol=1e-5)}),
+                             'TestCommon', 'test_noncontiguous_samples',
+                             device_type='cpu'),
++               DecorateInfo(toleranceOverride({torch.float32: tol(atol=4e-5, rtol=3e-6)}),
++                   "TestDecomp", "test_comprehensive", device_type="cpu",
++               ),
+                DecorateInfo(
+                    toleranceOverride({
+                        torch.float32: tol(atol=1e-5, rtol=1e-5),
+@@ -17690,6 +17693,8 @@ op_db: list[OpInfo] = [
+                             'TestMathBits', 'test_conj_view'),
+                DecorateInfo(toleranceOverride({torch.float32: tol(atol=1e-05, rtol=1.2e-03)}),
+                             'TestCommon', 'test_noncontiguous_samples'),
++               DecorateInfo(toleranceOverride({torch.float32: tol(atol=4e-05, rtol=3e-06)}),
++                            "TestDecomp", "test_comprehensive", device_type="cpu"),
+                DecorateInfo(toleranceOverride({torch.complex64: tol(atol=1e-05, rtol=1e-05)}),
+                             "TestDecomp", "test_comprehensive", device_type="cuda",
+                             active_if=TEST_WITH_ROCM),
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-cpu_repro-tests-failing-on-ARM.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-cpu_repro-tests-failing-on-ARM.patch
new file mode 100644
index 000000000000..ca205deb257e
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-cpu_repro-tests-failing-on-ARM.patch
@@ -0,0 +1,72 @@
+Those tests fail with precision issues on ARM which seems to be known:
+https://github.com/pytorch/pytorch/pull/171095
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/test/inductor/test_cpu_repro.py b/test/inductor/test_cpu_repro.py
+--- a/test/inductor/test_cpu_repro.py
++++ b/test/inductor/test_cpu_repro.py
+@@ -31,6 +31,7 @@ from torch.fx.experimental.proxy_tensor import make_fx
+ from torch.nn import functional as F
+ from torch.testing._internal.common_utils import (
+     instantiate_parametrized_tests,
++    IS_ARM64,
+     IS_FBCODE,
+     IS_MACOS,
+     parametrize,
+@@ -3245,6 +3246,7 @@ class CPUReproTests(TestCase):
+                 3,
+             )
+ 
++    @unittest.skipIf(IS_ARM64, "Fails on ARM")
+     @config.patch({"fx_graph_cache": False, "fx_graph_remote_cache": False})
+     def test_two_local_buffers_in_outer_loop_fusion(self):
+         def fn(x):
+@@ -3568,6 +3570,7 @@ class CPUReproTests(TestCase):
+                     self.common(m, (x,))
+                     check_metrics_vec_kernel_count(6)
+ 
++    @unittest.skipIf(IS_ARM64, "Fails on ARM")
+     @requires_vectorization
+     @config.patch("cpp.enable_tiling_heuristics", False)
+     def test_transpose_copy(self):
+@@ -3812,6 +3815,7 @@ class CPUReproTests(TestCase):
+         self.common(fn, (x, y))
+         check_metrics_vec_kernel_count(2)
+ 
++    @unittest.skipIf(IS_ARM64, "Fails on ARM")
+     def test_transpose_mxn_16_16_bf16_fp16(self):
+         def fn(a, b):
+             c = a * b
+@@ -3885,6 +3889,7 @@ class CPUReproTests(TestCase):
+         x = torch.rand(4, 5)
+         self.common(f, (x,))
+ 
++    @unittest.skipIf(IS_ARM64, "Fails on ARM")
+     def test_broadcast_scalar_cpp_tile_2d_kernel(self):
+         # Based on detectron2_maskrcnn backbone (conv2d -> max_pool2d)
+         s0 = 12
+@@ -4384,6 +4389,7 @@ class CPUReproTests(TestCase):
+         y = torch.randint(0, 255, (3, 3), dtype=torch.uint8)
+         self.common(fn, (x, y))
+ 
++    @unittest.skipIf(IS_ARM64, "Fails on ARM")
+     def test_float32_to_uint8(self):
+         # https://github.com/pytorch/pytorch/issues/156788
+         @torch.compile
+@@ -4868,6 +4874,7 @@ class CPUReproTests(TestCase):
+         x = torch.randn(1, 4, 2, 2)
+         self.common(fn, (x,))
+ 
++    @unittest.skipIf(IS_ARM64, "Fails on ARM")
+     @parametrize("is_inference", (True, False))
+     def test_disabled_amp(self, is_inference):
+         class M(torch.nn.Module):
+@@ -5367,6 +5374,7 @@ class CPUReproTests(TestCase):
+             code
+         )
+ 
++    @unittest.skipIf(IS_ARM64, "Fails on ARM")
+     @config.patch(freezing=True)
+     def test_add_layernorm(self):
+         class Model(torch.nn.Module):
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-svd-pca-lowrank-tests-on-cpu.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-svd-pca-lowrank-tests-on-cpu.patch
new file mode 100644
index 000000000000..f0934960ac62
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-svd-pca-lowrank-tests-on-cpu.patch
@@ -0,0 +1,26 @@
+On ARM those tests fail with
+> torch._C._LinAlgError: linalg.svd: The algorithm failed to converge because the input matrix contained non-finite values.
+
+Traced to OpenBLAS with a fix in OpenBLAS 0.3.30, see https://github.com/pytorch/pytorch/issues/142131
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/test/test_linalg.py b/test/test_linalg.py
+--- a/test/test_linalg.py
++++ b/test/test_linalg.py
+@@ -2674,6 +2674,7 @@ class TestLinalg(TestCase):
+         self.assertRaisesRegex(RuntimeError, "must be different", torch.norm, x, "nuc", (0, 0))
+         self.assertRaisesRegex(IndexError, "Dimension out of range", torch.norm, x, "nuc", (0, 2))
+ 
++    @onlyCUDA
+     @skipCUDAIfNoCusolver
+     @skipCPUIfNoLapack
+     @dtypes(torch.double, torch.cdouble)
+@@ -9383,6 +9384,7 @@ scipy_lobpcg  | {eq_err_scipy:10.2e}  | {eq_err_general_scipy:10.2e}  | {iters2:
+ 
+         run_test((1, 1), (1, 1, 1025))
+ 
++    @onlyCUDA
+     @skipCUDAIfNoCusolver
+     @skipCPUIfNoLapack
+     def test_pca_lowrank(self, device):
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-test_optree_graph_break_message.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-test_optree_graph_break_message.patch
new file mode 100644
index 000000000000..5eec8929e5db
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-test_optree_graph_break_message.patch
@@ -0,0 +1,24 @@
+Test fails with output mismatch:
+> -   Explanation: Dynamo cannot trace optree C/C++ function optree._C.PyCapsule.flatten.
+> +   Explanation: Dynamo cannot trace optree C/C++ function optree._C.pybind11_detail_function_record_v1_system_libstdcpp_gxx_abi_1xxx_use_cxx11_abi_1.flatten.
+>     Hint: Consider using torch.utils._pytree - https://github.com/pytorch/pytorch/blob/main/torch/utils/_pytree.py
+>   
+> -   Developer debug context: module: optree._C, qualname: PyCapsule.flatten, skip reason: <missing reason>
+> +   Developer debug context: module: optree._C, qualname: pybind11_detail_function_record_v1_system_libstdcpp_gxx_abi_1xxx_use_cxx11_abi_1.flatten, skip reason: <missing reason>
+
+Seems to be related to pybind11 version, GCC version, ...
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/test/dynamo/test_error_messages.py b/test/dynamo/test_error_messages.py
+--- a/test/dynamo/test_error_messages.py
++++ b/test/dynamo/test_error_messages.py
+@@ -461,7 +461,7 @@ from user code:
+     warnings.warn("test")""",
+         )
+ 
+-    @unittest.skipIf(not python_pytree._cxx_pytree_exists, "missing optree package")
++    @unittest.skip("Failes depending on Pybind11/GCC versions")
+     def test_optree_graph_break_message(self):
+         import optree
+ 
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-tests-requiring-MKLDNN.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-tests-requiring-MKLDNN.patch
new file mode 100644
index 000000000000..65cc3882ef63
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-tests-requiring-MKLDNN.patch
@@ -0,0 +1,38 @@
+test_int8_woq_mm fail without MKLDDN at
+> self.assertEqual(counters["inductor"]["cpp_templated_kernel_counter"], 1)
+
+See https://github.com/pytorch/pytorch/pull/177387
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/test/inductor/test_cpu_select_algorithm.py b/test/inductor/test_cpu_select_algorithm.py
+--- a/test/inductor/test_cpu_select_algorithm.py
++++ b/test/inductor/test_cpu_select_algorithm.py
+@@ -50,6 +50,11 @@ run_and_get_cpp_code = test_torchinductor.run_and_get_cpp_code
+ 
+ aten = torch.ops.aten
+ 
++skipIfNoMkldnn = unittest.skipIf(
++    not (torch.backends.mkldnn.enabled and torch.backends.mkldnn.is_available()),
++    "no MKLDNN",
++)
++
+ 
+ def patches(fn):
+     def skip_cache(self, choices, name, key, benchmark, hint_override=None):
+@@ -1374,6 +1379,7 @@ class TestSelectAlgorithm(BaseTestSelectAlgorithm):
+     @inductor_config.patch({"freezing": True})
+     @patches
+     @torch.no_grad
++    @skipIfNoMkldnn
+     @dtypes(torch.bfloat16)
+     @parametrize(
+         "batch_size",
+@@ -1437,6 +1443,7 @@ class TestSelectAlgorithm(BaseTestSelectAlgorithm):
+     @inductor_config.patch({"freezing": True, "cpp.enable_concat_linear": True})
+     @patches
+     @torch.no_grad
++    @skipIfNoMkldnn
+     @dtypes(torch.bfloat16)
+     @parametrize(
+         "batch_size",

From 92cb18deebd88fbbd5752559b54942b4de489f9e Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Fri, 20 Mar 2026 11:18:03 +0100
Subject: [PATCH 29/30] Fix Python 3.13 compat

---
 .../PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb   |  3 +++
 ....9.1_fix-pickle-error-on-Python-3.13.patch | 23 +++++++++++++++++++
 2 files changed, 26 insertions(+)
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-pickle-error-on-Python-3.13.patch

diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
index 665358d5cd94..72bb3abaa247 100644
--- a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
@@ -72,6 +72,7 @@ patches = [
     'PyTorch-2.9.1_fix-DDPCommHookType-python-3.13.patch',
     'PyTorch-2.9.1_fix-hypothesis-deadline.patch',
     'PyTorch-2.9.1_fix-iteration-in-fligh-reporter.patch',
+    'PyTorch-2.9.1_fix-pickle-error-on-Python-3.13.patch',
     'PyTorch-2.9.1_fix-TestExportOpInfoCPU-with-single-GPU.patch',
     'PyTorch-2.9.1_fix-test_dist2-decorators.patch',
     'PyTorch-2.9.1_fix-test_recursion_in_except_handler.patch',
@@ -171,6 +172,8 @@ checksums = [
      'f7a130669eee9924a303df9e2bd5743ff023a7d994b7a3e43c86dcccf0206c49'},
     {'PyTorch-2.9.1_fix-iteration-in-fligh-reporter.patch':
      'ab408275ec66e836112a50054acc4e789ef38196efeb6137c6061d60d9ac9ead'},
+    {'PyTorch-2.9.1_fix-pickle-error-on-Python-3.13.patch':
+     '88807b5564485968de3be6411d33c257c5ce59f5d3db23c7aeba884458102d57'},
     {'PyTorch-2.9.1_fix-TestExportOpInfoCPU-with-single-GPU.patch':
      'bdddf5a9ba47d57ec96f4bbefc3b85c4904e44de93dc5c7a65bc03e343035ae9'},
     {'PyTorch-2.9.1_fix-test_dist2-decorators.patch':
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-pickle-error-on-Python-3.13.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-pickle-error-on-Python-3.13.patch
new file mode 100644
index 000000000000..7656cd8c5d5f
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-pickle-error-on-Python-3.13.patch
@@ -0,0 +1,23 @@
+Avoid "cannot pickle code objects" on Python 3.13+
+
+Extracted from https://github.com/pytorch/pytorch/pull/177713
+diff --git a/torch/distributed/checkpoint/api.py b/torch/distributed/checkpoint/api.py
+--- a/torch/distributed/checkpoint/api.py
++++ b/torch/distributed/checkpoint/api.py
+@@ -8,7 +8,15 @@ __all__ = ["CheckpointException"]
+ 
+ 
+ def _wrap_exception(exc: BaseException) -> WRAPPED_EXCEPTION:
+-    return (exc, tb.extract_tb(exc.__traceback__))
++    summary = tb.extract_tb(exc.__traceback__)
++    # Python 3.13+ stores bytecode objects in FrameSummary._code,
++    # which cannot be pickled. Clear them so gather_object succeeds
++    # and the real exception is reported instead of a misleading
++    # "cannot pickle code objects" TypeError.
++    for frame in summary:
++        if hasattr(frame, "_code"):
++            object.__setattr__(frame, "_code", None)
++    return (exc, summary)
+ 
+ 
+ def _is_wrapped_exception(obj: Any) -> bool:

From e83332d2d2755bf68e90e829756f2fbf17ae67c0 Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Thu, 16 Apr 2026 18:04:01 +0200
Subject: [PATCH 30/30] Disable sanity_check_pip_list

---
 .../p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb           | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
index 72bb3abaa247..e38d4e84ff16 100644
--- a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
@@ -314,4 +314,6 @@ modextrapaths = {'TORCHINDUCTOR_CUTLASS_DIR': 'extra/cutlass'}
 
 tests = ['PyTorch-check-cpp-extension.py', 'PyTorch-check-cutlass.py']
 
+sanity_check_pip_list = False
+
 moduleclass = 'ai'