diff --git a/easybuild/easyconfigs/c/cuDNN/cuDNN-9.10.2.21-CUDA-12.6.0.eb b/easybuild/easyconfigs/c/cuDNN/cuDNN-9.10.2.21-CUDA-12.6.0.eb new file mode 100644 index 00000000000..427e5309a94 --- /dev/null +++ b/easybuild/easyconfigs/c/cuDNN/cuDNN-9.10.2.21-CUDA-12.6.0.eb @@ -0,0 +1,34 @@ +name = 'cuDNN' +version = '9.10.2.21' +versionsuffix = '-CUDA-%(cudaver)s' +homepage = 'https://developer.nvidia.com/cudnn' +description = """The NVIDIA CUDA Deep Neural Network library (cuDNN) is +a GPU-accelerated library of primitives for deep neural networks.""" + +toolchain = SYSTEM + +source_urls = [ + 'https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-%(cudnnarch)s/' +] +# note: cuDNN is tied to specific CUDA versions, +# see also https://docs.nvidia.com/deeplearning/cudnn/support-matrix/index.html#cudnn-cuda-hardware-versions +sources = ['%(namelower)s-linux-%(cudnnarch)s-%(version)s_cuda%(cudamajver)s-archive.tar.xz'] +checksums = [{ + '%(namelower)s-linux-sbsa-%(version)s_cuda%(cudamajver)s-archive.tar.xz': + '4d57dceba3be27a68b078ce8630525bf40ab7f1b546eb45d0b363c3eeb55f8fa', + '%(namelower)s-linux-x86_64-%(version)s_cuda%(cudamajver)s-archive.tar.xz': + 'd0defcbc4c6dad711ff4cb66d254036a300c9071b07c7b64199aacab534313c1', +}] + +dependencies = [('CUDA', '12.6.0')] + +sanity_check_paths = { + 'files': [ + 'include/cudnn.h', 'lib64/libcudnn_adv_static.a', 'lib64/libcudnn_cnn_static.a', + 'lib64/libcudnn_engines_precompiled_static.a', 'lib64/libcudnn_engines_runtime_compiled_static.a', + 'lib64/libcudnn_graph_static.a', 'lib64/libcudnn_heuristic_static.a', 'lib64/libcudnn_ops_static.a', + ], + 'dirs': ['include', 'lib64'], +} + +moduleclass = 'numlib' diff --git a/easybuild/easyconfigs/n/NCCL/NCCL-2.27.5-GCCcore-13.3.0-CUDA-12.6.0.eb b/easybuild/easyconfigs/n/NCCL/NCCL-2.27.5-GCCcore-13.3.0-CUDA-12.6.0.eb new file mode 100644 index 00000000000..4b1bd8f94a1 --- /dev/null +++ b/easybuild/easyconfigs/n/NCCL/NCCL-2.27.5-GCCcore-13.3.0-CUDA-12.6.0.eb @@ -0,0 +1,26 @@ +name = 'NCCL' +version = '2.27.5' +versionsuffix = '-CUDA-%(cudaver)s' + +homepage = 'https://developer.nvidia.com/nccl' +description = """The NVIDIA Collective Communications Library (NCCL) implements multi-GPU and multi-node collective +communication primitives that are performance optimized for NVIDIA GPUs.""" + +toolchain = {'name': 'GCCcore', 'version': '13.3.0'} + +github_account = 'NVIDIA' +source_urls = [GITHUB_SOURCE] +sources = ['v%(version)s-1.tar.gz'] +checksums = ['e8a8972fc7f7517703510ef23608d41f6484db5331fca37827b4af3f66995344'] + +builddependencies = [('binutils', '2.42')] + +dependencies = [ + ('CUDA', '12.6.0', '', SYSTEM), + ('UCX-CUDA', '1.16.0', versionsuffix), +] + +# default CUDA compute capabilities to use (override via --cuda-compute-capabilities) +cuda_compute_capabilities = ['5.0', '6.0', '7.0', '7.5', '8.0', '8.6', '9.0'] + +moduleclass = 'lib' diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_disable-test_nan_assert.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_disable-test_nan_assert.patch new file mode 100644 index 00000000000..0f60a483e5a --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_disable-test_nan_assert.patch @@ -0,0 +1,57 @@ +Disable a test that has an incomplete skip condition.
+See https://github.com/pytorch/pytorch/pull/167971 + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py +index 0a0f3ee4ca2..aff8ba0156f 100644 +--- a/test/distributed/test_c10d_nccl.py ++++ b/test/distributed/test_c10d_nccl.py +@@ -11,6 +11,7 @@ import sys + import tempfile + import threading + import time ++import unittest + import warnings + from contextlib import contextmanager + from datetime import datetime, timedelta +@@ -295,12 +296,7 @@ class ProcessGroupNCCLGroupTest(MultiProcessTestCase): + # But if we are in Sandcastle, `skip_but_pass_in_sandcastle` would return 0. + TEST_NAN_ASSERT_RETURN = 0 if IS_SANDCASTLE else signal.SIGABRT + self.special_return_code_checks = { +- self.test_nan_assert_float16.__wrapped__: TEST_NAN_ASSERT_RETURN, +- self.test_nan_assert_float32.__wrapped__: TEST_NAN_ASSERT_RETURN, +- self.test_nan_assert_float64.__wrapped__: TEST_NAN_ASSERT_RETURN, +- self.test_nan_assert_bfloat16.__wrapped__: TEST_NAN_ASSERT_RETURN, +- self.test_nan_assert_float8_e4m3fn.__wrapped__: TEST_NAN_ASSERT_RETURN, +- self.test_nan_assert_float8_e5m2.__wrapped__: TEST_NAN_ASSERT_RETURN, ++ + } + + # TORCH_NCCL_BLOCKING_WAIT overrides TORCH_NCCL_ASYNC_ERROR_HANDLING hence tests +@@ -489,24 +485,7 @@ class ProcessGroupNCCLGroupTest(MultiProcessTestCase): + torch.version.cuda is not None and int(torch.version.cuda.split(".")[0]) >= 12 + ) + +- @requires_nccl() +- @skip_but_pass_in_sandcastle_if( +- # skip for cu126 as well due to https://github.com/pytorch/pytorch/issues/153479 +- not (TEST_MULTIGPU and CUDA_12_AND_ABOVE), +- "NCCL test requires 2+ GPUs and Device side assert could cause unexpected errors in lower versions of CUDA", +- ) +- @parametrize( +- "type", +- [ +- torch.float16, +- torch.float32, +- torch.float64, +- torch.bfloat16, +- torch.float8_e4m3fn, +- torch.float8_e5m2, +- ], +- ) +- @skip_if_rocm_multiprocess ++ @unittest.skip("Wrong conditions") + def test_nan_assert(self, type): + # Expecting a device-side error when NaN is detected + os.environ["TORCH_NCCL_NAN_CHECK"] = "1" diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_enable-symbolizer-in-test_workspace_allocation_error.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_enable-symbolizer-in-test_workspace_allocation_error.patch new file mode 100644 index 00000000000..5c35b586ac8 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_enable-symbolizer-in-test_workspace_allocation_error.patch @@ -0,0 +1,28 @@ +CudaGraphTreeTests.test_workspace_allocation_error fails if TORCH_DISABLE_ADDR2LINE=1 is set +> File "/pytorch-v2.9.0/test/inductor/test_cudagraph_trees.py", line 1568, in test_workspace_allocation_error +> self.assertTrue( +> AssertionError: False is not true + +See https://github.com/pytorch/pytorch/issues/103369 + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/inductor/test_cudagraph_trees.py b/test/inductor/test_cudagraph_trees.py +--- a/test/inductor/test_cudagraph_trees.py ++++ b/test/inductor/test_cudagraph_trees.py +@@ -5,6 +5,7 @@ import functools + import gc + import importlib + import itertools ++import os + import re + import sys + import unittest +@@ -1543,6 +1544,7 @@ if HAS_CUDA_AND_TRITON: + @skipIfRocm + @unittest.skipUnless(IS_X86 and IS_LINUX, "cpp contexts are linux only") + @torch._inductor.config.patch("triton.cudagraph_trees_history_recording", True) ++ @unittest.mock.patch.dict(os.environ, {"TORCH_DISABLE_ADDR2LINE": "0"}) + def test_workspace_allocation_error(self): + 
torch._C._cuda_clearCublasWorkspaces() + diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_fix-FP16-CPU-tests-in-test_torchinductor_opinfo.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_fix-FP16-CPU-tests-in-test_torchinductor_opinfo.patch new file mode 100644 index 00000000000..0bf2d29a745 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_fix-FP16-CPU-tests-in-test_torchinductor_opinfo.patch @@ -0,0 +1,28 @@ +Many tests using Float16 on CPU fail with reference_in_float=False +See https://github.com/pytorch/pytorch/issues/169809 + +E.g.: +> TestInductorOpInfoCPU.test_comprehensive_grid_sampler_2d_cpu_float16 +> [...] +> Mismatched elements: 125 / 780 (16.0%) +> Greatest absolute difference: 0.02001953125 at index (0, 1, 3, 2) (up to 1e-05 allowed) +> Greatest relative difference: 2.34375 at index (1, 1, 2, 4) (up to 0.001 allowed) + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/inductor/test_torchinductor_opinfo.py b/test/inductor/test_torchinductor_opinfo.py +index 807ccb48a79..7e5740e0177 100644 +--- a/test/inductor/test_torchinductor_opinfo.py ++++ b/test/inductor/test_torchinductor_opinfo.py +@@ -1329,8 +1329,10 @@ class TestInductorOpInfo(TestCase): + # Triton + if has_triton(): + adjusted_kwargs.update( +- copy_to_gpu=False, reference_in_float=False ++ copy_to_gpu=False, + ) ++ if device_type == GPU_TYPE: ++ adjusted_kwargs['reference_in_float'] = False + + # skip checking gradient on CPU for now + if device_type == GPU_TYPE: diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_fix-attention-squeeze.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_fix-attention-squeeze.patch new file mode 100644 index 00000000000..851ac1f34bd --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_fix-attention-squeeze.patch @@ -0,0 +1,59 @@ +From d55c9d52cda889850484968fc55ee73bf40540ec Mon Sep 17 00:00:00 2001 +From: Chien-Chin Huang +Date: Wed, 17 Sep 2025 18:14:51 -0700 +Subject: [PATCH] [CP] Fix cuDNN CP LSE dimension bug (#163231) + +We should only unsqueeze if necessary. + +Fix https://github.com/pytorch/pytorch/issues/162743 + +Pull Request resolved: https://github.com/pytorch/pytorch/pull/163231 +Approved by: https://github.com/eqy +ghstack dependencies: #162539, #162540, #162541, #163115, #163131 +--- + .../tensor/experimental/_attention.py | 18 +++++++++++++++--- + 1 file changed, 15 insertions(+), 3 deletions(-) + +diff --git a/torch/distributed/tensor/experimental/_attention.py b/torch/distributed/tensor/experimental/_attention.py +index 6336967582429..a3345f37a170d 100644 +--- a/torch/distributed/tensor/experimental/_attention.py ++++ b/torch/distributed/tensor/experimental/_attention.py +@@ -134,6 +134,7 @@ def __init__(self, convert_to_f32: bool, seq_dim: int): + self._seq_dim = seq_dim + self._out: Optional[torch.Tensor] = None + self._lse: Optional[torch.Tensor] = None ++ self._should_lse_squeeze = False + self._convert_to_f32 = convert_to_f32 + self._out_dtype = torch.float32 + self._lse_dtype = torch.float32 +@@ -141,7 +142,14 @@ def __init__(self, convert_to_f32: bool, seq_dim: int): + def _merge_one( + self, block_out: torch.Tensor, block_lse: torch.Tensor, partial: bool + ) -> None: +- block_lse = block_lse.unsqueeze(dim=-1) ++ # The cuDNN backend preserves the last dimension for LSE. ++ # Apply unsqueeze only if the input does not already have ++ # the required dimensionality. 
++ if len(block_lse.shape) < len(block_out.shape): ++ block_lse = block_lse.unsqueeze(dim=-1) ++ self._should_lse_squeeze = True ++ assert len(block_lse.shape) == len(block_out.shape) ++ + if self._lse is None: + self._lse = block_lse + self._out = block_out +@@ -199,8 +207,12 @@ def step(self, out: torch.Tensor, lse: torch.Tensor, partial: bool) -> None: + def results(self) -> tuple[torch.Tensor, torch.Tensor]: + assert self._out is not None + assert self._lse is not None +- out, lse = self._out, self._lse.squeeze(-1) +- return out.to(self._out_dtype), lse.to(self._lse_dtype) ++ out = self._out.to(self._out_dtype) ++ if self._should_lse_squeeze: ++ lse = self._lse.squeeze(-1).to(self._lse_dtype) ++ else: ++ lse = self._lse.to(self._lse_dtype) ++ return out, lse + + + class _AttentionOp(Protocol): diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_fix-nccl-test-env.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_fix-nccl-test-env.patch new file mode 100644 index 00000000000..248d6d934b7 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_fix-nccl-test-env.patch @@ -0,0 +1,55 @@ +From 6702f545d880fd82700811e4a3508cdd76da9a69 Mon Sep 17 00:00:00 2001 +From: Alexander Grund +Date: Tue, 16 Sep 2025 17:37:06 +0000 +Subject: [PATCH] Restore environment after NcclUserBufferRegistrationTest + (#163063) + +This test sets "NCCL_ALGO=NVLS" in NcclUserBufferRegistrationTest which affects tests run in the same process such as `test_on_completion_hook_*` that fail with +> invalid usage (run with NCCL_DEBUG=WARN for details), NCCL version 2.26.2 +> ncclInvalidUsage: This usually reflects invalid usage of NCCL library. +> Last error: +> Error : no algorithm/protocol available for function Broadcast with datatype ncclInt8. NCCL_ALGO was set to NVLS. + +Pull Request resolved: https://github.com/pytorch/pytorch/pull/163063 +Approved by: https://github.com/ezyang +--- + test/distributed/test_c10d_nccl.py | 21 +++++++++++++-------- + 1 file changed, 13 insertions(+), 8 deletions(-) + +diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py +index 0d55845228da..f44394e3148c 100644 +--- a/test/distributed/test_c10d_nccl.py ++++ b/test/distributed/test_c10d_nccl.py +@@ -3145,19 +3145,24 @@ def test_invalid_nccl_blocking_wait_env(self): + class NcclUserBufferRegistrationTest(MultiProcessTestCase): + def setUp(self): + super().setUp() +- # TORCH_NCCL_BLOCKING_WAIT overrides TORCH_NCCL_ASYNC_ERROR_HANDLING hence tests +- # that use TORCH_NCCL_BLOCKING_WAIT will test it as expected. +- os.environ["TORCH_NCCL_ASYNC_ERROR_HANDLING"] = "1" + nccl_debug_file = tempfile.NamedTemporaryFile() +- os.environ["NCCL_ALGO"] = "NVLS" +- os.environ["NCCL_DEBUG"] = "INFO" +- os.environ["NCCL_DEBUG_SUBSYS"] = "NVLS" ++ nccl_env = { ++ # TORCH_NCCL_BLOCKING_WAIT overrides TORCH_NCCL_ASYNC_ERROR_HANDLING hence tests ++ # that use TORCH_NCCL_BLOCKING_WAIT will test it as expected. 
++ "TORCH_NCCL_ASYNC_ERROR_HANDLING": "1", ++ "NCCL_ALGO": "NVLS", ++ "NCCL_DEBUG": "INFO", ++ "NCCL_DEBUG_SUBSYS": "NVLS", ++ "NCCL_DEBUG_FILE": nccl_debug_file.name, ++ } + if torch.cuda.nccl.version() >= (2, 24, 3): +- os.environ["NCCL_DEBUG_SUBSYS"] = "REG,TUNING" +- os.environ["NCCL_DEBUG_FILE"] = nccl_debug_file.name ++ nccl_env["NCCL_DEBUG_SUBSYS"] = "REG,TUNING" ++ self.env_patcher = mock.patch.dict(os.environ, nccl_env) ++ self.env_patcher.start() + self._spawn_processes() + + def tearDown(self): ++ self.env_patcher.stop() + super().tearDown() + try: + os.remove(self.file_name) diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_fix-test_exclude_padding.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_fix-test_exclude_padding.patch new file mode 100644 index 00000000000..b74d565bc51 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_fix-test_exclude_padding.patch @@ -0,0 +1,33 @@ +PadMMTest.test_exclude_padding fails on H100 with +> self.assertTrue(len(local_cache) == 2) +> AssertionError: False is not true + +Increasing the size triggers the intended code. +See https://github.com/pytorch/pytorch/pull/169177 + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/inductor/test_pad_mm.py b/test/inductor/test_pad_mm.py +--- a/test/inductor/test_pad_mm.py ++++ b/test/inductor/test_pad_mm.py +@@ -425,7 +426,10 @@ class PadMMTest(TestCase): + def mm(a, b): + return a @ b + +- mm(torch.rand([25, 25], device="cuda"), torch.rand([25, 25], device="cuda")) ++ # Size must be big enough such that `is_mm_compute_bound` returns True and we need padding to 4 elements ++ # machine balance is ~8.3 (A100), 14.1 (H100), size must be 3x that, see arithmetic_intensity for M=N=K ++ size = [59, 59] ++ mm(torch.rand(size, device="cuda"), torch.rand(size, device="cuda")) + local_cache = get_pad_cache().get_local_cache() + self.assertTrue(len(local_cache) == 2) + FileCheck().check_count("exclude_pad:False", 2, exactly=True).run( +@@ -436,7 +440,7 @@ class PadMMTest(TestCase): + def mm(a, b): + return (a + 1) @ b + +- mm(torch.rand([25, 25], device="cuda"), torch.rand([25, 25], device="cuda")) ++ mm(torch.rand(size, device="cuda"), torch.rand(size, device="cuda")) + local_cache = get_pad_cache().get_local_cache() + # reuse original base timing + self.assertTrue(len(local_cache) == 3) diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_fix-test_version_error.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_fix-test_version_error.patch new file mode 100644 index 00000000000..819b8577356 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_fix-test_version_error.patch @@ -0,0 +1,27 @@ +TestSaveLoad.test_version_error causes a failure due to TEMPDIR being set by EasyBuild: + +> Ran into the following error when deserializing: [enforce fail at inline_container.cc:332] . 
file in archive is not in a subdirectory tmpi40i4vmn/: easybuild-tmp/archive_version + +Fix the code to handle that, see https://github.com/pytorch/pytorch/pull/169936 + +diff --git a/test/export/test_serialize.py b/test/export/test_serialize.py +index faef9b455a0..e3a463014fb 100644 +--- a/test/export/test_serialize.py ++++ b/test/export/test_serialize.py +@@ -7,6 +7,7 @@ with test_sym_bool) + import copy + import io + import math ++import os + import tempfile + import unittest + import zipfile +@@ -1915,7 +1916,7 @@ class TestSaveLoad(TestCase): + with tempfile.NamedTemporaryFile(suffix=".pt2") as f: + save(ep, f.name) + f.seek(0) +- file_prefix = f.name.split("/")[2].split(".")[0] ++ file_prefix = os.path.splitext(os.path.basename(f.name))[0] + + # Create a new file and copy things over, but modify the + # archive version diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_honor-XDG_CACHE_HOME.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_honor-XDG_CACHE_HOME.patch new file mode 100644 index 00000000000..e2a096dd8b9 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_honor-XDG_CACHE_HOME.patch @@ -0,0 +1,29 @@ +Avoid PyTorch trying to use $HOME if XDG_CACHE_HOME is set. +See https://github.com/pytorch/pytorch/pull/168232 + +Author: Alexander Grund (TU Dresden) + +diff --git a/torch/csrc/distributed/c10d/FlightRecorder.cpp b/torch/csrc/distributed/c10d/FlightRecorder.cpp +--- a/torch/csrc/distributed/c10d/FlightRecorder.cpp ++++ b/torch/csrc/distributed/c10d/FlightRecorder.cpp +@@ -36,8 +36,18 @@ DebugInfoWriter& DebugInfoWriter::getWriter(int rank) { + if (writer_ == nullptr) { + // Attempt to write to running user's HOME directory cache folder - if it + // exists. +- auto homeDir = getCvarString({"HOME"}, "/tmp"); +- auto cacheDirPath = std::filesystem::path(homeDir + "/.cache/torch"); ++ #ifdef _WIN32 ++ const char* cacheHome = nullptr; ++ #else ++ // Uses XDG_CACHE_HOME if it's set ++ const char* cacheHome = std::getenv("XDG_CACHE_HOME"); ++ #endif ++ std::string cacheRoot; ++ if (cacheHome) ++ cacheRoot = cacheHome; ++ else ++ cacheRoot = getCvarString({"HOME"}, "/tmp") + "/.cache"; ++ auto cacheDirPath = std::filesystem::path(cacheRoot + "/torch"); + // Create the .cache directory if it doesn't exist + std::filesystem::create_directories(cacheDirPath); + auto defaultLocation = cacheDirPath / "comm_lib_trace_rank_"; diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_increase-tolerance-in-test_transformers.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_increase-tolerance-in-test_transformers.patch new file mode 100644 index 00000000000..76180cb4481 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_increase-tolerance-in-test_transformers.patch @@ -0,0 +1,21 @@ +When not using Intel MKL this shows a tolerance error in +TestSDPACpuOnlyCPU.test_scaled_dot_product_fused_attention_mask_vs_math_cpu_fused_kernel0_float32_batch_size_12_q_seq_len_1030_kv_seq_len_17_n_head_1_head_dim_8_mask_dim_2_bool_mask_True_train_True_casual_False_set_attn_mask_True_cpu_float32 + +> self.assertEqual(grad_k_actual, grad_k_ref, atol=tol_grad.atol, rtol=tol_grad.rtol) +> Mismatched elements: 1 / 1632 (0.1%) +> Greatest absolute difference: 1.245737075805664e-05 at index (9, 0, 15, 1) (up to 1e-05 allowed) +> Greatest relative difference: 5.157565828994848e-05 at index (9, 0, 15, 1) (up to 5e-06 allowed) + +diff --git a/test/test_transformers.py b/test/test_transformers.py +index 5b240e1f046..2e1b4091d35 100644 +--- a/test/test_transformers.py ++++ 
b/test/test_transformers.py +@@ -2153,6 +2153,8 @@ class TestSDPACpuOnly(NNTestCase): + tol_grad = Tolerances(5e-2, 5e-2) + if dtype is torch.float16: + tol_grad = Tolerances(1e-1, 1e-1) ++ if dtype is torch.float32: ++ tol_grad = Tolerances(1.3e-5, 5.2e-5) + for mask_shape in itertools.product( + [q_seq_len, 1], [kv_seq_len, 1] + ) if mask_dim == 2 else itertools.product( diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_readd-support-for-nvidia-cutlass-python-package.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_readd-support-for-nvidia-cutlass-python-package.patch new file mode 100644 index 00000000000..0e2848280d1 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_readd-support-for-nvidia-cutlass-python-package.patch @@ -0,0 +1,124 @@ +Allow use of the NVIDIA CUTLASS Python package if installed. +See https://github.com/pytorch/pytorch/pull/160180 + +Author: Alexander Grund (TU Dresden) + +diff -ur a/torch/_inductor/codecache.py b/torch/_inductor/codecache.py +--- a/torch/_inductor/codecache.py 2025-10-15 19:15:08.000000000 +0200 ++++ b/torch/_inductor/codecache.py 2025-10-24 18:07:49.519431015 +0200 +@@ -3628,13 +3628,15 @@ + return "nvcc" + + +-def _cutlass_path() -> str: ++def _cutlass_path() -> Optional[str]: + if config.is_fbcode(): + from libfb.py import parutil + + return parutil.get_dir_path("cutlass-4-headers") + else: +- return config.cuda.cutlass_dir ++ from torch._inductor.codegen.cuda.cutlass_utils import try_import_cutlass ++ ++ return config.cuda.cutlass_dir if try_import_cutlass() else None + + + def _cutlass_paths() -> list[str]: +@@ -3649,6 +3651,8 @@ + def _clone_cutlass_paths(build_root: str) -> list[str]: + paths = _cutlass_paths() + cutlass_root = _cutlass_path() ++ if cutlass_root is None: ++ return [] + for path in _cutlass_paths(): + old_path = os.path.join(cutlass_root, path) + new_path = os.path.join(build_root, path) +@@ -3657,10 +3661,12 @@ + + + def _cutlass_include_paths() -> list[str]: +- cutlass_path = _cutlass_path() ++ cutlass_root = _cutlass_path() ++ if cutlass_root is None: ++ return [] + return [ + # Use realpath to get canonical absolute paths, in order not to mess up cache keys +- os.path.realpath(os.path.join(cutlass_path, path)) ++ os.path.realpath(os.path.join(cutlass_root, path)) + for path in _cutlass_paths() + ] + +diff -ur a/torch/_inductor/codegen/cuda/cutlass_utils.py b/torch/_inductor/codegen/cuda/cutlass_utils.py +--- a/torch/_inductor/codegen/cuda/cutlass_utils.py 2025-10-15 19:15:08.000000000 +0200 ++++ b/torch/_inductor/codegen/cuda/cutlass_utils.py 2025-10-24 18:07:49.520431003 +0200 +@@ -1,6 +1,7 @@ + # mypy: allow-untyped-defs + import atexit + import functools ++import importlib.metadata + import logging + import os + import shutil +@@ -15,6 +16,7 @@ + import torch + from torch._inductor.runtime.runtime_utils import dynamo_timed + from torch._inductor.utils import clear_on_fresh_cache ++from torch._vendor.packaging.version import Version + from torch.utils._ordered_set import OrderedSet + + from ... import config +@@ -73,7 +75,9 @@ + """ + We want to support three ways of passing in CUTLASS: + 1. fbcode, handled by the internal build system. +- 2. User specifies cutlass_dir. The default is ../third_party/cutlass/, ++ 2. pip install nvidia-cutlass, which provides the cutlass_library package ++ and the header files in the cutlass_library/source directory. ++ 3. User specifies cutlass_dir. The default is ../third_party/cutlass/, + which is the directory when developers build from source. 
+ """ + if config.is_fbcode(): +@@ -89,6 +93,34 @@ + + return True + ++ try: ++ cutlass_version = Version(importlib.metadata.version("cutlass")) ++ if cutlass_version < Version("3.7"): ++ log.warning("CUTLASS version < 3.7 is not recommended.") ++ ++ import cutlass_library # type: ignore[import-not-found] # noqa: F811 ++ ++ log.debug( ++ "Found cutlass_library in python search path, overriding config.cuda.cutlass_dir" ++ ) ++ cutlass_library_dir = os.path.dirname(cutlass_library.__file__) ++ assert os.path.isdir(cutlass_library_dir), ( ++ f"{cutlass_library_dir} is not a directory" ++ ) ++ config.cuda.cutlass_dir = os.path.abspath( ++ os.path.join( ++ cutlass_library_dir, ++ "source", ++ ) ++ ) ++ ++ return True ++ except (ModuleNotFoundError, importlib.metadata.PackageNotFoundError): ++ log.debug( ++ "cutlass_library not found in sys.path, trying to import from config.cuda.cutlass_dir", ++ exc_info=True, ++ ) ++ + # Copy CUTLASS python scripts to a temp dir and add the temp dir to Python search path. + # This is a temporary hack to avoid CUTLASS module naming conflicts. + # TODO(ipiszy): remove this hack when CUTLASS solves Python scripts packaging structure issues. +@@ -156,7 +188,7 @@ + ) + + try: +- import cutlass # noqa: F401, F811 ++ import cutlass # noqa: F401 + import cutlass_library.generator # noqa: F401 + import cutlass_library.library # noqa: F401 + import cutlass_library.manifest # noqa: F401 diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_remove-faulty-close.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_remove-faulty-close.patch new file mode 100644 index 00000000000..0eeea901157 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_remove-faulty-close.patch @@ -0,0 +1,48 @@ +commit d3d62ad44284abff4fcd0c70e245739c976bf5e1 +Author: Alexander Grund +Date: Tue Nov 25 13:54:26 2025 +0100 + + Avoid closing random file handles in Inductor + + `CppCodeCache.load` returns a `ctypes.CDLL`. + That does not have a (Python class) `close` function, so calling + `self.DLL.close()` calls whatever C function with name `close` happens + to exist. This is usually the glibc `close` that closes (file) handles. + As the argument is missing it closes whatever happens to be in the + register at that point. + + In some tests this seems to close "fd=1", i.e. stdout. Subsequent + writes/prints then fail with + > OSError: [Errno 9] Bad file descriptor + + Simply remove the `close` call for now. + +diff --git a/torch/_inductor/autotune_process.py b/torch/_inductor/autotune_process.py +index 1d1687141fb..66b741fafe2 100644 +--- a/torch/_inductor/autotune_process.py ++++ b/torch/_inductor/autotune_process.py +@@ -882,14 +882,6 @@ class CppBenchmarkRequest(CPUDeviceBenchmarkMixin, BenchmarkRequest): + *self.extra_args, + ) + +- def cleanup_run_fn(self) -> None: +- if self.DLL is not None: +- """ +- Check close attr due to it crash on Windows.
+- """ +- if hasattr(self.DLL, "close"): +- self.DLL.close() +- + def __str__(self) -> str: + return f"{self.kernel_name=}" + +@@ -939,9 +931,6 @@ class CuteDSLBenchmarkRequest(GPUDeviceBenchmarkMixin, BenchmarkRequest): + + return run_kernel + +- def cleanup_run_fn(self) -> None: +- """Clean up any resources used by the kernel.""" +- + + @functools.cache + def get_tuning_process_pool() -> TuningProcessPool: diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_revert-pybind11-3-change.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_revert-pybind11-3-change.patch new file mode 100644 index 00000000000..1b831f45fa5 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_revert-pybind11-3-change.patch @@ -0,0 +1,68 @@ +Revert https://github.com/pytorch/pytorch/pull/161063 + +The PR introduced changes required for the pybind11 3.x API which makes it incompatible with pybind11 2.x + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/inductor/test_cpu_cpp_wrapper.py b/test/inductor/test_cpu_cpp_wrapper.py +index 47a8f3aa063..4b4daaef5c4 100644 +--- a/test/inductor/test_cpu_cpp_wrapper.py ++++ b/test/inductor/test_cpu_cpp_wrapper.py +@@ -268,7 +268,7 @@ if RUN_CPU: + "test_multi_threading", + condition=not IS_WINDOWS, + # Two threads compile, so we expect the output code to be printed twice. +- code_string_count={"py::gil_scoped_release_simple release;": 2}, ++ code_string_count={"py::gil_scoped_release release;": 2}, + ), + BaseTest("test_profiler_mark_wrapper_call"), + BaseTest( +diff --git a/torch/_inductor/codegen/cpp_wrapper_cpu.py b/torch/_inductor/codegen/cpp_wrapper_cpu.py +index 83d1d061467..77f9c368ed3 100644 +--- a/torch/_inductor/codegen/cpp_wrapper_cpu.py ++++ b/torch/_inductor/codegen/cpp_wrapper_cpu.py +@@ -585,7 +585,7 @@ class CppWrapperCpu(PythonWrapperCodegen): + # Weights are promoted in the JIT mode + num_args = len(V.graph.graph_inputs) + len(V.graph.constants) + # release GIL to support multiple instances inference (in different threads of the same process) +- self.prefix.splice("py::gil_scoped_release_simple release;") ++ self.prefix.splice("py::gil_scoped_release release;") + + self.prefix.splice( + f""" +@@ -2310,7 +2310,7 @@ class CppWrapperCpu(PythonWrapperCodegen): + + scoped_lines.writeline("{") + with scoped_lines.indent(): +- scoped_lines.writeline("py::gil_scoped_acquire_simple acquire;") ++ scoped_lines.writeline("py::gil_scoped_acquire acquire;") + scoped_lines.writelines(lines_in_scope.split("\n")) + scoped_lines.writelines("}") + return scoped_lines._lines +diff --git a/torch/_inductor/codegen/cpp_wrapper_cpu_array_ref.py b/torch/_inductor/codegen/cpp_wrapper_cpu_array_ref.py +index 63c5bc2debe..fd145ece606 100644 +--- a/torch/_inductor/codegen/cpp_wrapper_cpu_array_ref.py ++++ b/torch/_inductor/codegen/cpp_wrapper_cpu_array_ref.py +@@ -297,7 +297,7 @@ class CppWrapperCpuArrayRef(CppWrapperCpu): + # Weights are promoted in the JIT mode + num_args = len(V.graph.graph_inputs) + len(V.graph.constants) + # release GIL to support multiple instances inference (in different threads of the same process) +- self.prefix.splice("py::gil_scoped_release_simple release;") ++ self.prefix.splice("py::gil_scoped_release release;") + + self.prefix.splice( + f""" +diff --git a/torch/csrc/inductor/cpp_wrapper/common.h b/torch/csrc/inductor/cpp_wrapper/common.h +index a2eebfcc860..9d9ae16462c 100644 +--- a/torch/csrc/inductor/cpp_wrapper/common.h ++++ b/torch/csrc/inductor/cpp_wrapper/common.h +@@ -6,7 +6,8 @@ + #include + + #include +-#include 
++#define PYBIND11_SIMPLE_GIL_MANAGEMENT ++#include + + // Include some often-used cpp_wrapper headers, for precompiling. + #include diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_benchmark_on_non_zero_device.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_benchmark_on_non_zero_device.patch new file mode 100644 index 00000000000..b0a55ad4912 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_benchmark_on_non_zero_device.patch @@ -0,0 +1,23 @@ +inductor/test_benchmark_fusion.py BenchmarkingTest.test_benchmark_on_non_zero_device fails with +> self.assertTrue(hit_count > 0) +> AssertionError: False is not true + +Related: https://github.com/pytorch/pytorch/issues/160514 + +Author: Alexander Grund (TU Dresden) + +--- a/test/inductor/test_benchmark_fusion.py ++++ b/test/inductor/test_benchmark_fusion.py +@@ -206,10 +206,7 @@ if HAS_CUDA_AND_TRITON: + copy_tests(BenchmarkFusionTestTemplate, BenchmarkFusionCudaTest, "cuda") + + class BenchmarkingTest(TestCase): +- @unittest.skipIf( +- torch.cuda.device_count() < 2, "The test need at least 2 devices" +- ) +- @skip_if_cpp_wrapper("This tests triton scheduling directly") ++ @unittest.skip("Mocking fails") + def test_benchmark_on_non_zero_device(self): + hit_count = 0 + with torch.cuda.device("cuda:0"): + diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_convolution1-on-H100.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_convolution1-on-H100.patch new file mode 100644 index 00000000000..e0c0a45b341 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_convolution1-on-H100.patch @@ -0,0 +1,30 @@ +test_select_algorithm.py TestSelectAlgorithm.test_convolution1 fails on H100 with: + +> Mismatched elements: 19584 / 23120 (84.7%) +> Greatest absolute difference: 132.32015991210938 at index (0, 22, 4, 13) (up to 0.0001 allowed) +> Greatest relative difference: inf at index (0, 0, 1, 0) (up to 0.0001 allowed) + +See https://github.com/pytorch/pytorch/issues/143412 + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/inductor/test_select_algorithm.py b/test/inductor/test_select_algorithm.py +index b30cdc2d946..25d3c068133 100644 +--- a/test/inductor/test_select_algorithm.py ++++ b/test/inductor/test_select_algorithm.py +@@ -27,6 +27,7 @@ from torch.testing._internal.common_utils import IS_LINUX, skipIfRocm, skipIfXpu + from torch.testing._internal.inductor_utils import ( + GPU_TYPE, + HAS_GPU, ++ IS_H100, + requires_gpu, + requires_triton, + ) +@@ -295,6 +296,7 @@ class TestSelectAlgorithm(TestCase): + foo(torch.randn(64, 64, device=GPU_TYPE)) + self.assertEqual(counters["inductor"]["select_algorithm_autotune"], 1) + ++ @unittest.skipIf(IS_H100, "Fails on H100, see #143412") + @expectedFailureDynamicWrapper + @patches + def test_convolution1(self): diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_inductor_all_gather_into_tensor_coalesced.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_inductor_all_gather_into_tensor_coalesced.patch new file mode 100644 index 00000000000..fe992ece4f5 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_inductor_all_gather_into_tensor_coalesced.patch @@ -0,0 +1,19 @@ +The test fails with +> RuntimeError: Expected to find "buf0 = torch.ops._c10d_functional.all_gather_into_tensor_coalesced.default([arg3_1, arg2_1, arg1_1, arg0_1]" but did not find it + +Also upstream: https://github.com/pytorch/pytorch/issues/146806 + +Author: Alexander Grund 
(TU Dresden) +diff --git a/test/distributed/test_c10d_functional_native.py b/test/distributed/test_c10d_functional_native.py +index bafc781b591..60fc47f63e4 100644 +--- a/test/distributed/test_c10d_functional_native.py ++++ b/test/distributed/test_c10d_functional_native.py +@@ -997,7 +997,7 @@ class CompileTest(TestCase): + AOTIRunnerUtil.run(func, (arg,)) + torch.cuda.synchronize() + +- @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") ++ @unittest.skip("Fails") + @fresh_cache() + def test_inductor_all_gather_into_tensor_coalesced(self): + def func(args: list[torch.Tensor]) -> torch.Tensor: diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_original_aten_preserved_pad_mm.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_original_aten_preserved_pad_mm.patch new file mode 100644 index 00000000000..88d176f6051 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_original_aten_preserved_pad_mm.patch @@ -0,0 +1,19 @@ +Skip test_pad_mm.py PadMMTest.test_original_aten_preserved_pad_mm failing on: +> File "/dev/shm/pytorch-v2.9.1/test/inductor/test_pad_mm.py", line 538, in test_original_aten_preserved_pad_mm +> self.assertEqual(counters["inductor"]["pattern_matcher_count"], 1) + +See https://github.com/pytorch/pytorch/issues/170562 + +Author: Alexander Grund (TU Dresden) +diff --git a/test/inductor/test_pad_mm.py b/test/inductor/test_pad_mm.py +index 781f4588e14..b6f0fcebb3c 100644 +--- a/test/inductor/test_pad_mm.py ++++ b/test/inductor/test_pad_mm.py +@@ -508,6 +508,7 @@ class PadMMTest(TestCase): + + assert torch.allclose(res2, mm_expected_result), "MM results are not identical" + ++ @unittest.skip("Fails") + @fresh_cache() + @inductor_config.patch( + { diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_override-without-CUDA.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_override-without-CUDA.patch new file mode 100644 index 00000000000..bc2b927e0a0 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_override-without-CUDA.patch @@ -0,0 +1,35 @@ +This test fails during creation of the tests at startup: +> File "/var/lib/jenkins/workspace/test/test_overrides.py", line 683, in _simple_type_parser +> return torch.Stream() +> RuntimeError: CUDA error: CUDA driver version is insufficient for CUDA runtime version + +See https://github.com/pytorch/pytorch/pull/166625 + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/test_overrides.py b/test/test_overrides.py +index 8454677856d..8df233e279f 100644 +--- a/test/test_overrides.py ++++ b/test/test_overrides.py +@@ -9,9 +9,9 @@ import pprint + import pickle + import collections + import unittest +-import os ++import contextlib + +-from torch.testing._internal.common_utils import TestCase, run_tests, TEST_WITH_CROSSREF ++from torch.testing._internal.common_utils import TestCase, run_tests, TEST_CUDA, TEST_WITH_CROSSREF + from torch.overrides import ( + handle_torch_function, + has_torch_function, +@@ -30,8 +30,7 @@ from torch.utils._pytree import tree_map + + Tensor = torch.Tensor + +-if os.getenv("ATEN_CPU_CAPABILITY") in ("default", "avx2"): +- # This test is not supported on ARM ++if not TEST_CUDA: + print( + "Skipping due to failing when cuda build runs on non cuda machine, " + + "see https://github.com/pytorch/pytorch/pull/150059 for example" diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_unbacked_reduction.patch 
b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_unbacked_reduction.patch new file mode 100644 index 00000000000..bfb54615bf5 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_unbacked_reduction.patch @@ -0,0 +1,18 @@ +TestInductorDynamicCPU.test_unbacked_reduction_cpu doesn't only fail on ROCM with: +> AssertionError: expected to fail, but actually passed + + +See https://github.com/pytorch/pytorch/issues/154217 + +Author: Alexander Grund (TU Dresden) + +--- a/test/inductor/test_torchinductor_dynamic_shapes.py ++++ b/test/inductor/test_torchinductor_dynamic_shapes.py +@@ -513,6 +513,7 @@ class TestInductorDynamic(TestCase): + ).sum().backward() + self.assertEqual(t.grad, expect) + ++ @unittest.skip("Fails on CPU") + @torch._dynamo.config.patch(capture_scalar_outputs=True) + def test_unbacked_reduction(self, device): + expect_fail = ( diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-tests-requiring-CUDA-12.8.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-tests-requiring-CUDA-12.8.patch new file mode 100644 index 00000000000..a4aadc780df --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-tests-requiring-CUDA-12.8.patch @@ -0,0 +1,122 @@ +These tests use Triton to generate PTX code and then compile that with NVCC. + +As Triton 3.5 uses PTXAS from CUDA 12.8 it cannot be compiled with NVCC from CUDA 12.6. + +Failures look like: +> ptxas /tmp/torchinductor_s3248973/bvqcnu2o7/2mwinejhnbvqcnu2o73mk3zrx6.ptx, line 5; fatal : Unsupported .version 8.7; current version is '8.5' + +in following tests: +- test_simple_multi_arch +- test_compile_after_package_multi_arch +- test_compile_after_package_static +- test_compile_standalone_cos +- test_compile_with_exporter +- test_compile_with_exporter_weights + +See https://github.com/pytorch/pytorch/issues/168353 + +Author: Alexander Grund (TU Dresden) + +--- a/test/inductor/test_aot_inductor.py ++++ b/test/inductor/test_aot_inductor.py +@@ -39,7 +39,7 @@ from torch.export.pt2_archive._package import load_pt2 + from torch.testing import FileCheck + from torch.testing._internal import common_utils + from torch.testing._internal.common_cuda import ( +- _get_torch_cuda_version, ++ requires_triton_ptxas_compat, + PLATFORM_SUPPORTS_FLASH_ATTENTION, + PLATFORM_SUPPORTS_FP8, + PLATFORM_SUPPORTS_MEM_EFF_ATTENTION, +@@ -239,9 +239,7 @@ class AOTInductorTestsTemplate: + # Skip embed_kernel_binary == True for now as it shows random + # failure on CI + @common_utils.parametrize("embed_kernel_binary", [False]) +- @unittest.skipIf( +- _get_torch_cuda_version() < (12, 6), "Test is only supported on CUDA 12.6+" +- ) ++ @requires_triton_ptxas_compat + def test_simple_multi_arch(self, embed_kernel_binary): + if self.device != GPU_TYPE: + raise unittest.SkipTest("requires GPU_TYPE") +diff --git a/test/inductor/test_aot_inductor_package.py b/test/inductor/test_aot_inductor_package.py +index 0eb1057c802..843f63ff17d 100644 +--- a/test/inductor/test_aot_inductor_package.py ++++ b/test/inductor/test_aot_inductor_package.py +@@ -27,7 +27,7 @@ from torch.export.pt2_archive._package import ( + load_pt2, + load_weights_to_pt2_contents, + ) +-from torch.testing._internal.common_cuda import _get_torch_cuda_version ++from torch.testing._internal.common_cuda import _get_torch_cuda_version, requires_triton_ptxas_compat + from torch.testing._internal.common_utils import ( + IS_FBCODE, + skipIfRocm, +@@ -319,9 +319,7 @@ class TestAOTInductorPackage(TestCase): + actual = optimized(*example_inputs) + 
self.assertTrue(torch.allclose(actual, expected)) + +- @unittest.skipIf( +- _get_torch_cuda_version() < (12, 6), "Test is only supported on CUDA 12.6+" +- ) ++ @requires_triton_ptxas_compat + @unittest.skipIf(IS_FBCODE, "cmake won't work in fbcode") + @skipIfRocm # doesn't support multi-arch binary + @skipIfXpu # doesn't support multi-arch binary +@@ -366,9 +364,7 @@ class TestAOTInductorPackage(TestCase): + actual = optimized(*example_inputs) + self.assertTrue(torch.allclose(actual, expected)) + +- @unittest.skipIf( +- _get_torch_cuda_version() < (12, 6), "Test is only supported on CUDA 12.6+" +- ) ++ @requires_triton_ptxas_compat + @unittest.skipIf(IS_FBCODE, "cmake won't work in fbcode") + @skipIfXpu # build system may be different + @torch._inductor.config.patch("test_configs.use_libtorch", True) +@@ -429,6 +425,7 @@ class TestAOTInductorPackage(TestCase): + self.cmake_compile(model, example_inputs, options, "") + + @unittest.skipIf(IS_FBCODE, "cmake won't work in fbcode") ++ @requires_triton_ptxas_compat + @skipIfXpu # build system may be different + @torch._inductor.config.patch("test_configs.use_libtorch", True) + def test_compile_standalone_cos(self): +@@ -461,9 +458,7 @@ class TestAOTInductorPackage(TestCase): + a_path = build_path / "libcos.a" + self.assertTrue(a_path.exists()) + +- @unittest.skipIf( +- _get_torch_cuda_version() < (12, 6), "Test is only supported on CUDA 12.6+" +- ) ++ @requires_triton_ptxas_compat + @unittest.skipIf(IS_FBCODE, "cmake won't work in fbcode") + @skipIfRocm # doesn't support multi-arch binary + @skipIfXpu # doesn't support multi-arch binary +@@ -519,9 +514,7 @@ class TestAOTInductorPackage(TestCase): + " 0 0 0\n 0 0 0\n[ CPUFloatType{3,3} ]\n", + ) + +- @unittest.skipIf( +- _get_torch_cuda_version() < (12, 6), "Test is only supported on CUDA 12.6+" +- ) ++ @requires_triton_ptxas_compat + @unittest.skipIf(IS_FBCODE, "cmake won't work in fbcode") + @skipIfRocm # doesn't support multi-arch binary + @skipIfXpu # doesn't support multi-arch binary +diff --git a/torch/testing/_internal/common_cuda.py b/torch/testing/_internal/common_cuda.py +index be284429114..3bd0e0a904f 100644 +--- a/torch/testing/_internal/common_cuda.py ++++ b/torch/testing/_internal/common_cuda.py +@@ -373,6 +373,11 @@ def xfailIfSM120OrLater(func): + def xfailIfDistributedNotSupported(func): + return func if not (IS_MACOS or IS_JETSON) else unittest.expectedFailure(func) + ++# When using nvcc from the CUDA toolkit, its version must be at least the one from the ptxas bundled with Triton ++TRITON_PTXAS_VERSION = (12, 8) ++requires_triton_ptxas_compat = unittest.skipIf(torch.version.hip is None and _get_torch_cuda_version() < TRITON_PTXAS_VERSION, ++ "Requires CUDA 12.8 to match Triton's ptxas version") ++ + # Importing this module should NOT eagerly initialize CUDA + if not CUDA_ALREADY_INITIALIZED_ON_IMPORT: + assert not torch.cuda.is_initialized() diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-unexpected-success-in-test_fake_export.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-unexpected-success-in-test_fake_export.patch new file mode 100644 index 00000000000..3667657cc17 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-unexpected-success-in-test_fake_export.patch @@ -0,0 +1,104 @@ +Unexpected success in e.g.
TestExportOpInfoCPU.test_fake_export___getitem___cpu_float32 + +Same with PYPI package and reported in https://github.com/pytorch/pytorch/pull/164166 + +Skip all instead of XFailing + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/export/test_export_opinfo.py b/test/export/test_export_opinfo.py +index 35d8b2895bd..409a305a3aa 100644 +--- a/test/export/test_export_opinfo.py ++++ b/test/export/test_export_opinfo.py +@@ -22,54 +22,54 @@ from torch.utils import _pytree as pytree + + # following are failing with regular torch.export.export + export_failures = { +- xfail("allclose"), +- xfail("combinations"), +- xfail("corrcoef"), +- xfail("cov"), +- xfail("equal"), +- xfail("linalg.lstsq"), +- xfail("linalg.lstsq", "grad_oriented"), +- xfail("nn.functional.ctc_loss"), +- xfail("nn.functional.gaussian_nll_loss"), +- xfail("sparse.sampled_addmm"), +- xfail("tensor_split"), ++ skip("allclose"), ++ skip("combinations"), ++ skip("corrcoef"), ++ skip("cov"), ++ skip("equal"), ++ skip("linalg.lstsq"), ++ skip("linalg.lstsq", "grad_oriented"), ++ skip("nn.functional.ctc_loss"), ++ skip("nn.functional.gaussian_nll_loss"), ++ skip("sparse.sampled_addmm"), ++ skip("tensor_split"), + } + + # following are failing fake export on cuda device + fake_export_failures = { +- xfail("geqrf"), +- xfail("histogram"), +- xfail("masked.amax"), +- xfail("masked.amin"), +- xfail("masked.argmax"), +- xfail("masked.argmin"), +- xfail("masked.logaddexp"), +- xfail("masked.logsumexp"), +- xfail("masked.mean"), +- xfail("masked.prod"), +- xfail("masked.std"), +- xfail("masked.sum"), +- xfail("masked.var"), +- xfail("nn.functional.grid_sample"), +- xfail("to_sparse"), ++ skip("geqrf"), ++ skip("histogram"), ++ skip("masked.amax"), ++ skip("masked.amin"), ++ skip("masked.argmax"), ++ skip("masked.argmin"), ++ skip("masked.logaddexp"), ++ skip("masked.logsumexp"), ++ skip("masked.mean"), ++ skip("masked.prod"), ++ skip("masked.std"), ++ skip("masked.sum"), ++ skip("masked.var"), ++ skip("nn.functional.grid_sample"), ++ skip("to_sparse"), + # cannot xfail as it is passing for cpu-only build + skip("nn.functional.conv2d"), + skip("nn.functional.scaled_dot_product_attention"), + # following are failing due to OptionalDeviceGuard +- xfail("__getitem__"), +- xfail("nn.functional.batch_norm"), +- xfail("nn.functional.instance_norm"), +- xfail("nn.functional.multi_margin_loss"), +- xfail("nonzero"), ++ skip("__getitem__"), ++ skip("nn.functional.batch_norm"), ++ skip("nn.functional.instance_norm"), ++ skip("nn.functional.multi_margin_loss"), ++ skip("nonzero"), + } + + fake_decomposition_failures = { +- xfail("linalg.matrix_rank"), +- xfail("nn.functional.binary_cross_entropy_with_logits"), +- xfail("nn.functional.instance_norm"), +- xfail("nn.functional.multi_margin_loss"), +- xfail("repeat_interleave"), +- xfail("take"), ++ skip("linalg.matrix_rank"), ++ skip("nn.functional.binary_cross_entropy_with_logits"), ++ skip("nn.functional.instance_norm"), ++ skip("nn.functional.multi_margin_loss"), ++ skip("repeat_interleave"), ++ skip("take"), + } + + diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_update-exptected-output-for-z3-4.13.0.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_update-exptected-output-for-z3-4.13.0.patch new file mode 100644 index 00000000000..4050e2e148c --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_update-exptected-output-for-z3-4.13.0.patch @@ -0,0 +1,49 @@ +Revert part of https://github.com/pytorch/pytorch/pull/158905 + +We use Z3 4.13.0 which has different output 
in this case compared to 4.13.1+. +This causes failures in ExcTests.test_trigger_on_error & ExcTests.test_trigger_bisect_on_error + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/dynamo/test_exc.py b/test/dynamo/test_exc.py +--- a/test/dynamo/test_exc.py ++++ b/test/dynamo/test_exc.py +@@ -253,13 +253,13 @@ translation validation failed. + + Model: + ==> L['shape'][0]: 0 +- ==> L['shape'][1]: 0 +- ==> L['shape'][2]: 0 ++ ==> L['shape'][1]: 1 ++ ==> L['shape'][2]: 1 + ==> L['x'].size()[0]: 3 + ==> L['x'].storage_offset(): 0 + ==> L['x'].stride()[0]: 1 +- ==> s3: 0 +- ==> s52: 0 ++ ==> s3: 1 ++ ==> s52: 1 + ==> s77: 3 + ==> s86: 0 + +@@ -317,16 +317,16 @@ Failure occurred while running node: + %split : [num_users=3] = call_method[target=split](args = (%l_x_, (%l_shape_0_, %l_shape_1_, %l_shape_2_)), kwargs = {}) + + Model: +- ==> L['shape'][0]: 0 +- ==> L['shape'][1]: 0 ++ ==> L['shape'][0]: 1 ++ ==> L['shape'][1]: 1 + ==> L['shape'][2]: 0 + ==> L['x'].size()[0]: 3 + ==> L['x'].storage_offset(): 0 + ==> L['x'].stride()[0]: 1 + ==> s3: 0 +- ==> s52: 0 ++ ==> s52: 1 + ==> s77: 3 +- ==> s86: 0 ++ ==> s86: 1 + + Assertions: + ==> (== 0 L['x'].storage_offset()) diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2024a-CUDA-12.6.0.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2024a-CUDA-12.6.0.eb new file mode 100644 index 00000000000..bf1333f3221 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2024a-CUDA-12.6.0.eb @@ -0,0 +1,279 @@ +name = 'PyTorch' +version = '2.9.1' +versionsuffix = '-CUDA-%(cudaver)s' + +homepage = 'https://pytorch.org/' +description = """Tensors and Dynamic neural networks in Python with strong GPU acceleration. +PyTorch is a deep learning framework that puts Python first.""" + +toolchain = {'name': 'foss', 'version': '2024a'} + +local_six_version = '1.11.0' +# This is specific to a (tagged) release. 
+# Extract from `get_disabled_tests` in tools/stats/import_test_stats.py +local_disabled_tests_S3_ID = 'UsscdNP.2GMOzUxAvqIx8GAj4MuhX1Xi' +source_urls = [GITHUB_RELEASE] +sources = [ + '%(namelower)s-v%(version)s.tar.gz', + { + 'filename': '%(name)s-%(version)s-disabled-tests.json', + 'download_filename': f'disabled-tests-condensed.json?versionId={local_disabled_tests_S3_ID}', + 'source_urls': ['https://ossci-metrics.s3.amazonaws.com'], + # See `DEFAULT_DISABLED_TESTS_FILE` in torch/testing/_internal/common_utils.py + 'extract_cmd': 'cp %s %(builddir)s/pytorch-v%(version)s/test/.pytorch-disabled-tests.json', + }, + { + # Avoid downloading this during the build, see third_party/NNPACK/cmake/DownloadSix.cmake for the version + 'filename': f'six-{local_six_version}.tar.gz', + 'source_urls': ['https://pypi.python.org/packages/source/s/six'], + } +] +patches = [ + 'PyTorch-1.12.1_add-hypothesis-suppression.patch', + 'PyTorch-1.7.0_disable-dev-shm-test.patch', + 'PyTorch-2.0.1_skip-tests-skipped-in-subprocess.patch', + 'PyTorch-2.1.0_remove-test-requiring-online-access.patch', + 'PyTorch-2.6.0_fix-server-in-test_control_plane.patch', + 'PyTorch-2.6.0_show-test-duration.patch', + 'PyTorch-2.6.0_skip-test_segfault.patch', + 'PyTorch-2.7.0_avoid_caffe2_test_cpp_jit.patch', + 'PyTorch-2.7.1_avoid-caffe2-sandcastle-test-lib.patch', + 'PyTorch-2.7.1_skip-test_data_parallel_rnn.patch', + 'PyTorch-2.7.1_skip-test_gds_fails_in_ci.patch', + 'PyTorch-2.7.1_skip-test_mixed_mm_exhaustive_dtypes.patch', + 'PyTorch-2.7.1_skip-test_outside_linear_module_free.patch', + 'PyTorch-2.7.1_suport-64bit-BARs.patch', + 'PyTorch-2.7.1_tolerance-test_partial_flat_weights.patch', + 'PyTorch-2.9.0_disable-test_nan_assert.patch', + 'PyTorch-2.9.0_enable-symbolizer-in-test_workspace_allocation_error.patch', + 'PyTorch-2.9.0_fix-attention-squeeze.patch', + 'PyTorch-2.9.0_fix-FP16-CPU-tests-in-test_torchinductor_opinfo.patch', + 'PyTorch-2.9.0_fix-nccl-test-env.patch', + 'PyTorch-2.9.0_fix-test_exclude_padding.patch', + 'PyTorch-2.9.0_fix-test_version_error.patch', + 'PyTorch-2.9.0_honor-XDG_CACHE_HOME.patch', + 'PyTorch-2.9.0_increase-tolerance-in-test_transformers.patch', + 'PyTorch-2.9.0_remove-faulty-close.patch', + 'PyTorch-2.9.0_revert-pybind11-3-change.patch', + 'PyTorch-2.9.0_skip-test_benchmark_on_non_zero_device.patch', + 'PyTorch-2.9.0_skip-test_convolution1-on-H100.patch', + 'PyTorch-2.9.0_skip-test_inductor_all_gather_into_tensor_coalesced.patch', + 'PyTorch-2.9.0_skip-test_original_aten_preserved_pad_mm.patch', + 'PyTorch-2.9.0_skip-test_override-without-CUDA.patch', + 'PyTorch-2.9.0_skip-tests-requiring-CUDA-12.8.patch', + 'PyTorch-2.9.0_skip-test_unbacked_reduction.patch', + 'PyTorch-2.9.0_skip-unexpected-success-in-test_fake_export.patch', + 'PyTorch-2.9.0_update-exptected-output-for-z3-4.13.0.patch', + 'PyTorch-2.9.1_avoid-multiprocess-tests-hanging-forever.patch', + 'PyTorch-2.9.1_check-device-avail-test_schedule.patch', + 'PyTorch-2.9.1_disable-slow-tests.patch', + 'PyTorch-2.9.1_fix-hypothesis-deadline.patch', + 'PyTorch-2.9.1_fix-iteration-in-fligh-reporter.patch', + 'PyTorch-2.9.1_fix-test_dist2-decorators.patch', + 'PyTorch-2.9.1_fix-TestExportOpInfoCPU-with-single-GPU.patch', + 'PyTorch-2.9.1_GCC14-ARM-workaround.patch', + 'PyTorch-2.9.1_ignore-warning-incompatible-pointer-types.patch', + 'PyTorch-2.9.1_normalize_tree_output.patch', + 'PyTorch-2.9.1_set-test-timeout.patch', + 'PyTorch-2.9.1_skip-flex-attention-test_block_mask_non_divisible.patch', + 
'PyTorch-2.9.1_skip-flex-attention-tests-on-unsupported-cpus.patch', + 'PyTorch-2.9.1_skip-RingFlexAttentionTest.patch', + 'PyTorch-2.9.1_skip-test_dtensor_op_db_nn_functional_multi_head_attention_forward_cpu_float32.patch', + 'PyTorch-2.9.1_skip-tests-requiring-SM90.patch', +] +checksums = [ + {'pytorch-v2.9.1.tar.gz': 'e17504700ebc4c87f9b57059df1c4d790b769458c04db144c7a92aea90f2c92b'}, + {'PyTorch-2.9.1-disabled-tests.json': '471f8aa36e056173d09ffd421ead45539a8d35fec6e61a8a0050d92a5fcd9f04'}, + {'six-1.11.0.tar.gz': '70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9'}, + {'PyTorch-1.12.1_add-hypothesis-suppression.patch': + 'e71ffb94ebe69f580fa70e0de84017058325fdff944866d6bd03463626edc32c'}, + {'PyTorch-1.7.0_disable-dev-shm-test.patch': '622cb1eaeadc06e13128a862d9946bcc1f1edd3d02b259c56a9aecc4d5406b8a'}, + {'PyTorch-2.0.1_skip-tests-skipped-in-subprocess.patch': + '166c134573a95230e39b9ea09ece3ad8072f39d370c9a88fb2a1e24f6aaac2b5'}, + {'PyTorch-2.1.0_remove-test-requiring-online-access.patch': + '35184b8c5a1b10f79e511cc25db3b8a5585a5d58b5d1aa25dd3d250200b14fd7'}, + {'PyTorch-2.6.0_fix-server-in-test_control_plane.patch': + '1337689ff28ecaa8d1d0edf60d322bcdd7846fec040925325d357b19eb6e4342'}, + {'PyTorch-2.6.0_show-test-duration.patch': '5508f2f9619204d9f3c356dbd4000a00d58f452ab2d64ae920eb8bc8b5484d75'}, + {'PyTorch-2.6.0_skip-test_segfault.patch': '26806bd62e6b61b56ebaa52d68ca44c415a28124f684bd2fb373557ada68ef52'}, + {'PyTorch-2.7.0_avoid_caffe2_test_cpp_jit.patch': + '2f3255e067f5c6f0d78b4fbce94784c41bddf3d01bab9673856b0d0bbc4e3fec'}, + {'PyTorch-2.7.1_avoid-caffe2-sandcastle-test-lib.patch': + 'aaf22cb431357dc78e4db895d64febf1c7ee187e8ad27bd13544d011127354d4'}, + {'PyTorch-2.7.1_skip-test_data_parallel_rnn.patch': + 'aa85b678e89db4bb41d2c5f4990f0d05959be92e61918291cb5609685b7f1841'}, + {'PyTorch-2.7.1_skip-test_gds_fails_in_ci.patch': + '503030c3591196510a3c2d95db30b28a0b396adb8b50ff0d221f6bdb1f939935'}, + {'PyTorch-2.7.1_skip-test_mixed_mm_exhaustive_dtypes.patch': + '709288abc802c9eb687c15f2677ebaf408d8325a4cb470d23cb72447ee0b8e13'}, + {'PyTorch-2.7.1_skip-test_outside_linear_module_free.patch': + '4916a256b2b9914e4fdb930681b80df93ea561ddee2fc9978c4973a5650be5e9'}, + {'PyTorch-2.7.1_suport-64bit-BARs.patch': '317c3d220aa87426d86e137a6c1a8f910adf9580ca0848371e0f6800c05dbde1'}, + {'PyTorch-2.7.1_tolerance-test_partial_flat_weights.patch': + 'f304440a57e00b8052a5ffbf285adad8d0fdc5a812a659420b59a20deb5a9942'}, + {'PyTorch-2.9.0_disable-test_nan_assert.patch': '98e9f98ce8fb89ae368739bc039be69040ed446a1c74ee5c2a1ef8ba60986c7d'}, + {'PyTorch-2.9.0_enable-symbolizer-in-test_workspace_allocation_error.patch': + 'ba4032b967c0393c916a26fb2b117ba40670ae8e809cb34399a6379b4e523d72'}, + {'PyTorch-2.9.0_fix-attention-squeeze.patch': '8f040e74780cab391bb4c84f86390a13230e1a309ddf65db9900d9a1c66e1288'}, + {'PyTorch-2.9.0_fix-FP16-CPU-tests-in-test_torchinductor_opinfo.patch': + 'b696d7be8c55ff1ccf8731dccf119b8792cd9593eaff457f37e76114e52346d2'}, + {'PyTorch-2.9.0_fix-nccl-test-env.patch': '9326223c400262788734ec608f6134c5d240f4d5315a8d294179a28f885d6845'}, + {'PyTorch-2.9.0_fix-test_exclude_padding.patch': + '349850874fb75d57a24437d871a4994a773e501632ce66a2adca613380a152dc'}, + {'PyTorch-2.9.0_fix-test_version_error.patch': 'b10bb10d0a353e4ba7dbef28ca5fef03a8ba552896e1982708aa90ab6f24f34f'}, + {'PyTorch-2.9.0_honor-XDG_CACHE_HOME.patch': '239631258431174e4aed8947ae6096e003a3213bfbfa112cd0cdebae89469164'}, + {'PyTorch-2.9.0_increase-tolerance-in-test_transformers.patch': + 
'c27ab34900835c2a15edc26d481343a16433bfa52f635a80cbab252c1320a545'}, + {'PyTorch-2.9.0_remove-faulty-close.patch': '32ca744d68dcfa669e46ced9d2776af3dcc380dd9c3458ba7c1c432e5c5295b3'}, + {'PyTorch-2.9.0_revert-pybind11-3-change.patch': + '5289894011fefc67482b1e19c9d1c502e94a943fc7a2d5ed5a6a1eaf444570a0'}, + {'PyTorch-2.9.0_skip-test_benchmark_on_non_zero_device.patch': + '85e236431d1a5da3fb7fccc2554640898c29f5fab46a41d15b3ab61dd1f924fc'}, + {'PyTorch-2.9.0_skip-test_convolution1-on-H100.patch': + '704750c7cc08b58779907d608cd4b7505043e394fb27530b16d72a0dc27c277e'}, + {'PyTorch-2.9.0_skip-test_inductor_all_gather_into_tensor_coalesced.patch': + '644153d4c1d8267c0631df2902a6dfe8ec2a197f3374f2a2f5654e6bd0edc05e'}, + {'PyTorch-2.9.0_skip-test_original_aten_preserved_pad_mm.patch': + 'ac9e05d296cd5ff938a44662cd022efcc8133c744ca82b045c6a15bc64f67cf4'}, + {'PyTorch-2.9.0_skip-test_override-without-CUDA.patch': + '967512d1487bf1ad06982cc5b976c0b38ba062c3f3473cb4542c4b9ac0740662'}, + {'PyTorch-2.9.0_skip-tests-requiring-CUDA-12.8.patch': + '6d79aff5291627b86d8fea025bf2379e4065c7d9cbef5cf83452c35922848728'}, + {'PyTorch-2.9.0_skip-test_unbacked_reduction.patch': + 'b51dd5d7c9cfeed946cbc5c7fc22f2e78e1fa52dda55569b957c20ca4ed01fe8'}, + {'PyTorch-2.9.0_skip-unexpected-success-in-test_fake_export.patch': + '2e73f71ea0f09e613cc4a108893e7948b6daf239e3fe42fd2d3ae5d43c3cf9de'}, + {'PyTorch-2.9.0_update-exptected-output-for-z3-4.13.0.patch': + '5c68e0de73212ed266879f4528a6041ef7ab2f1ac83c6cf7142c4baa78e7664c'}, + {'PyTorch-2.9.1_avoid-multiprocess-tests-hanging-forever.patch': + '86ce380e69b3b20e010d817889cb1b825b05b4054a045b00f2ac12161b77d7e4'}, + {'PyTorch-2.9.1_check-device-avail-test_schedule.patch': + '64c28d38ce69147565509add36d310473ce46f14a0a876d38b5049cb7fce9817'}, + {'PyTorch-2.9.1_disable-slow-tests.patch': '76e6d8f7366b91a0ddc65f73685f2b09988bb5537d10d294f9bb6a48c7fec3d0'}, + {'PyTorch-2.9.1_fix-hypothesis-deadline.patch': 'f7a130669eee9924a303df9e2bd5743ff023a7d994b7a3e43c86dcccf0206c49'}, + {'PyTorch-2.9.1_fix-iteration-in-fligh-reporter.patch': + 'ab408275ec66e836112a50054acc4e789ef38196efeb6137c6061d60d9ac9ead'}, + {'PyTorch-2.9.1_fix-test_dist2-decorators.patch': + 'bf4ed805f00775ed33351de7bce40ebf4eac16aff6c61d2e91790982bc43d73b'}, + {'PyTorch-2.9.1_fix-TestExportOpInfoCPU-with-single-GPU.patch': + 'bdddf5a9ba47d57ec96f4bbefc3b85c4904e44de93dc5c7a65bc03e343035ae9'}, + {'PyTorch-2.9.1_GCC14-ARM-workaround.patch': 'ea8a8662e20fae2fb3a74c7f8bf390aba80a598ab37f9131c720d25ebb14965d'}, + {'PyTorch-2.9.1_ignore-warning-incompatible-pointer-types.patch': + 'c4dad43a5d76e292bb0cada56ea05e8cbd522e3e83749cf3b2c15cd1e4ff6d7b'}, + {'PyTorch-2.9.1_normalize_tree_output.patch': '7d5994580339b73c28de595d9e5a0448db97b7d284f17efd18909e4613d170df'}, + {'PyTorch-2.9.1_set-test-timeout.patch': '15fa1149c250b1333b0bc491f659aaf89d5d6eaf6df5ebc81eea545478c1239c'}, + {'PyTorch-2.9.1_skip-flex-attention-test_block_mask_non_divisible.patch': + 'd8489c192da549083569e09e5f94d2a83c9e41e111b1322f86512a9c5a58c0d9'}, + {'PyTorch-2.9.1_skip-flex-attention-tests-on-unsupported-cpus.patch': + 'e544f765beac7bdb3fc0ada98a3f92fd7e511ed8874de085aa2f213cca769d40'}, + {'PyTorch-2.9.1_skip-RingFlexAttentionTest.patch': + '3cf0b11136fb18c45072687eafd3024d91b504d231a4fa40e04bc62d8d6019c7'}, + {'PyTorch-2.9.1_skip-test_dtensor_op_db_nn_functional_multi_head_attention_forward_cpu_float32.patch': + 'e57486cc42f3dbcae29753168febc251d070a283229e2d76ccbdf19fee53f06e'}, + {'PyTorch-2.9.1_skip-tests-requiring-SM90.patch': + 
'7db02152db2ae70c0fd4c4602fe381e26a74b8e4f7b16b1a3554b2353d761b10'}, +] + +osdependencies = [OS_PKG_IBVERBS_DEV] + +builddependencies = [ + ('CMake', '3.29.3'), + ('hypothesis', '6.103.1'), + ('setuptools', '80.9.0'), + # For tests + ('parameterized', '0.9.0'), + ('pytest-flakefinder', '1.1.0'), + ('pytest-rerunfailures', '15.0'), + ('pytest-shard', '0.1.2'), + ('pytest-subtests', '0.13.1'), + ('tlparse', '0.4.0'), + ('optree', '0.14.1'), + ('unittest-xml-reporting', '3.1.0'), +] + +dependencies = [ + ('CUDA', '12.6.0', '', SYSTEM), + # PyTorch is very sensitive to the NCCL & cuDNN versions. (Maybe the same for cuSPARSELt) + # Prefer those (listed per CUDA version) in + # https://github.com/pytorch/pytorch/blob/main/.github/scripts/generate_binary_build_matrix.py + # or https://github.com/pytorch/pytorch/blob/main/.ci/docker/common/install_cuda.sh + ('NCCL', '2.27.5', versionsuffix), + ('cuDNN', '9.10.2.21', versionsuffix, SYSTEM), + ('magma', '2.9.0', versionsuffix), + ('cuSPARSELt', '0.6.3.2', versionsuffix, SYSTEM), + # Version from .ci/docker/triton_version.txt + ('Triton', '3.5.0', versionsuffix), + ('Ninja', '1.12.1'), # Required for JIT compilation of C++ extensions + ('Python', '3.12.3'), + ('Python-bundle-PyPI', '2024.06'), + ('expecttest', '0.2.1'), + ('GMP', '6.3.0'), + ('MPFR', '4.2.1'), + ('networkx', '3.4.2'), + ('numactl', '2.0.18'), + ('Pillow', '10.4.0'), + ('protobuf-python', '5.28.0'), + ('protobuf', '28.0'), + ('pybind11', '2.12.0'), + ('PuLP', '2.8.0'), + ('PyYAML', '6.0.2'), + ('pyzstd', '0.16.2'), + ('SciPy-bundle', '2024.05'), + ('sympy', '1.13.3'), + ('Z3', '4.13.0',), +] + +prebuildopts = (f"""sed -i '1i set(PYTHON_SIX_SOURCE_DIR "%(builddir)s/six-{local_six_version}")' """ + "cmake/Dependencies.cmake && ") +buildcmd = '%(python)s setup.py build' # Run the (long) build in the build step + +excluded_tests = { + '': [ + # This test seems to take too long on NVIDIA Ampere at least. + 'distributed/test_distributed_spawn', + # no xdoctest + 'doctests', + # intermittent failures on various systems + # See https://github.com/easybuilders/easybuild-easyconfigs/issues/17712 + 'distributed/rpc/test_tensorpipe_agent', + # This test is expected to fail when run in their CI, but won't in our case. 
+ # It just checks for a "CI" env variable + 'test_ci_sanity_check_fail', + # Requires pwlf Python package + 'distributed/_tools/test_sac_ilp', 'distributed/_tools/test_sac_estimator', + # 9 failures in H100, 7 are present in PYPI package, 2 are related to GC in Python < 3.12.4 + 'dynamo/test_dynamic_shapes', + # Broken test: https://github.com/pytorch/pytorch/issues/162179 + 'distributed/_composable/fsdp/test_fully_shard_logging', + # Broken: https://github.com/pytorch/pytorch/issues/137027 + 'inductor/test_extension_backend', + # Requires optional Python packages + 'test_public_bindings', + # 1 Failure and not important + 'dynamo/test_utils', + # Packaging test only, not important for us + 'test_license', + ] +} + +runtest = ( + # Disable symbol resolution in stack traces that can cause hangs and slowdowns + ' TORCH_DISABLE_ADDR2LINE=1' + ' TORCHINDUCTOR_CUTLASS_DIR=%(start_dir)s/third_party/cutlass' + ' PYTEST_ADDOPTS=--full-trace' + ' PYTHONUNBUFFERED=1' + ' %(python)s test/run_test.py' + ' --continue-through-error --pipe-logs --verbose' + ' %(excluded_tests)s' +) + +postinstallcmds = [ + "mkdir %(installdir)s/extra", + "cp -r third_party/cutlass %(installdir)s/extra/", +] + +modextrapaths = {'TORCHINDUCTOR_CUTLASS_DIR': 'extra/cutlass'} + +tests = ['PyTorch-check-cpp-extension.py', 'PyTorch-check-cutlass.py'] + +moduleclass = 'ai' diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_GCC14-ARM-workaround.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_GCC14-ARM-workaround.patch new file mode 100644 index 00000000000..e0504c90d06 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_GCC14-ARM-workaround.patch @@ -0,0 +1,53 @@ +From 8fd509399e25cb4b265dff663d3f777406001f2e Mon Sep 17 00:00:00 2001 +From: Nikita Shulga <2453524+malfet@users.noreply.github.com> +Date: Tue, 10 Feb 2026 04:35:39 +0000 +Subject: [PATCH] Blunter GCC 14.2.0 workaround for SVE compilation (#174647) + +Updated preprocessor directive for GCC version check and removed BF16 condition. I.e. 
right now SVE256 compilation with gcc-14.2 on Debian13 for ` -march=armv8-a+sve+bf16` + +Without the fix, compilation fails with +``` +In file included from /home/dev/git/pytorch/pytorch/build/aten/src/ATen/native/cpu/Unfold2d.cpp.SVE256.cpp:1: +/home/dev/git/pytorch/pytorch/aten/src/ATen/native/cpu/Unfold2d.cpp: In function 'void at::native::{anonymous}::unfolded2d_acc_kernel(c10::ScalarType, void*, void*, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, bool)': +/home/dev/git/pytorch/pytorch/aten/src/ATen/native/cpu/Unfold2d.cpp:225:1: error: unrecognizable insn: + 225 | } + | ^ +(insn 1371 1370 1372 101 (set (reg:VNx16BI 3235) + (unspec:VNx16BI [ + (reg:VNx16BI 3232) + (reg:VNx8BI 3234) + (const_vector:VNx4BI [ + (const_int 0 [0]) repeated x8 + ]) + ] UNSPEC_TRN1_CONV)) "/home/dev/git/pytorch/pytorch/torch/headeronly/util/bit_cast.h":40:14 -1 + (nil)) +during RTL pass: vregs +/home/dev/git/pytorch/pytorch/aten/src/ATen/native/cpu/Unfold2d.cpp:225:1: internal compiler error: in extract_insn, at recog.cc:2812 +``` + +Not sure what compelled me to put such a narrow restriction in https://github.com/pytorch/pytorch/pull/157867 + +Fixes https://github.com/pytorch/pytorch/issues/172630 + +Pull Request resolved: https://github.com/pytorch/pytorch/pull/174647 +Approved by: https://github.com/seemethere +--- + aten/src/ATen/native/cpu/Unfold2d.cpp | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/aten/src/ATen/native/cpu/Unfold2d.cpp b/aten/src/ATen/native/cpu/Unfold2d.cpp +index ed69998e99f79..9ae1391e2603e 100644 +--- a/aten/src/ATen/native/cpu/Unfold2d.cpp ++++ b/aten/src/ATen/native/cpu/Unfold2d.cpp +@@ -169,8 +169,9 @@ void unfolded2d_acc_channels_last( + + /* note: due to write issues, this one cannot be parallelized as well as + * unfolded2d_copy */ +-#if defined(__GNUC__) && __GNUC__ == 14 && defined(__ARM_FEATURE_SVE) && !defined(__ARM_FEATURE_BF16) +-// Workaround for gcc-14.2.0 ICE during RTL pass: vregs when compiling for SVE without BF16 ++#if defined(__GNUC__) && __GNUC__ == 14 && defined(__ARM_FEATURE_SVE) ++// Workaround for gcc-14.2.0 ICE during RTL pass: vregs when compiling for SVE ++// NS: With or without BF16, see https://github.com/pytorch/pytorch/issues/172630 + __attribute__((optimize("no-tree-vectorize"))) + #endif + void unfolded2d_acc_kernel( diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_avoid-multiprocess-tests-hanging-forever.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_avoid-multiprocess-tests-hanging-forever.patch new file mode 100644 index 00000000000..75e8fa00ca0 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_avoid-multiprocess-tests-hanging-forever.patch @@ -0,0 +1,57 @@ +A crashed child process in a test might cause the parent to never complete. +Use a timeout to avoid that. +See https://github.com/pytorch/pytorch/pull/171972 + +Author: Alexander Grund (TU Dresden) + +diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py +index c1f75697fe8..47661c7a1fa 100644 +--- a/torch/testing/_internal/common_distributed.py ++++ b/torch/testing/_internal/common_distributed.py +@@ -621,6 +621,33 @@ def cleanup_temp_dir() -> None: + tmp_dir.cleanup() + + ++def retrieve_result_from_process_queue( ++ process: torch.multiprocessing.Process, ++ completion_queue: torch.multiprocessing.Queue, ++ timeout: Optional[int] = None, ++) -> Any: ++ """Get result from queue associated with process. 
++ ++ When the process finished without putting a result or the timeout expired an exception instance will be returned""" ++ queue_timeout = 120 if timeout is None else max(10, min(120, timeout // 4)) ++ start_time = time.time() ++ # Periodically check the process for liveness ++ while True: ++ try: ++ return completion_queue.get(timeout=queue_timeout) ++ except queue.Empty: ++ # If not alive do a last check because the timeout might have happened just before completion ++ if not process.is_alive() and completion_queue.empty(): ++ # Clean up process to avoid keeping a zombie process ++ process.terminate() # Just to be sure ++ process.join(600) # Usually completes immediately ++ return RuntimeError(f"Exited with {process.exitcode}") ++ if timeout is not None: ++ elapsed = time.time() - start_time ++ if elapsed > timeout: ++ return RuntimeError(f"Process timeout out after {elapsed}s") ++ ++ + # Most tests operate with this worldsize + DEFAULT_WORLD_SIZE = 4 + +@@ -1786,8 +1813,10 @@ class MultiProcContinuousTest(TestCase): + if self.rank == self.MAIN_PROCESS_RANK: + logger.debug(f"Waiting for workers to finish {self.id()}") # noqa: G004 + # Wait for the workers to finish the test +- for i, completion_queue in enumerate(self.completion_queues): +- rv = completion_queue.get() ++ for i, (p, completion_queue) in enumerate( ++ zip(self.processes, self.completion_queues) ++ ): ++ rv = retrieve_result_from_process_queue(p, completion_queue) + if isinstance(rv, BaseException): + # Hit an exception, re-raise it in the main process. + logger.warning( diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_check-device-avail-test_schedule.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_check-device-avail-test_schedule.patch new file mode 100644 index 00000000000..202d1e4a1fc --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_check-device-avail-test_schedule.patch @@ -0,0 +1,19 @@ +Some tests fail if no accelerator is available. +> RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU [...] + +Check for availability to trigger CPU fallback. + +Author: Alexander Grund (TU Dresden) +diff --git a/test/distributed/pipelining/test_schedule.py b/test/distributed/pipelining/test_schedule.py +index dabf3d78a6f..d3b8bf13168 100644 +--- a/test/distributed/pipelining/test_schedule.py ++++ b/test/distributed/pipelining/test_schedule.py +@@ -53,7 +53,7 @@ from torch.testing._internal.distributed.fake_pg import FakeStore + + ARTIFACTS_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "artifacts") + +-device = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu" ++device = acc.type if (acc := torch.accelerator.current_accelerator(check_available=True)) else "cpu" + logger = logging.getLogger(__name__) + torch.manual_seed(0) + diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_disable-slow-tests.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_disable-slow-tests.patch new file mode 100644 index 00000000000..8f6d6e0c767 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_disable-slow-tests.patch @@ -0,0 +1,40 @@ +On CI it defaults to importing JSON files with slow and disabled tests. +Those are then skipped upon execution. + +Enable the default for non-CI environments to cut down testing time. +Don't check for SANDCASTLE when determining whether to skip disabled tests. +However, the disabled-tests JSON file needs to be downloaded from S3 and placed at "tests/.pytorch-disabled-tests.json". 
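+
+As an editorial illustration (not part of this patch) of one way to provide that file when no
+CI download is available: the exact schema of the condensed disabled-tests JSON is an assumption
+inferred from the unpacking patched below (test id mapped to PR number, issue URL and affected
+platforms), and an empty mapping is always a safe fallback that simply disables nothing.
+
+    import json
+    from pathlib import Path
+
+    # Path taken from the description above.
+    disabled = Path("tests/.pytorch-disabled-tests.json")
+    disabled.parent.mkdir(parents=True, exist_ok=True)
+    # Assumed entry shape (PR number, issue URL, platforms), matching the tuple unpacking below:
+    # {"test_foo (__main__.TestBar)": ["12345", "https://github.com/pytorch/pytorch/issues/...", ["linux"]]}
+    disabled.write_text(json.dumps({}))  # empty mapping: no tests are disabled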
+ +Author: Alexander Grund (TU Dresden) + +diff --git a/test/run_test.py b/test/run_test.py +index 44a15d4ab2c..269d4206f3e 100755 +--- a/test/run_test.py ++++ b/test/run_test.py +@@ -531,7 +531,7 @@ def run_test( + + # NB: These features are not available for C++ tests, but there is little incentive + # to implement it because we have never seen a flaky C++ test before. +- if IS_CI and not is_cpp_test: ++ if not is_cpp_test: + ci_args = ["--import-slow-tests", "--import-disabled-tests"] + if RERUN_DISABLED_TESTS: + ci_args.append("--rerun-disabled-tests") +diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py +index bfc568bc146..7ef37cccccb 100644 +--- a/torch/testing/_internal/common_utils.py ++++ b/torch/testing/_internal/common_utils.py +@@ -2722,11 +2722,11 @@ def check_if_enable(test: unittest.TestCase): + if not TEST_WITH_SLOW: + raise unittest.SkipTest("test is slow; run with PYTORCH_TEST_WITH_SLOW to enable test") + +- if not IS_SANDCASTLE: ++ if True: + should_skip = False + skip_msg = "" + +- for disabled_test, (issue_url, platforms) in disabled_tests_dict.items(): ++ for disabled_test, (pr_num, issue_url, platforms) in disabled_tests_dict.items(): + if matches_test(disabled_test): + platform_to_conditional: dict = { + "mac": IS_MACOS, diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-TestExportOpInfoCPU-with-single-GPU.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-TestExportOpInfoCPU-with-single-GPU.patch new file mode 100644 index 00000000000..ebdfb00e0a3 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-TestExportOpInfoCPU-with-single-GPU.patch @@ -0,0 +1,20 @@ +Fixes a failure on systems with a single GPU. +Error in `init_gpu_context` (fake_tensor.py:744): +> E torch.AcceleratorError: CUDA error: invalid device ordinal + +See: https://github.com/pytorch/pytorch/pull/164184 + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/export/test_export_opinfo.py b/test/export/test_export_opinfo.py +--- a/test/export/test_export_opinfo.py ++++ b/test/export/test_export_opinfo.py +@@ -79,7 +79,7 @@ def _test_export_helper(self, dtype, op): + mode = FakeTensorMode(allow_non_fake_inputs=True) + converter = mode.fake_tensor_converter + # intentionally avoid cuda:0 to flush out some bugs +- target_device = "cuda:1" ++ target_device = "cuda:0" + + def to_fake_device(x): + x = converter.from_real_tensor(mode, x) diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-hypothesis-deadline.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-hypothesis-deadline.patch new file mode 100644 index 00000000000..c526ea336c1 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-hypothesis-deadline.patch @@ -0,0 +1,67 @@ +The assertion at the bottom sometimes fails. + +From c4312b443fed1fd8e0e28dfe049ce61226936e99 Mon Sep 17 00:00:00 2001 +From: FFFrog +Date: Thu, 25 Sep 2025 16:32:19 +0800 +Subject: [PATCH] [Tools] Adapting the Hypothesis library (version 5.x) for use + with the PyTorch framework (#163748) + +Starting from version 5.x, the Hypothesis library removed the timeout setting and only retained the deadline. 
+Pull Request resolved: https://github.com/pytorch/pytorch/pull/163748 +Approved by: https://github.com/albanD, https://github.com/Skylion007 +--- + torch/testing/_internal/hypothesis_utils.py | 24 +++++++++++++++------ + 1 file changed, 18 insertions(+), 6 deletions(-) + +diff --git a/torch/testing/_internal/hypothesis_utils.py b/torch/testing/_internal/hypothesis_utils.py +index f02ef4c9e04b0..a00e1e1a048a0 100644 +--- a/torch/testing/_internal/hypothesis_utils.py ++++ b/torch/testing/_internal/hypothesis_utils.py +@@ -7,6 +7,7 @@ + + import hypothesis + from functools import reduce ++from importlib.metadata import version + from hypothesis import assume + from hypothesis import settings + from hypothesis import strategies as st +@@ -346,22 +347,33 @@ def tensor_conv( + + return X, W, b, groups, tr + ++ + # We set the deadline in the currently loaded profile. + # Creating (and loading) a separate profile overrides any settings the user + # already specified. +-hypothesis_version = hypothesis.version.__version_info__ +-current_settings = settings._profiles[settings._current_profile].__dict__ +-current_settings['deadline'] = None +-if hypothesis_version >= (3, 16, 0) and hypothesis_version < (5, 0, 0): +- current_settings['timeout'] = hypothesis.unlimited ++hypothesis_version = tuple(map(int, version("hypothesis").split(".")[:3])) ++ ++if (3, 16, 0) <= hypothesis_version < (3, 27, 0): ++ # Hypothesis 3.16 → 3.26: use `timeout` instead of `deadline` ++ settings.register_profile("no_deadline", timeout=hypothesis.unlimited) ++else: ++ # Hypothesis >=3.27: use `deadline=None` ++ settings.register_profile("no_deadline", deadline=None) ++ ++# Activate the profile ++settings.load_profile("no_deadline") ++ ++ + def assert_deadline_disabled(): ++ """Check that deadlines are effectively disabled across Hypothesis versions.""" + if hypothesis_version < (3, 27, 0): + import warnings ++ + warning_message = ( + "Your version of hypothesis is outdated. " + "To avoid `DeadlineExceeded` errors, please update. " + f"Current hypothesis version: {hypothesis.__version__}" + ) +- warnings.warn(warning_message) ++ warnings.warn(warning_message, stacklevel=2) + else: + assert settings().deadline is None diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-iteration-in-fligh-reporter.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-iteration-in-fligh-reporter.patch new file mode 100644 index 00000000000..3ff313cbe12 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-iteration-in-fligh-reporter.patch @@ -0,0 +1,17 @@ +Avoid an error caused by modifying dict while iterating it. 
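+
+As a minimal editorial illustration (not part of the patch itself): CPython raises
+"RuntimeError: dictionary changed size during iteration" when entries are added to a dict
+while one of its live views is being iterated; snapshotting with list(), as done for
+globals().values() below, avoids that.
+
+    d = {"a": 1}
+    try:
+        for _ in d.values():
+            d["b"] = 2          # mutating the dict during iteration
+    except RuntimeError as err:
+        print(err)              # dictionary changed size during iteration
+
+    for _ in list(d.values()):  # iterate over a snapshot instead
+        d["c"] = 3              # safe: the copy is unaffected by the mutation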
+ +Author: Alexander Grund (TU Dresden) + +diff --git a/tools/flight_recorder/components/types.py b/tools/flight_recorder/components/types.py +index 20e093688ba..98192aeb92c 100644 +--- a/tools/flight_recorder/components/types.py ++++ b/tools/flight_recorder/components/types.py +@@ -164,7 +164,7 @@ class Database(NamedTuple): + # TODO: We need to add a schema for the following + types = [ + TypeInfo.from_type(t) # type: ignore[type-var] +- for t in globals().values() ++ for t in list(globals().values()) + if ( + isinstance(t, type) + and issubclass(t, tuple) diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-test_dist2-decorators.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-test_dist2-decorators.patch new file mode 100644 index 00000000000..fffd633b451 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-test_dist2-decorators.patch @@ -0,0 +1,62 @@ +The requires_gloo/requires_nccl decorator cause the function to just return. +In the way they are used this skips the initialization done by a helper function. +So the test is not skipped and then fails due to missing variables. + +Decorate the class instead. + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/distributed/test_dist2.py b/test/distributed/test_dist2.py +index b335eff1c21..ff5a1e8c028 100644 +--- a/test/distributed/test_dist2.py ++++ b/test/distributed/test_dist2.py +@@ -256,10 +256,10 @@ class Dist2MultiProcessTestCase(MultiProcessTestCase): + self.assertEqual(merged_pg.group_name, "merged_pg") + + ++@requires_gloo() + class ProcessGroupGlooTest(Dist2MultiProcessTestCase): + device = torch.device("cpu") + +- @requires_gloo() + def new_group(self) -> torch.distributed.ProcessGroup: + os.environ["RANK"] = str(self.rank) + os.environ["WORLD_SIZE"] = str(self.world_size) +@@ -273,8 +273,8 @@ class ProcessGroupGlooTest(Dist2MultiProcessTestCase): + ) + + ++@requires_nccl() + class ProcessGroupNCCLTest(Dist2MultiProcessTestCase): +- @requires_nccl() + @skip_if_lt_x_gpu(2) + def new_group(self) -> torch.distributed.ProcessGroup: + os.environ["RANK"] = str(self.rank) +diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py +index c1f75697fe8..d513510d955 100644 +--- a/torch/testing/_internal/common_distributed.py ++++ b/torch/testing/_internal/common_distributed.py +@@ -331,10 +331,7 @@ def with_dist_debug_levels(levels): + + + def requires_gloo(): +- return skip_but_pass_in_sandcastle_if( +- not c10d.is_gloo_available(), +- "c10d was not compiled with the Gloo backend", +- ) ++ return unittest.skipUnless(c10d.is_gloo_available(), "c10d was not compiled with the Gloo backend") + + + def requires_nccl_version(version, msg): +@@ -361,10 +358,7 @@ def requires_nccl_version(version, msg): + + + def requires_nccl(): +- return skip_but_pass_in_sandcastle_if( +- not c10d.is_nccl_available(), +- "c10d was not compiled with the NCCL backend", +- ) ++ return unittest.skipUnless(c10d.is_nccl_available(), "c10d was not compiled with the NCCL backend") + + + def requires_ucc(): diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_ignore-warning-incompatible-pointer-types.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_ignore-warning-incompatible-pointer-types.patch new file mode 100644 index 00000000000..cebc1478b59 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_ignore-warning-incompatible-pointer-types.patch @@ -0,0 +1,22 @@ +Silence a warning that fails builds with GCC 14, especially in XNNPACK. 
+See https://github.com/pytorch/pytorch/pull/166873 + +Applied more broadly as we don't care about warnings anyway. + +Author: Alexander Grund (TU Dresden) + +diff --git a/CMakeLists.txt b/CMakeLists.txt +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -852,6 +852,11 @@ if(MSVC) + append_cxx_flag_if_supported("/utf-8" CMAKE_CXX_FLAGS) + endif() + ++if(CMAKE_C_COMPILER_ID STREQUAL "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL "14") ++ string(APPEND CMAKE_C_FLAGS " -Wno-incompatible-pointer-types") ++endif() ++ ++ + # Note for ROCM platform: 1. USE_ROCM is always ON until + # include(cmake/Dependencies.cmake) 2. USE_CUDA will become OFF during + # re-configuration Truth Table: CUDA 1st pass: USE_CUDA=True;USE_ROCM=True, diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_normalize_tree_output.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_normalize_tree_output.patch new file mode 100644 index 00000000000..4c708a216cb --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_normalize_tree_output.patch @@ -0,0 +1,24 @@ +Avoid failure in TestProfilerTree.test_profiler_experimental_tree_with_stack_and_modules +with diff: +> - +> + + +See https://github.com/pytorch/pytorch/pull/174768 + +Author: Alexander Grund (TU Dresden) +diff --git a/test/profiler/test_profiler_tree.py b/test/profiler/test_profiler_tree.py +index 670e639c98e..e53fd93b273 100644 +--- a/test/profiler/test_profiler_tree.py ++++ b/test/profiler/test_profiler_tree.py +@@ -240,6 +240,11 @@ class TestProfilerTree(TestCase): + # simply coerce them into a platform independent form. If you made a + # change in the codebase which changes the trace produced, simply use + # EXPECTTEST_ACCEPT=1 to update the tests to reflect the new structure. ++ def normalize(tree): ++ return re.sub(r'of pybind11\w+ object at', 'of PyCapsule object at', tree) ++ ++ actual = normalize(actual) ++ expected = normalize(expected) + + # expecttest will not show the diff view if `len(actual) < len(expected)` + if not expecttest.ACCEPT: diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_set-test-timeout.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_set-test-timeout.patch new file mode 100644 index 00000000000..6bfff62d3d1 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_set-test-timeout.patch @@ -0,0 +1,19 @@ +Some tests might hang forever and the default timeout will only be set when +a) --enable-timeout is passed, and +b) a `.additional_ci_files/test-times.json` exists at the root + +Manually set a timeout of 120min which should be enough for any single test. + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/run_test.py b/test/run_test.py +--- a/test/run_test.py ++++ b/test/run_test.py +@@ -604,6 +604,7 @@ def run_test( + if is_cpp_test + else None + ) ++ timeout = 60 * 120 + print_to_stderr(f"Executing {command} ... [{datetime.now()}]") + + with ExitStack() as stack: diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-RingFlexAttentionTest.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-RingFlexAttentionTest.patch new file mode 100644 index 00000000000..7855d55ddaf --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-RingFlexAttentionTest.patch @@ -0,0 +1,23 @@ +test_ring_flex_attention and test_ring_flex_attention_mask both fail in similar ways: + +> torch._dynamo.exc.Unsupported: Attempted to call function marked as skipped +> ... 
+> Developer debug context: module: _warnings, qualname: warn, skip reason: + +See https://github.com/pytorch/pytorch/pull/161667#issuecomment-3298676991 + & https://github.com/pytorch/pytorch/issues/162843 + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/distributed/tensor/test_attention.py b/test/distributed/tensor/test_attention.py +index a2543d443e4..a28fb45e992 100644 +--- a/test/distributed/tensor/test_attention.py ++++ b/test/distributed/tensor/test_attention.py +@@ -531,6 +531,7 @@ def generate_doc_mask_mod( + return doc_mask_mod + + ++@unittest.skip("FAILS") + class RingFlexAttentionTest(DTensorTestBase): + @property + def world_size(self) -> int: diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-flex-attention-test_block_mask_non_divisible.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-flex-attention-test_block_mask_non_divisible.patch new file mode 100644 index 00000000000..5e26591c68c --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-flex-attention-test_block_mask_non_divisible.patch @@ -0,0 +1,17 @@ +This test shows segfaults, at least on some system. +PyTorch CI HUD indicates some failures with it are known. + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/inductor/test_flex_attention.py b/test/inductor/test_flex_attention.py +index 740faa0b375..ea5e311b7cd 100644 +--- a/test/inductor/test_flex_attention.py ++++ b/test/inductor/test_flex_attention.py +@@ -3474,6 +3474,7 @@ def forward(self, arg0_1, arg1_1, arg2_1, arg3_1, arg4_1): + ) + FileCheck().check("BLOCK_M : tl.constexpr = 16").run(code[0]) + ++ @unittest.skip("Segfaults on CPU") + @supported_platform + def test_block_mask_non_divisible(self, device): + seq = torch.arange(1023, device=device) // 128 diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-flex-attention-tests-on-unsupported-cpus.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-flex-attention-tests-on-unsupported-cpus.patch new file mode 100644 index 00000000000..a6ec831fb1c --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-flex-attention-tests-on-unsupported-cpus.patch @@ -0,0 +1,97 @@ +FlexAttention is only supported on AVX2 CPUs. +However the tests are run on CPU unconditionally when CUDA devices are available leading to: +> torch._inductor.exc.InductorError: LoweringException: NotImplementedError: torch.compile on current platform is not supported for CPU. + +Add a condition to possibly only add CUDA tests. 
+See https://github.com/pytorch/pytorch/pull/174881 + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/inductor/test_flex_attention.py b/test/inductor/test_flex_attention.py +index 740faa0b375..e698939d326 100644 +--- a/test/inductor/test_flex_attention.py ++++ b/test/inductor/test_flex_attention.py +@@ -48,6 +48,9 @@ from torch.testing._internal.common_device_type import ( + dtypesIfXPU, + flex_attention_supported_platform as supported_platform, + instantiate_device_type_tests, ++ IS_FLEX_ATTENTION_CPU_PLATFORM_SUPPORTED as TEST_ON_CPU, ++ IS_FLEX_ATTENTION_CUDA_PLATFORM_SUPPORTED as TEST_ON_CUDA, ++ IS_FLEX_ATTENTION_XPU_PLATFORM_SUPPORTED as TEST_ON_XPU, + largeTensorTest, + skipCPUIf, + skipCUDAIf, +@@ -177,25 +180,21 @@ class DeviceConfig: + dtypes_fast: list[torch.dtype] + + +-TEST_ON_CUDA = ( +- torch.cuda.is_available() +- and torch.utils._triton.has_triton() +- and torch.cuda.get_device_capability() >= (8, 0) +-) +-TEST_ON_XPU = torch.xpu.is_available() and torch.utils._triton.has_triton() +- + device_configs = {} ++# Tests are skipped when no device is supported, so CPU as default is safe ++test_device = ("cpu",) + if HAS_GPU: + if TEST_ON_CUDA: +- test_device = ( +- "cuda", +- "cpu", +- ) ++ if TEST_ON_CPU: ++ test_device = ( ++ "cuda", ++ "cpu", ++ ) ++ else: ++ test_device = ("cuda",) + elif TEST_ON_XPU: + torch._C._set_onednn_allow_tf32(True) + test_device = ("xpu",) +-else: +- test_device = ("cpu",) + + + class SubstringSet: +diff --git a/torch/testing/_internal/common_device_type.py b/torch/testing/_internal/common_device_type.py +index 8971eca1bb2..6b14f9db105 100644 +--- a/torch/testing/_internal/common_device_type.py ++++ b/torch/testing/_internal/common_device_type.py +@@ -1972,23 +1972,25 @@ def get_all_device_types() -> list[str]: + + # skip since currently flex attention requires at least `avx2` support on CPU. 
+ IS_FLEX_ATTENTION_CPU_PLATFORM_SUPPORTED = ( +- not torch.xpu.is_available() +- and not torch.cuda.is_available() +- and not IS_MACOS ++ not IS_MACOS + and torch.cpu._is_avx2_supported() + and os.getenv("ATEN_CPU_CAPABILITY") != "default" + ) + IS_FLEX_ATTENTION_XPU_PLATFORM_SUPPORTED = ( + torch.xpu.is_available() and torch.utils._triton.has_triton() + ) ++IS_FLEX_ATTENTION_CUDA_PLATFORM_SUPPORTED = ( ++ torch.cuda.is_available() ++ and torch.utils._triton.has_triton() ++ and torch.cuda.get_device_capability() >= (8, 0) ++) + flex_attention_supported_platform = unittest.skipUnless( + IS_FLEX_ATTENTION_XPU_PLATFORM_SUPPORTED +- or IS_FLEX_ATTENTION_CPU_PLATFORM_SUPPORTED +- or ( +- torch.cuda.is_available() +- and torch.utils._triton.has_triton() +- and torch.cuda.get_device_capability() >= (8, 0) +- ), ++ or (IS_FLEX_ATTENTION_CPU_PLATFORM_SUPPORTED ++ and not torch.xpu.is_available() ++ and not torch.cuda.is_available() ++ ) ++ or IS_FLEX_ATTENTION_CUDA_PLATFORM_SUPPORTED, + "Requires CUDA and Triton, Intel GPU and triton, or CPU with avx2 and later", + ) + if torch.version.hip and "gfx94" in torch.cuda.get_device_properties(0).gcnArchName: diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-test_dtensor_op_db_nn_functional_multi_head_attention_forward_cpu_float32.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-test_dtensor_op_db_nn_functional_multi_head_attention_forward_cpu_float32.patch new file mode 100644 index 00000000000..3c5dd5523dc --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-test_dtensor_op_db_nn_functional_multi_head_attention_forward_cpu_float32.patch @@ -0,0 +1,12 @@ +diff --git a/test/distributed/tensor/test_dtensor_ops.py b/test/distributed/tensor/test_dtensor_ops.py +index 8c650f6b0ce..04cfa7d4cc2 100644 +--- a/test/distributed/tensor/test_dtensor_ops.py ++++ b/test/distributed/tensor/test_dtensor_ops.py +@@ -463,6 +463,7 @@ dtensor_fails = { + skip("nn.functional.feature_alpha_dropout", "without_train"), + skip("nn.functional.hinge_embedding_loss"), + skip("nn.functional.cosine_embedding_loss"), ++ skip("nn.functional.multi_head_attention_forward"), # randomness + skip("fft.hfft"), + skip("fft.hfft2"), + skip("fft.hfft2"), diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-tests-requiring-SM90.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-tests-requiring-SM90.patch new file mode 100644 index 00000000000..4dea63b7e5f --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-tests-requiring-SM90.patch @@ -0,0 +1,85 @@ +Avoid test_intra_node_comm_all_reduce failing on e.g. A100: + +> [rank1]:E1022 09:55:08.823000 3580472 torch/testing/_internal/common_distributed.py:721] RuntimeError: CUDA error: device-side assert triggered... +> [rank1]:E1022 09:55:08.823000 3580472 torch/testing/_internal/common_distributed.py:721] exiting process 1 with exit code: 10 +> ... +> :318: st_vec: block: [0,0,0], thread: [87,0,0] Assertion `false` failed. +> /pytorch-v2.7.1/torch/csrc/distributed/c10d/CUDASymmetricMemory-inl.h:318: st_vec: block: [0,0,0], thread: [88,0,0] Assertion `false` failed. + +test_fused_all_gather_scaled_matmul fails with a NCCL error due to FP8 usage and hangs forever. 
+See https://github.com/pytorch/pytorch/issues/171796 + +test_fused_scaled_matmul_reduce_scatter fails with +> RuntimeError: torch._scaled_mm is only supported on CUDA devices with compute capability >= 9.0 or 8.9, or ROCm MI300+ + + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py +index 0a0f3ee4ca2..07702566fd8 100644 +--- a/test/distributed/test_c10d_nccl.py ++++ b/test/distributed/test_c10d_nccl.py +@@ -3350,7 +3350,7 @@ class CommTest(test_c10d_common.AbstractCommTest, MultiProcessTestCase): + @runOnRocmArch(MI300_ARCH) + def test_intra_node_comm_all_reduce(self): + from torch._C._distributed_c10d import _get_intra_node_comm_usage_counter +- from torch.testing._internal.common_cuda import SM80OrLater ++ from torch.testing._internal.common_cuda import SM90OrLater + + for peer in range(self.world_size): + if peer == self.rank: +@@ -3358,8 +3358,8 @@ class CommTest(test_c10d_common.AbstractCommTest, MultiProcessTestCase): + if not torch._C._cuda_canDeviceAccessPeer(self.rank, peer): + raise SkipTest("Test requires p2p access") + +- if not SM80OrLater: +- raise SkipTest("Test requires sm>=80") ++ if not SM90OrLater: ++ raise SkipTest("Test requires sm>=90") + + store = c10d.FileStore(self.file_name, self.world_size) + os.environ["ENABLE_INTRA_NODE_COMM"] = "1" +diff --git a/test/distributed/test_symmetric_memory.py b/test/distributed/test_symmetric_memory.py +index eeeb24bec30..9d55b620840 100644 +--- a/test/distributed/test_symmetric_memory.py ++++ b/test/distributed/test_symmetric_memory.py +@@ -4,7 +4,7 @@ import itertools + import os + import random + from contextlib import nullcontext +-from unittest import skip, skipIf ++from unittest import skip, skipIf, skipUnless + + import torch + import torch.distributed as dist +@@ -22,7 +22,7 @@ from torch.distributed._symmetric_memory import ( + restride_A_for_fused_matmul_reduce_scatter, + restride_A_shard_for_fused_all_gather_matmul, + ) +-from torch.testing._internal.common_cuda import _get_torch_cuda_version, SM90OrLater ++from torch.testing._internal.common_cuda import _get_torch_cuda_version, SM90OrLater, IS_SM89 + from torch.testing._internal.common_device_type import e4m3_type + from torch.testing._internal.common_distributed import ( + MultiProcContinuousTest, +@@ -399,6 +399,10 @@ class AsyncTPTest(MultiProcContinuousTest): + + @runOnRocmArch(MI300_ARCH) + @skip_if_lt_x_gpu(2) ++ @skipIf( ++ not SM90OrLater, ++ "_fused_all_gather_scaled_matmul_fallback w/ FP8 only supports sm>=90", ++ ) + @parametrize("gather_dim", [0, 1]) + @parametrize( + "scale_mode", ["tensor-wise", "row-wise-replicated", "row-wise-sharded"] +@@ -512,6 +516,10 @@ class AsyncTPTest(MultiProcContinuousTest): + + @skipIfRocm # AsyncTP support changed _fused_scaled_matmul_reduce_scatter_fallback API, need more changes + @skip_if_lt_x_gpu(2) ++ @skipUnless( ++ SM90OrLater or IS_SM89, ++ "torch._scaled_mm (from fused_scaled_matmul_reduce_scatter) only supports sm>=90 or 8.9", ++ ) + @parametrize("scatter_dim", [0, 1]) + @parametrize("rowwise", [True, False]) + def test_fused_scaled_matmul_reduce_scatter( diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-check-cutlass.py b/easybuild/easyconfigs/p/PyTorch/PyTorch-check-cutlass.py new file mode 100755 index 00000000000..73d9951b78a --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-check-cutlass.py @@ -0,0 +1,19 @@ +#!/usr/bin/env python + +# Verify that PyTorch can load CUTLASS, required for the CUTLASS inductor backend +# Author: 
Alexander Grund (TU Dresden) + +import os +import tempfile +from torch._inductor.codegen.cuda.cutlass_utils import try_import_cutlass, config + +# Isolate from default path used +os.environ['TORCHINDUCTOR_CACHE_DIR'] = tempfile.mkdtemp(suffix='inductor_cache') +# Use empty working directory +os.chdir(tempfile.mkdtemp(suffix='cwd')) + + +if try_import_cutlass(): + print(f"CUTLASS is set up using {config.cuda.cutlass_dir}") +else: + raise RuntimeError("CUTLASS is NOT working")
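+
+# Optional follow-up check (editorial sketch, not part of the original script): when run with the
+# installed module loaded, TORCHINDUCTOR_CUTLASS_DIR should point at the copy placed under
+# extra/cutlass by postinstallcmds. The header path below is an assumption about the CUTLASS
+# source layout, so treat a mismatch as a hint rather than a hard failure of the installation.
+cutlass_dir = os.environ.get('TORCHINDUCTOR_CUTLASS_DIR')
+if cutlass_dir:
+    header = os.path.join(cutlass_dir, 'include', 'cutlass', 'cutlass.h')
+    if os.path.isfile(header):
+        print(f"TORCHINDUCTOR_CUTLASS_DIR looks valid: {header}")
+    else:
+        raise RuntimeError(f"CUTLASS headers not found under {cutlass_dir}")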