diff --git a/easybuild/easyconfigs/c/cuDNN/cuDNN-9.5.1.17-CUDA-12.6.0.eb b/easybuild/easyconfigs/c/cuDNN/cuDNN-9.5.1.17-CUDA-12.6.0.eb new file mode 100644 index 00000000000..77463d0096b --- /dev/null +++ b/easybuild/easyconfigs/c/cuDNN/cuDNN-9.5.1.17-CUDA-12.6.0.eb @@ -0,0 +1,34 @@ +name = 'cuDNN' +version = '9.5.1.17' +versionsuffix = '-CUDA-%(cudaver)s' +homepage = 'https://developer.nvidia.com/cudnn' +description = """The NVIDIA CUDA Deep Neural Network library (cuDNN) is +a GPU-accelerated library of primitives for deep neural networks.""" + +toolchain = SYSTEM + +source_urls = [ + 'https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-%(cudnnarch)s/' +] +# note: cuDNN is tied to specific CUDA versions, +# see also https://docs.nvidia.com/deeplearning/cudnn/support-matrix/index.html#cudnn-cuda-hardware-versions +sources = ['%(namelower)s-linux-%(cudnnarch)s-%(version)s_cuda%(cudamajver)s-archive.tar.xz'] +checksums = [{ + '%(namelower)s-linux-sbsa-%(version)s_cuda%(cudamajver)s-archive.tar.xz': + '340c49b32c133b0321c5c5b00d14fb64887dcac83ee8fd24195d9191061f1ad7', + '%(namelower)s-linux-x86_64-%(version)s_cuda%(cudamajver)s-archive.tar.xz': + '35dd20b9c68324ae1288ac36f66ab1f318d2bfecfafb703a82617aa283272be4', +}] + +dependencies = [('CUDA', '12.6.0')] + +sanity_check_paths = { + 'files': [ + 'include/cudnn.h', 'lib64/libcudnn_adv_static.a', 'lib64/libcudnn_cnn_static.a', + 'lib64/libcudnn_engines_precompiled_static.a', 'lib64/libcudnn_engines_runtime_compiled_static.a', + 'lib64/libcudnn_graph_static.a', 'lib64/libcudnn_heuristic_static.a', 'lib64/libcudnn_ops_static.a', + ], + 'dirs': ['include', 'lib64'], +} + +moduleclass = 'numlib' diff --git a/easybuild/easyconfigs/c/cuSPARSELt/cuSPARSELt-0.6.3.2-CUDA-12.6.0.eb b/easybuild/easyconfigs/c/cuSPARSELt/cuSPARSELt-0.6.3.2-CUDA-12.6.0.eb new file mode 100644 index 00000000000..865a0d77716 --- /dev/null +++ b/easybuild/easyconfigs/c/cuSPARSELt/cuSPARSELt-0.6.3.2-CUDA-12.6.0.eb @@ -0,0 +1,32 @@ 
+easyblock = 'Tarball' + +name = 'cuSPARSELt' +version = '0.6.3.2' +versionsuffix = '-CUDA-%(cudaver)s' + +homepage = 'https://docs.nvidia.com/cuda/cusparselt/index.html' +description = """NVIDIA cuSPARSELt is a high-performance CUDA library dedicated to general matrix-matrix operations in +which at least one operand is a sparse matrix""" + +toolchain = SYSTEM + +local_arch = {'arm64': 'sbsa', 'aarch64': 'sbsa'}.get(ARCH, ARCH) +source_urls = ['https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-%s/' % local_arch] +sources = ['libcusparse_lt-linux-%s-%%(version)s-archive.tar.xz' % local_arch] +checksums = [{ + 'libcusparse_lt-linux-x86_64-%(version)s-archive.tar.xz': + 'a2f856e78943f5c538bdef1c9edc64a5ed30bf8bb7d5fcb615c684ffe776cc31', + 'libcusparse_lt-linux-sbsa-%(version)s-archive.tar.xz': + '3e420ddbff4eb9ac603f57c7aa8b3d5271112816e244eb55ef9f30c4eb6a04b7', +}] + +dependencies = [('CUDA', '12.6.0')] + +sanity_check_paths = { + 'files': ['include/cusparseLt.h', + 'lib/libcusparseLt.%s' % SHLIB_EXT, + 'lib/libcusparseLt_static.a'], + 'dirs': [], +} + +moduleclass = 'lib' diff --git a/easybuild/easyconfigs/n/NCCL/NCCL-2.26.2-GCCcore-13.3.0-CUDA-12.6.0.eb b/easybuild/easyconfigs/n/NCCL/NCCL-2.26.2-GCCcore-13.3.0-CUDA-12.6.0.eb new file mode 100644 index 00000000000..5a167a50e7e --- /dev/null +++ b/easybuild/easyconfigs/n/NCCL/NCCL-2.26.2-GCCcore-13.3.0-CUDA-12.6.0.eb @@ -0,0 +1,26 @@ +name = 'NCCL' +version = '2.26.2' +versionsuffix = '-CUDA-%(cudaver)s' + +homepage = 'https://developer.nvidia.com/nccl' +description = """The NVIDIA Collective Communications Library (NCCL) implements multi-GPU and multi-node collective +communication primitives that are performance optimized for NVIDIA GPUs.""" + +toolchain = {'name': 'GCCcore', 'version': '13.3.0'} + +github_account = 'NVIDIA' +source_urls = [GITHUB_SOURCE] +sources = ['v%(version)s-1.tar.gz'] +checksums = ['74c6ab40c864d79c2139508e9419de5970cb406ec85f001d5f834d5f5c0c4f3b'] + 
+builddependencies = [('binutils', '2.42')] + +dependencies = [ + ('CUDA', '12.6.0', '', SYSTEM), + ('UCX-CUDA', '1.16.0', versionsuffix), +] + +# default CUDA compute capabilities to use (override via --cuda-compute-capabilities) +cuda_compute_capabilities = ['5.0', '6.0', '7.0', '7.5', '8.0', '8.6', '9.0'] + +moduleclass = 'lib' diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.0_avoid_caffe2_test_cpp_jit.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.0_avoid_caffe2_test_cpp_jit.patch new file mode 100644 index 00000000000..f07706b8d37 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.0_avoid_caffe2_test_cpp_jit.patch @@ -0,0 +1,14 @@ +Avoid tripping on //caffe2/test/cpp/jit:test_custom_class_registrations with IS_SANDCASTLE + +Author: Alexander Grund (TU Dresden) +--- a/torch/testing/_internal/torchbind_impls.py ++++ b/torch/testing/_internal/torchbind_impls.py +@@ -116,8 +116,6 @@ def load_torchbind_test_lib(): + + if IS_MACOS: + raise unittest.SkipTest("non-portable load_library call used in test") +- elif IS_SANDCASTLE or IS_FBCODE: +- lib_file_path = Path("//caffe2/test/cpp/jit:test_custom_class_registrations") + elif IS_WINDOWS: + lib_file_path = find_library_location("torchbind_test.dll") + else: diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.0_do-not-checkout-nccl.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.0_do-not-checkout-nccl.patch new file mode 100644 index 00000000000..7b0dd830dd3 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.0_do-not-checkout-nccl.patch @@ -0,0 +1,36 @@ +Don't checkout NCCL when using system NCCL + +Author: Alexander Grund (TU Dresden) + +diff --git a/tools/build_pytorch_libs.py b/tools/build_pytorch_libs.py +index 5dd5a221975..2b8b868eaa8 100644 +--- a/tools/build_pytorch_libs.py ++++ b/tools/build_pytorch_libs.py +@@ -7,7 +7,12 @@ from glob import glob + from pathlib import Path + + from .setup_helpers.cmake import CMake, USE_NINJA +-from .setup_helpers.env import 
check_negative_env_flag, IS_64BIT, IS_WINDOWS ++from .setup_helpers.env import ( ++ check_env_flag, ++ check_negative_env_flag, ++ IS_64BIT, ++ IS_WINDOWS, ++) + + + repo_root = Path(__file__).absolute().parent.parent +@@ -119,7 +124,12 @@ def build_pytorch( + cmake: CMake, + ) -> None: + my_env = _create_build_env() +- checkout_nccl() ++ if ( ++ not check_negative_env_flag("USE_CUDA") ++ and not check_negative_env_flag("USE_NCCL") ++ and not check_env_flag("USE_SYSTEM_NCCL") ++ ): ++ checkout_nccl() + build_test = not check_negative_env_flag("BUILD_TEST") + cmake.generate( + version, cmake_python_library, build_python, build_test, my_env, rerun_cmake diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.0_fix-distributed-tests-without-gpus.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.0_fix-distributed-tests-without-gpus.patch new file mode 100644 index 00000000000..8303cdbaa6a --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.0_fix-distributed-tests-without-gpus.patch @@ -0,0 +1,34 @@ +If there are no GPUs there would be a WORLD_SIZE=0 which doesn't work. +Use a positive number for the NCCL/GLOO tests in that case. 
+ +See https://github.com/pytorch/pytorch/pull/150764 + +Author: Alexander Grund (TU Dresden) +diff --git a/test/run_test.py b/test/run_test.py +index a508d8db4d2..e7bbe6ea086 100755 +--- a/test/run_test.py ++++ b/test/run_test.py +@@ -610,18 +610,19 @@ DISTRIBUTED_TESTS_CONFIG = {} + + + if dist.is_available(): ++ num_gpus = torch.cuda.device_count() + DISTRIBUTED_TESTS_CONFIG["test"] = {"WORLD_SIZE": "1"} + if not TEST_WITH_ROCM and dist.is_mpi_available(): + DISTRIBUTED_TESTS_CONFIG["mpi"] = { + "WORLD_SIZE": "3", + } +- if dist.is_nccl_available(): ++ if dist.is_nccl_available() and num_gpus > 0: + DISTRIBUTED_TESTS_CONFIG["nccl"] = { +- "WORLD_SIZE": f"{torch.cuda.device_count()}", ++ "WORLD_SIZE": f"{num_gpus}", + } +- if dist.is_gloo_available(): ++ if dist.is_gloo_available() and num_gpus > 0: + DISTRIBUTED_TESTS_CONFIG["gloo"] = { + # TODO: retire testing gloo with CUDA +- "WORLD_SIZE": f"{torch.cuda.device_count()}", ++ "WORLD_SIZE": f"{num_gpus}", + } + # Test with UCC backend is deprecated. diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.0_fix-skip-decorators.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.0_fix-skip-decorators.patch new file mode 100644 index 00000000000..f638ac4f843 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.0_fix-skip-decorators.patch @@ -0,0 +1,112 @@ +The decorators are implemented to run when the function is called which is after +the test `setup` method spawned subprocesses which may use NCCL to sync failing when there are +not enough GPUs available. +So replace the custom code by calls to the `unittest` skip decorators. 
+See https://github.com/pytorch/pytorch/pull/109491 + +Author: Alexander Grund (TU Dresden) +diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py +index d34b1ffdb0a..8f9628f209b 100644 +--- a/torch/testing/_internal/common_distributed.py ++++ b/torch/testing/_internal/common_distributed.py +@@ -155,17 +155,7 @@ def skip_if_odd_worldsize(func): + + + def require_n_gpus_for_nccl_backend(n, backend): +- def decorator(func): +- @wraps(func) +- def wrapper(*args, **kwargs): +- if backend == "nccl" and torch.cuda.device_count() < n: +- sys.exit(TEST_SKIPS[f"multi-gpu-{n}"].exit_code) +- else: +- return func(*args, **kwargs) +- +- return wrapper +- +- return decorator ++ return unittest.skipUnless(at_least_x_gpu(n), TEST_SKIPS[f"multi-gpu-{n}"].message) if backend == "nccl" else unittest.skipIf(False, None) + + + def import_transformers_or_skip(): +@@ -197,20 +187,10 @@ def at_least_x_gpu(x): + + + def skip_if_lt_x_gpu(x): +- def decorator(func): +- @wraps(func) +- def wrapper(*args, **kwargs): +- if torch.cuda.is_available() and torch.cuda.device_count() >= x: +- return func(*args, **kwargs) +- if TEST_HPU and torch.hpu.device_count() >= x: +- return func(*args, **kwargs) +- if TEST_XPU and torch.xpu.device_count() >= x: +- return func(*args, **kwargs) +- sys.exit(TEST_SKIPS[f"multi-gpu-{x}"].exit_code) +- +- return wrapper +- +- return decorator ++ return unittest.skipUnless(torch.cuda.device_count() >= x or ( ++ TEST_HPU and torch.hpu.device_count() >= x) or ( ++ TEST_XPU and torch.xpu.device_count() >= x), ++ TEST_SKIPS[f"multi-gpu-{x}"].message) + + + # This decorator helps avoiding initializing cuda while testing other backends +diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py +index a4d6d53b975..0da1d9baddf 100644 +--- a/torch/testing/_internal/distributed/distributed_test.py ++++ 
b/torch/testing/_internal/distributed/distributed_test.py +@@ -66,7 +66,6 @@ from torch.testing._internal.common_distributed import ( + skip_if_small_worldsize, + skip_if_odd_worldsize, + skip_if_lt_x_gpu, +- nccl_skip_if_lt_x_gpu, + skip_if_no_gpu, + require_n_gpus_for_nccl_backend, + requires_nccl_version, +@@ -5299,7 +5298,7 @@ class DistributedTest: + BACKEND != "mpi" and BACKEND != "nccl" and BACKEND != "gloo", + "get_future is only supported on mpi, nccl and gloo", + ) +- @nccl_skip_if_lt_x_gpu(BACKEND, 2) ++ @require_n_gpus_for_nccl_backend(2, BACKEND) + def test_accumulate_gradients_no_sync(self): + """ + Runs _test_accumulate_gradients_no_sync using default inputs +@@ -5310,7 +5309,7 @@ class DistributedTest: + BACKEND != "mpi" and BACKEND != "nccl" and BACKEND != "gloo", + "get_future is only supported on mpi, nccl and gloo", + ) +- @nccl_skip_if_lt_x_gpu(BACKEND, 2) ++ @require_n_gpus_for_nccl_backend(2, BACKEND) + def test_accumulate_gradients_no_sync_grad_is_view(self): + """ + Runs _test_accumulate_gradients_no_sync using default inputs +@@ -5321,7 +5320,7 @@ class DistributedTest: + BACKEND != "mpi" and BACKEND != "nccl" and BACKEND != "gloo", + "get_future is only supported on mpi, nccl and gloo", + ) +- @nccl_skip_if_lt_x_gpu(BACKEND, 2) ++ @require_n_gpus_for_nccl_backend(2, BACKEND) + def test_accumulate_gradients_no_sync_allreduce_hook(self): + """ + Runs multiple iterations on _test_accumulate_gradients_no_sync +@@ -5349,7 +5348,7 @@ class DistributedTest: + BACKEND != "mpi" and BACKEND != "nccl" and BACKEND != "gloo", + "get_future is only supported on mpi, nccl and gloo", + ) +- @nccl_skip_if_lt_x_gpu(BACKEND, 2) ++ @require_n_gpus_for_nccl_backend(2, BACKEND) + def test_accumulate_gradients_no_sync_allreduce_with_then_hook(self): + """ + Runs multiple iterations on _test_accumulate_gradients_no_sync using allreduce +@@ -5383,7 +5382,7 @@ class DistributedTest: + BACKEND != "mpi" and BACKEND != "nccl" and BACKEND != "gloo", + "get_future is 
only supported on mpi, nccl and gloo", + ) +- @nccl_skip_if_lt_x_gpu(BACKEND, 2) ++ @require_n_gpus_for_nccl_backend(2, BACKEND) + def test_get_future(self): + def mult(fut): + return [t * 3 for t in fut.wait()] diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.0_fix-test_linear_with_embedding.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.0_fix-test_linear_with_embedding.patch new file mode 100644 index 00000000000..d6e02ac25a6 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.0_fix-test_linear_with_embedding.patch @@ -0,0 +1,36 @@ +TestSelectAlgorithmCPU.test_linear_with_embedding fails when the CPU does not support BF16: +> torch._inductor.exc.InductorError: LoweringException: RuntimeError: self and mat2 must have the same dtype, but got Float and BFloat16 +See https://github.com/pytorch/pytorch/issues/147104 + +Convert the embedding layer to avoid it using "Float" and adapt the check for this change. + +Author: Alexander Grund (TU Dresden) +--- a/test/inductor/test_cpu_select_algorithm.py ++++ b/test/inductor/test_cpu_select_algorithm.py +@@ -932,6 +932,7 @@ class TestSelectAlgorithm(BaseTestSelectAlgorithm): + def test_linear_with_embedding( + self, batch_size, in_features, out_features, bias, dtype + ): ++ has_bf16 = torch.ops.mkldnn._is_mkldnn_bf16_supported() + class M(torch.nn.Module): + def __init__(self, bias): + super().__init__() +@@ -939,6 +940,9 @@ class TestSelectAlgorithm(BaseTestSelectAlgorithm): + dtype=dtype + ) + self.emb = torch.nn.Embedding(64, out_features) ++ if not has_bf16: ++ self.emb = self.emb.to(dtype=dtype) ++ + + def forward(self, idx, x): + return self.emb(idx) + self.linear(x) +@@ -953,7 +957,7 @@ class TestSelectAlgorithm(BaseTestSelectAlgorithm): + with verify(dtype) as (atol, rtol): + self.common(mod, (idx, x), atol=atol, rtol=rtol) + self.assertEqual(counters["inductor"]["cpp_templated_kernel_counter"], 1) +- self.assertEqual(counters["inductor"]["cpp_epilogue_fusion_counter"], 1) ++ 
self.assertEqual(counters["inductor"]["cpp_epilogue_fusion_counter"], 1 if has_bf16 else 0) + + @inductor_config.patch({"freezing": True}) + @patches diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.0_fix-test_linear_with_in_out_buffer-without-mkl.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.0_fix-test_linear_with_in_out_buffer-without-mkl.patch new file mode 100644 index 00000000000..6067d565e20 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.0_fix-test_linear_with_in_out_buffer-without-mkl.patch @@ -0,0 +1,17 @@ +This test fails when FlexiBLAS is used instead of MKL. +Adjust the expected count. +See https://github.com/pytorch/pytorch/pull/151548 + +Author: Alexander Grund (TU Dresden) +--- a/test/inductor/test_cpu_select_algorithm.py ++++ b/test/inductor/test_cpu_select_algorithm.py +@@ -1300,7 +1304,7 @@ class TestSelectAlgorithm(BaseTestSelectAlgorithm): + rtol=rtol, + ) + self.assertEqual(counters["inductor"]["cpp_templated_kernel_counter"], 2) +- self.assertEqual(counters["inductor"]["cpp_epilogue_fusion_counter"], 2) ++ self.assertEqual(counters["inductor"]["cpp_epilogue_fusion_counter"], 2 if TEST_MKL else 1) + + @inductor_config.patch({"freezing": True}) + @patches + diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.0_skip-test_init_from_local_shards.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.0_skip-test_init_from_local_shards.patch new file mode 100644 index 00000000000..942b2758b3b --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.0_skip-test_init_from_local_shards.patch @@ -0,0 +1,24 @@ +The test often times out and seems to be considered flaky by PyTorch: +https://github.com/pytorch/pytorch/issues/78068 + +Author: Alexander Grund (TU Dresden) +diff --git a/test/distributed/_shard/sharded_tensor/test_sharded_tensor.py b/test/distributed/_shard/sharded_tensor/test_sharded_tensor.py +index 730b2c2c0ac..5f9b9545700 100644 +--- a/test/distributed/_shard/sharded_tensor/test_sharded_tensor.py ++++ 
b/test/distributed/_shard/sharded_tensor/test_sharded_tensor.py +@@ -6,6 +6,7 @@ import itertools + import math + import pickle + import sys ++from unittest import skip + + import torch + import torch.distributed as dist +@@ -2432,6 +2432,7 @@ class TestShardedTensorFromLocalShards(ShardedTensorTestBase): + @with_comms + @skip_if_lt_x_gpu(4) + @requires_nccl() ++ @skip("Times out often") + def test_init_from_local_shards(self): + local_shard_metadata = ShardMetadata( + shard_offsets=[(self.rank // 2) * 5, (self.rank % 2) * 5], diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1-foss-2024a-CUDA-12.6.0.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1-foss-2024a-CUDA-12.6.0.eb new file mode 100644 index 00000000000..9e9de33ed13 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1-foss-2024a-CUDA-12.6.0.eb @@ -0,0 +1,291 @@ +name = 'PyTorch' +version = '2.7.1' +versionsuffix = '-CUDA-%(cudaver)s' + +homepage = 'https://pytorch.org/' +description = """Tensors and Dynamic neural networks in Python with strong GPU acceleration. 
+PyTorch is a deep learning framework that puts Python first.""" + +toolchain = {'name': 'foss', 'version': '2024a'} + +local_six_version = '1.11.0' +source_urls = [GITHUB_RELEASE] +sources = [ + '%(namelower)s-v%(version)s.tar.gz', + { + # Avoid downloading this during the build, see third_party/NNPACK/cmake/DownloadSix.cmake for the version + 'filename': f'six-{local_six_version}.tar.gz', + 'source_urls': + ['https://pypi.python.org/packages/16/d8/bc6316cf98419719bd59c91742194c111b6f2e85abac88e496adefaf7afe'], + } +] +patches = [ + 'PyTorch-1.12.1_add-hypothesis-suppression.patch', + 'PyTorch-1.13.1_skip-failing-singular-grad-test.patch', + 'PyTorch-1.7.0_disable-dev-shm-test.patch', + 'PyTorch-2.0.1_avoid-test_quantization-failures.patch', + 'PyTorch-2.0.1_skip-test_shuffle_reproducibility.patch', + 'PyTorch-2.0.1_skip-tests-skipped-in-subprocess.patch', + 'PyTorch-2.1.0_remove-test-requiring-online-access.patch', + 'PyTorch-2.1.0_skip-dynamo-test_predispatch.patch', + 'PyTorch-2.1.2_workaround_dynamo_failure_without_nnpack.patch', + 'PyTorch-2.3.0_disable_test_linear_package_if_no_half_types_are_available.patch', + 'PyTorch-2.3.0_skip_test_var_mean_differentiable.patch', + 'PyTorch-2.6.0_disable_DataType_dependent_test_if_tensorboard_is_not_available.patch', + 'PyTorch-2.6.0_disable_tests_which_need_network_download.patch', + 'PyTorch-2.6.0_disable-gcc12-warnings.patch', + 'PyTorch-2.6.0_fix-accuracy-issues-in-linalg_solve.patch', + 'PyTorch-2.6.0_fix-server-in-test_control_plane.patch', + 'PyTorch-2.6.0_fix-vsx-vector-shift-functions.patch', + 'PyTorch-2.6.0_increase-tolerance-test_aotdispatch-matmul.patch', + 'PyTorch-2.6.0_increase-tolerance-test_quick-baddbmm.patch', + 'PyTorch-2.6.0_increase-tolerance-test_vmap_autograd_grad.patch', + 'PyTorch-2.6.0_show-test-duration.patch', + 'PyTorch-2.6.0_skip-diff-test-on-ppc.patch', + 'PyTorch-2.6.0_skip-test_segfault.patch', + 'PyTorch-2.6.0_skip-test-requiring-MKL.patch', + 
'PyTorch-2.7.0_avoid_caffe2_test_cpp_jit.patch', + 'PyTorch-2.7.0_do-not-checkout-nccl.patch', + 'PyTorch-2.7.0_fix-distributed-tests-without-gpus.patch', + 'PyTorch-2.7.0_fix-skip-decorators.patch', + 'PyTorch-2.7.0_fix-test_linear_with_embedding.patch', + 'PyTorch-2.7.0_fix-test_linear_with_in_out_buffer-without-mkl.patch', + 'PyTorch-2.7.0_skip-test_init_from_local_shards.patch', + 'PyTorch-2.7.1_avoid-caffe2-sandcastle-test-lib.patch', + 'PyTorch-2.7.1_do-not-rpath-stubs-dir-in-jit-code.patch', + 'PyTorch-2.7.1_exit-test_c10d_ops_nccl-with-succes-when-no-gpu.patch', + 'PyTorch-2.7.1_fix-cuda-12.6-driver-api-usage.patch', + 'PyTorch-2.7.1_fix-CUDASymmetricMemory-group.patch', + 'PyTorch-2.7.1_fix-nccl-test-env.patch', + 'PyTorch-2.7.1_fix-test_ck_blas_library_cpu.patch', + 'PyTorch-2.7.1_fix-test_fsdp_ep.patch', + 'PyTorch-2.7.1_fix-test_ir_count.patch', + 'PyTorch-2.7.1_fix-test_torchinductor_dynamic-tests.patch', + 'PyTorch-2.7.1_increase-tolerance-for-sum-reduction-test.patch', + 'PyTorch-2.7.1_increase-tolerance-for-test_freeze_conv_relu_fusion.patch', + 'PyTorch-2.7.1_init-cutlass-include-dirs.patch', + 'PyTorch-2.7.1_keep-CMAKE_PREFIX_PATH-in-test.patch', + 'PyTorch-2.7.1_remove-faulty-close.patch', + 'PyTorch-2.7.1_remove-test_close_multi_pg_unordered.patch', + 'PyTorch-2.7.1_serialize-test_host_memory_stats.patch', + 'PyTorch-2.7.1_skip-failing-max_autotune-tests.patch', + 'PyTorch-2.7.1_skip-failing-schedule-test.patch', + 'PyTorch-2.7.1_skip-NCCL-tests-without-GPUs.patch', + 'PyTorch-2.7.1_skip-test_benchmark_on_non_zero_device.patch', + 'PyTorch-2.7.1_skip-test_data_parallel_rnn.patch', + 'PyTorch-2.7.1_skip-test_gds_fails_in_ci.patch', + 'PyTorch-2.7.1_skip-test_lowering_one_shot_all_reduce.patch', + 'PyTorch-2.7.1_skip-test_mixed_mm_exhaustive_dtypes.patch', + 'PyTorch-2.7.1_skip-test_outside_linear_module_free.patch', + 'PyTorch-2.7.1_skip-test_override-without-CUDA.patch', + 'PyTorch-2.7.1_skip-TestFP8Lowering.patch', + 
'PyTorch-2.7.1_skip-tests-requiring-cc89.patch', + 'PyTorch-2.7.1_skip-tests-requiring-SM90.patch', + 'PyTorch-2.7.1_suport-64bit-BARs.patch', + 'PyTorch-2.7.1_tolerance-test_partial_flat_weights.patch', +] +checksums = [ + {'pytorch-v2.7.1.tar.gz': '5befd2e540fd55ce4782d0ca7610ce5b572d756d7ea38090ef0f3c7c428fb20f'}, + {f"six-{local_six_version}.tar.gz": "70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9"}, + {'PyTorch-1.12.1_add-hypothesis-suppression.patch': + 'e71ffb94ebe69f580fa70e0de84017058325fdff944866d6bd03463626edc32c'}, + {'PyTorch-1.13.1_skip-failing-singular-grad-test.patch': + '72688a57b2bb617665ad1a1d5e362c5111ae912c10936bb38a089c0204729f48'}, + {'PyTorch-1.7.0_disable-dev-shm-test.patch': '622cb1eaeadc06e13128a862d9946bcc1f1edd3d02b259c56a9aecc4d5406b8a'}, + {'PyTorch-2.0.1_avoid-test_quantization-failures.patch': + '02e3f47e4ed1d7d6077e26f1ae50073dc2b20426269930b505f4aefe5d2f33cd'}, + {'PyTorch-2.0.1_skip-test_shuffle_reproducibility.patch': + '7047862abc1abaff62954da59700f36d4f39fcf83167a638183b1b7f8fec78ae'}, + {'PyTorch-2.0.1_skip-tests-skipped-in-subprocess.patch': + '166c134573a95230e39b9ea09ece3ad8072f39d370c9a88fb2a1e24f6aaac2b5'}, + {'PyTorch-2.1.0_remove-test-requiring-online-access.patch': + '35184b8c5a1b10f79e511cc25db3b8a5585a5d58b5d1aa25dd3d250200b14fd7'}, + {'PyTorch-2.1.0_skip-dynamo-test_predispatch.patch': + '6298daf9ddaa8542850eee9ea005f28594ab65b1f87af43d8aeca1579a8c4354'}, + {'PyTorch-2.1.2_workaround_dynamo_failure_without_nnpack.patch': + 'fb96eefabf394617bbb3fbd3a7a7c1aa5991b3836edc2e5d2a30e708bfe49ba1'}, + {'PyTorch-2.3.0_disable_test_linear_package_if_no_half_types_are_available.patch': + '23416f2d9d5226695ec3fbea0671e3650c655c19deefd3f0f8ddab5afa50f485'}, + {'PyTorch-2.3.0_skip_test_var_mean_differentiable.patch': + '9703fd0f1fca8916f6d79d83e9a7efe8e3f717362a5fdaa8f5d9da90d0c75018'}, + {'PyTorch-2.6.0_disable_DataType_dependent_test_if_tensorboard_is_not_available.patch': + 
'74db866787f1e666ed3b35db5204f05a0ba8d989fb23057a72dd07928388dc46'}, + {'PyTorch-2.6.0_disable_tests_which_need_network_download.patch': + 'fe76129811e4eb24d0e12c397335a4c7971b0c4e48ce9cdb9169f3ef9de7aac4'}, + {'PyTorch-2.6.0_disable-gcc12-warnings.patch': '892643650788b743106ebe4e70c68be42a756eba797f0f79e31708d6e008a620'}, + {'PyTorch-2.6.0_fix-accuracy-issues-in-linalg_solve.patch': + 'a6b1cfe8f03ad5b17437e04e6a0369a25fcc79eed939ce6912ceca1c0ab0f444'}, + {'PyTorch-2.6.0_fix-server-in-test_control_plane.patch': + '1337689ff28ecaa8d1d0edf60d322bcdd7846fec040925325d357b19eb6e4342'}, + {'PyTorch-2.6.0_fix-vsx-vector-shift-functions.patch': + '82ce0b48e3b7c3dfd3a2ba915f4675d5c3a6d149646e1e0d6a29eedbbaecc8bd'}, + {'PyTorch-2.6.0_increase-tolerance-test_aotdispatch-matmul.patch': + 'c1c6ea41504e4479d258225ecefc7e9c5726934601610904ae555501a11e9109'}, + {'PyTorch-2.6.0_increase-tolerance-test_quick-baddbmm.patch': + '9850facdfb5d98451249570788217ede07466cae9ba52cd03afd3ec803ba33c9'}, + {'PyTorch-2.6.0_increase-tolerance-test_vmap_autograd_grad.patch': + '8d5eb53bb0a1456af333ae646c860033d6dd037bd9152601a200ca5c10ebf3cb'}, + {'PyTorch-2.6.0_show-test-duration.patch': '5508f2f9619204d9f3c356dbd4000a00d58f452ab2d64ae920eb8bc8b5484d75'}, + {'PyTorch-2.6.0_skip-diff-test-on-ppc.patch': '6f2f87cad1b0ab8c5a0c7b3f7fbc14e4bdfbe61da26a3934ded9dda7fe368c74'}, + {'PyTorch-2.6.0_skip-test_segfault.patch': '26806bd62e6b61b56ebaa52d68ca44c415a28124f684bd2fb373557ada68ef52'}, + {'PyTorch-2.6.0_skip-test-requiring-MKL.patch': 'f1c9b1c77b09d59317fd52d390e7d948a147325b927ad6373c1fa1d1d6ea1ea8'}, + {'PyTorch-2.7.0_avoid_caffe2_test_cpp_jit.patch': + '2f3255e067f5c6f0d78b4fbce94784c41bddf3d01bab9673856b0d0bbc4e3fec'}, + {'PyTorch-2.7.0_do-not-checkout-nccl.patch': 'ad085a15dd36768ad33a934f53dc595da745e01697b44d431f8b70ae9d0eb567'}, + {'PyTorch-2.7.0_fix-distributed-tests-without-gpus.patch': + '99d92db44f856b2fb05c221f201e50c21e57a7f6f35824f8274a380875029f24'}, + 
{'PyTorch-2.7.0_fix-skip-decorators.patch': 'a5197594f8b076f9a2d03ae3aa725018d55889b737a12b74d6872b5c1bd1e809'}, + {'PyTorch-2.7.0_fix-test_linear_with_embedding.patch': + '276b100a4a405fae6a9517cec1ca166b6f8097668f08f7e20aacf3cb766f9a2a'}, + {'PyTorch-2.7.0_fix-test_linear_with_in_out_buffer-without-mkl.patch': + '507931ad00afab098ef9df99ac32c28c61c11ca0e0ac2c55570d9b9e7dc8ef38'}, + {'PyTorch-2.7.0_skip-test_init_from_local_shards.patch': + '655e57763c6ddc3d8b52ed67aaf0f59874441a69161d10c14ab8860f9be5c332'}, + {'PyTorch-2.7.1_avoid-caffe2-sandcastle-test-lib.patch': + 'aaf22cb431357dc78e4db895d64febf1c7ee187e8ad27bd13544d011127354d4'}, + {'PyTorch-2.7.1_do-not-rpath-stubs-dir-in-jit-code.patch': + '6ab92ce23618c74a4950a6dc652d8ea1ff03c101c4f93a9186da29e136b17b1a'}, + {'PyTorch-2.7.1_exit-test_c10d_ops_nccl-with-succes-when-no-gpu.patch': + 'ad46a9167ceeafe073618588b2ca13cdef431aae732713b5dc545a93eb9cd076'}, + {'PyTorch-2.7.1_fix-cuda-12.6-driver-api-usage.patch': + '51b2e51ff8419f263f0b9f4352fb503f6f48f2950076f9596b299ff2a0121747'}, + {'PyTorch-2.7.1_fix-CUDASymmetricMemory-group.patch': + '9184b48af7b7caa77f038c911c43cd85f0daa6992f1197adb0ad27b80f5fc40a'}, + {'PyTorch-2.7.1_fix-nccl-test-env.patch': 'ddb052d217c9811aa2c96e71d52149d2e531b9dfb3b14ca4c7d87d33f54d30cd'}, + {'PyTorch-2.7.1_fix-test_ck_blas_library_cpu.patch': + '9df61b4ed2bd7f4a30df463cd2c5d4cb84d57932909b34dfa360d214425a5fee'}, + {'PyTorch-2.7.1_fix-test_fsdp_ep.patch': '9cd2da8027e440dd3069fcbd5692703dbfbb9fa9046ebcc5092669a10408b6ef'}, + {'PyTorch-2.7.1_fix-test_ir_count.patch': 'ba3dc48ee356d48ced89e2d6fceb8c8e91caccd0bda600e4ec4c3540cf434cad'}, + {'PyTorch-2.7.1_fix-test_torchinductor_dynamic-tests.patch': + '1343babbe9fb8a2cc8a12481647340e50f63beffbfa1e92ed4e5a6203f857af4'}, + {'PyTorch-2.7.1_increase-tolerance-for-sum-reduction-test.patch': + '1280259d12a4cf9fcfc22f4b92796072f9ee37b9734c77ebdcbf43d42235d15a'}, + {'PyTorch-2.7.1_increase-tolerance-for-test_freeze_conv_relu_fusion.patch': + 
'e24f7fa6f43c5ea0fab2c2b876644649948aabae0b2d239e845e20dfd607b7e6'}, + {'PyTorch-2.7.1_init-cutlass-include-dirs.patch': + '682a295519c81afb692caba66eb5e64570f938525e7ded803627884c382d509a'}, + {'PyTorch-2.7.1_keep-CMAKE_PREFIX_PATH-in-test.patch': + '516d0a9a7490999f979eb9e53ae1efd6fdea6ed5f94c9dbd659fb1e5d1fd022b'}, + {'PyTorch-2.7.1_remove-faulty-close.patch': '315fca3c582534f20da62078156c91b38637f1358cd166b4d33ba964c7b07f95'}, + {'PyTorch-2.7.1_remove-test_close_multi_pg_unordered.patch': + '65a6d430ec359b9fee5f389e05b4c4c592db1ddc12fdab550445b52f3f2a7bfe'}, + {'PyTorch-2.7.1_serialize-test_host_memory_stats.patch': + 'ed17602b0458c9d954cfe0c0d7373a2beee13f1ee8eccf3d5f8131980e319ef0'}, + {'PyTorch-2.7.1_skip-failing-max_autotune-tests.patch': + '8611605060088b0178834d34621d407c6ba03803d65e433971f458c05adf0c10'}, + {'PyTorch-2.7.1_skip-failing-schedule-test.patch': + '50151c6792d64c1e01533218b70f0ed974b934334b008e6df558ae8a9b999910'}, + {'PyTorch-2.7.1_skip-NCCL-tests-without-GPUs.patch': + '550c6976b9e3305ceb25cf2de5d135ca771c49acccd2d331c724ade8ccaecde2'}, + {'PyTorch-2.7.1_skip-test_benchmark_on_non_zero_device.patch': + '1c35b207b6fcc24fbfcdc7552fb4f0c9b77233f8a9032a7caa2b8f94d33491d0'}, + {'PyTorch-2.7.1_skip-test_data_parallel_rnn.patch': + 'aa85b678e89db4bb41d2c5f4990f0d05959be92e61918291cb5609685b7f1841'}, + {'PyTorch-2.7.1_skip-test_gds_fails_in_ci.patch': + '503030c3591196510a3c2d95db30b28a0b396adb8b50ff0d221f6bdb1f939935'}, + {'PyTorch-2.7.1_skip-test_lowering_one_shot_all_reduce.patch': + 'c5235fab6cac29adfa61238ddfa71bee18c470e7b3b58f18cc585a1dc3fbeb65'}, + {'PyTorch-2.7.1_skip-test_mixed_mm_exhaustive_dtypes.patch': + '709288abc802c9eb687c15f2677ebaf408d8325a4cb470d23cb72447ee0b8e13'}, + {'PyTorch-2.7.1_skip-test_outside_linear_module_free.patch': + '4916a256b2b9914e4fdb930681b80df93ea561ddee2fc9978c4973a5650be5e9'}, + {'PyTorch-2.7.1_skip-test_override-without-CUDA.patch': + 'a94654b9ba492be1cef0c8f266d1e16e0c5efb35816164f8c2bfdfda2dfa65f5'}, + 
{'PyTorch-2.7.1_skip-TestFP8Lowering.patch': 'a1b5d15795d1c776fa7dca9e3eb8c5335d940e6961cb7d9980d1bfe49b847391'}, + {'PyTorch-2.7.1_skip-tests-requiring-cc89.patch': + 'ac39e77339196d734837792791baa058732fb2e87180f6007ae5512028b68659'}, + {'PyTorch-2.7.1_skip-tests-requiring-SM90.patch': + '7b5891a96b58d1d404c130233ec5ddbb0ad52afdb9c334bbe4e1f27f6c78ffd8'}, + {'PyTorch-2.7.1_suport-64bit-BARs.patch': '317c3d220aa87426d86e137a6c1a8f910adf9580ca0848371e0f6800c05dbde1'}, + {'PyTorch-2.7.1_tolerance-test_partial_flat_weights.patch': + 'f304440a57e00b8052a5ffbf285adad8d0fdc5a812a659420b59a20deb5a9942'}, +] + +osdependencies = [OS_PKG_IBVERBS_DEV] + +builddependencies = [ + ('CMake', '3.29.3'), + ('hypothesis', '6.103.1'), + ('pybind11', '2.12.0'), + # For tests + ('parameterized', '0.9.0'), + ('pytest-flakefinder', '1.1.0'), + ('pytest-rerunfailures', '15.0'), + ('pytest-shard', '0.1.2'), + ('pytest-subtests', '0.13.1'), + ('tlparse', '0.3.37'), + ('optree', '0.14.1'), + ('unittest-xml-reporting', '3.1.0'), +] + +dependencies = [ + ('CUDA', '12.6.0', '', SYSTEM), + # PyTorch is very sensitive to the NCCL & cuDNN versions. 
(Maybe the same for cuSPARSELt) + # Prefer those (listed per CUDA version) in + # https://github.com/pytorch/pytorch/blob/main/.github/scripts/generate_binary_build_matrix.py + # or https://github.com/pytorch/pytorch/blob/main/.ci/docker/common/install_cuda.sh + ('NCCL', '2.26.2', versionsuffix), + ('cuDNN', '9.5.1.17', versionsuffix, SYSTEM), + ('magma', '2.9.0', versionsuffix), + ('cuSPARSELt', '0.6.3.2', versionsuffix, SYSTEM), + ('nvidia-cutlass', '3.8.0.0', versionsuffix), + # Version from .ci/docker/triton_version.txt + ('Triton', '3.3.1', versionsuffix), + ('Ninja', '1.12.1'), # Required for JIT compilation of C++ extensions + ('Python', '3.12.3'), + ('Python-bundle-PyPI', '2024.06'), + ('expecttest', '0.2.1'), + ('GMP', '6.3.0'), + ('MPFR', '4.2.1'), + ('networkx', '3.4.2'), + ('numactl', '2.0.18'), + ('Pillow', '10.4.0'), + ('protobuf-python', '5.28.0'), + ('protobuf', '28.0'), + ('PuLP', '2.8.0'), + ('PyYAML', '6.0.2'), + ('pyzstd', '0.16.2'), + ('SciPy-bundle', '2024.05'), + ('sympy', '1.13.3'), + ('Z3', '4.13.0',), +] + +buildcmd = '%(python)s setup.py build' # Run the (long) build in the build step + +excluded_tests = { + '': [ + # This test seems to take too long on NVIDIA Ampere at least. + 'distributed/test_distributed_spawn', + # no xdoctest + 'doctests', + # intermittent failures on various systems + # See https://github.com/easybuilders/easybuild-easyconfigs/issues/17712 + 'distributed/rpc/test_tensorpipe_agent', + # This test is expected to fail when run in their CI, but won't in our case. 
+ # It just checks for a "CI" env variable + 'test_ci_sanity_check_fail', + # Requires pwlf Python package + 'distributed/_tools/test_sac_ilp', 'distributed/_tools/test_sac_estimator', + # 9 failures in H100, 7 are present in PYPI package, 2 are related to GC in Python < 3.12.4 + 'dynamo/test_dynamic_shapes', + # Broken test: https://github.com/pytorch/pytorch/issues/162179 + 'distributed/_composable/fsdp/test_fully_shard_logging', + # Broken: https://github.com/pytorch/pytorch/issues/137027 + 'inductor/test_extension_backend', + # Requires optional Python packages + 'test_public_bindings', + # 1 Failure and not important + 'dynamo/test_utils', + ] +} + +local_test_opts = '--continue-through-error --pipe-logs --verbose %(excluded_tests)s' +runtest = 'cd test && PYTEST_ADDOPTS=--full-trace PYTHONUNBUFFERED=1 %(python)s run_test.py ' + local_test_opts + +# ATTENTION: NVIDIA Volta not (fully) supported anymore. +# Allow some more tests to fail, especially due to that +max_failed_tests = 60 + +tests = ['PyTorch-check-cpp-extension.py'] + +moduleclass = 'ai' diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_avoid-caffe2-sandcastle-test-lib.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_avoid-caffe2-sandcastle-test-lib.patch new file mode 100644 index 00000000000..bb3103160a7 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_avoid-caffe2-sandcastle-test-lib.patch @@ -0,0 +1,18 @@ +"//caffe2/test/inductor:custom_ops" is a FB-specific "library" which we pull in by setting IS_SANDCASTLE causing +> OSError: /caffe2/test/inductor:custom_ops: cannot open shared object file: No such file or directory +in inductor/test_aot_inductor_custom_ops.py + +Author: Alexander Grund (TU Dresden) +diff --git a/test/inductor/test_aot_inductor_custom_ops.py b/test/inductor/test_aot_inductor_custom_ops.py +index ce2ef3739d3..7b9dc4792fd 100644 +--- a/test/inductor/test_aot_inductor_custom_ops.py ++++ b/test/inductor/test_aot_inductor_custom_ops.py +@@ -380,7 
+380,7 @@ common_utils.instantiate_parametrized_tests(AOTInductorTestsTemplate) + + class AOTICustomOpTestCase(TestCase): + def setUp(self): +- if IS_SANDCASTLE or IS_FBCODE: ++ if False: + torch.ops.load_library("//caffe2/test/inductor:custom_ops") + elif IS_MACOS: + raise unittest.SkipTest("non-portable load_library call used in test") diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_do-not-rpath-stubs-dir-in-jit-code.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_do-not-rpath-stubs-dir-in-jit-code.patch new file mode 100644 index 00000000000..f03c76c397a --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_do-not-rpath-stubs-dir-in-jit-code.patch @@ -0,0 +1,30 @@ +JIT generated code fails at runtime to call any CUDA function because it rpathes our stubs library. +See https://github.com/pytorch/pytorch/pull/160179 + +Errors look like +> cutlass_library/source/tools/util/include/cutlass/util/device_memory.h:67 cutlass::device_memory::allocate: cudaMalloc failed: bytes=4096 +> terminate called after throwing an instance of 'cutlass::cuda_exception' +> what(): std::exception + + +Author: Alexander Grund (TU Dresden) + +diff --git a/torch/_inductor/codecache.py b/torch/_inductor/codecache.py +index dc9f5c25365..432b445b4ac 100644 +--- a/torch/_inductor/codecache.py ++++ b/torch/_inductor/codecache.py +@@ -2848,9 +2852,13 @@ def _cuda_lib_options() -> list[str]: + if is_linux(): + _transform_cuda_paths(lpaths) + for path in lpaths: ++ extra_ldflags.append(f"-L{path}") + # -rpath ensures the DLL can find its dependencies when loaded, even + # if the library path is non-standard. 
+- extra_ldflags.extend([f"-L{path}", "-Xlinker", f"-rpath={path}"]) ++ # But do not add the stubs folder to rpath as the driver is expected to be found at runtime ++ if os.path.basename(path) != "stubs": ++ extra_ldflags.extend(["-Xlinker", f"-rpath={path}"]) ++ + extra_ldflags.append("-lcuda") + extra_ldflags.append("-lcudart") + else: diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_exit-test_c10d_ops_nccl-with-succes-when-no-gpu.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_exit-test_c10d_ops_nccl-with-succes-when-no-gpu.patch new file mode 100644 index 00000000000..97b18fadd80 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_exit-test_c10d_ops_nccl-with-succes-when-no-gpu.patch @@ -0,0 +1,20 @@ +When the test file is run without GPUs it will exit with a non-zero code +which causes it to be considered a failure. +Exit with code 0 instead when no GPUs are available. + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/distributed/test_c10d_ops_nccl.py b/test/distributed/test_c10d_ops_nccl.py +index 73bad39956c..0d8d3ba9628 100644 +--- a/test/distributed/test_c10d_ops_nccl.py ++++ b/test/distributed/test_c10d_ops_nccl.py +@@ -982,7 +982,8 @@ class ProcessGroupNCCLOpTest(MultiProcContinousTest): + + if __name__ == "__main__": + if not torch.cuda.is_available(): +- sys.exit(TEST_SKIPS["no_cuda"].exit_code) ++ print(TEST_SKIPS["no_cuda"].message) ++ sys.exit(0) + + rank = int(os.getenv("RANK", -1)) + world_size = int(os.getenv("WORLD_SIZE", -1)) diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_fix-CUDASymmetricMemory-group.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_fix-CUDASymmetricMemory-group.patch new file mode 100644 index 00000000000..eb788b47bb2 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_fix-CUDASymmetricMemory-group.patch @@ -0,0 +1,32 @@ +Fix test test/distributed/test_c10d_nccl.py CommTest.test_intra_node_comm_all_reduce failing with +> RuntimeError: get_group_info: no 
group info associated with the group name + +From 9108d153ce49fc31c1e8d71640e19b0dcd159dcc Mon Sep 17 00:00:00 2001 +From: eqy +Date: Wed, 26 Mar 2025 03:59:43 +0000 +Subject: [PATCH] [CUDA]][SymmetricMemory] Interpret empty string as + `std::nullopt` in `rendezvous` (#149793) + +this is a "temporary" fix as current internal API requires strings at some interfaces instead of `std::optional` and empty strings are presumably used in-lieu of `nullopt`. +e.g., +https://github.com/pytorch/pytorch/blob/9d02b3993f7dae7fa3379d5190ac88291ecd4dce/torch/csrc/distributed/c10d/intra_node_comm.cu#L49 + +this currently breaks `test_intra_node_comm_all_reduce` + +Pull Request resolved: https://github.com/pytorch/pytorch/pull/149793 + +diff --git a/torch/csrc/distributed/c10d/CUDASymmetricMemory.cu b/torch/csrc/distributed/c10d/CUDASymmetricMemory.cu +index 172304479e9e..721d2c815875 100644 +--- a/torch/csrc/distributed/c10d/CUDASymmetricMemory.cu ++++ b/torch/csrc/distributed/c10d/CUDASymmetricMemory.cu +@@ -784,7 +784,9 @@ c10::intrusive_ptr CUDASymmetricMemoryAllocator::rendezvous( + // The group_name passed to rendezvous() takes precedence over + // the default group_name specified during allocation. + std::string group_name_; +- if (group_name.has_value()) { ++ // Treat empty string and std::nullopt the same as empty string seems to be ++ // implicitly used that way ++ if (group_name.has_value() && group_name != "") { + group_name_ = *group_name; + } else { + if (!block->default_group_name.has_value()) { diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_fix-cuda-12.6-driver-api-usage.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_fix-cuda-12.6-driver-api-usage.patch new file mode 100644 index 00000000000..7b99e0cce32 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_fix-cuda-12.6-driver-api-usage.patch @@ -0,0 +1,133 @@ +Avoid "RuntimeError: CUDA driver error: operation not supported" due to use of CUDA 12.6 in e.g. 
+> python distributed/test_symmetric_memory.py SymmMemSingleProcTest.test_stream_write_value32 + +Backport of https://github.com/pytorch/pytorch/commit/cf90c9f8d1632777ec5f4b6ccaa14bc5bf259e9c + +Author: Alexander Grund (TU Dresden) + +diff --git a/c10/cuda/driver_api.cpp b/c10/cuda/driver_api.cpp +index bb201b5c039..56e3ffb02ab 100644 +--- a/c10/cuda/driver_api.cpp ++++ b/c10/cuda/driver_api.cpp +@@ -2,6 +2,7 @@ + #include + #include + #include ++#include + #include + + namespace c10::cuda { +@@ -9,20 +10,13 @@ namespace c10::cuda { + namespace { + + DriverAPI create_driver_api() { +- void* handle_0 = dlopen("libcuda.so.1", RTLD_LAZY | RTLD_NOLOAD); +- TORCH_CHECK(handle_0, "Can't open libcuda.so.1: ", dlerror()); + void* handle_1 = DriverAPI::get_nvml_handle(); + DriverAPI r{}; + +-#define LOOKUP_LIBCUDA_ENTRY(name) \ +- r.name##_ = ((decltype(&name))dlsym(handle_0, #name)); \ +- TORCH_INTERNAL_ASSERT(r.name##_, "Can't find ", #name, ": ", dlerror()) ++#define LOOKUP_LIBCUDA_ENTRY(name) \ ++ r.name##_ = reinterpret_cast(get_symbol(#name)); \ ++ TORCH_INTERNAL_ASSERT(r.name##_, "Can't find ", #name) + C10_LIBCUDA_DRIVER_API(LOOKUP_LIBCUDA_ENTRY) +-#undef LOOKUP_LIBCUDA_ENTRY +- +-#define LOOKUP_LIBCUDA_ENTRY(name) \ +- r.name##_ = ((decltype(&name))dlsym(handle_0, #name)); \ +- dlerror(); + C10_LIBCUDA_DRIVER_API_12030(LOOKUP_LIBCUDA_ENTRY) + #undef LOOKUP_LIBCUDA_ENTRY + +@@ -47,6 +41,52 @@ C10_EXPORT DriverAPI* DriverAPI::get() { + return &singleton; + } + ++typedef cudaError_t (*VersionedGetEntryPoint)( ++ const char*, ++ void**, ++ unsigned int, ++ unsigned long long, // NOLINT(*) ++ cudaDriverEntryPointQueryResult*); ++typedef cudaError_t (*GetEntryPoint)( ++ const char*, ++ void**, ++ unsigned long long, // NOLINT(*) ++ cudaDriverEntryPointQueryResult*); ++ ++void* get_symbol(const char* symbol, int cuda_version) { ++ // We link to the libcudart.so already, so can search for it in the current ++ // context ++ static GetEntryPoint driver_entrypoint_fun = 
reinterpret_cast( ++ dlsym(RTLD_DEFAULT, "cudaGetDriverEntryPoint")); ++ static VersionedGetEntryPoint driver_entrypoint_versioned_fun = ++ reinterpret_cast( ++ dlsym(RTLD_DEFAULT, "cudaGetDriverEntryPointByVersion")); ++ ++ cudaDriverEntryPointQueryResult driver_result{}; ++ void* entry_point = nullptr; ++ if (driver_entrypoint_versioned_fun != nullptr) { ++ // Found versioned entrypoint function ++ cudaError_t result = driver_entrypoint_versioned_fun( ++ symbol, &entry_point, cuda_version, cudaEnableDefault, &driver_result); ++ TORCH_CHECK( ++ result == cudaSuccess, ++ "Error calling cudaGetDriverEntryPointByVersion"); ++ } else { ++ TORCH_CHECK( ++ driver_entrypoint_fun != nullptr, ++ "Error finding the CUDA Runtime-Driver interop."); ++ // Versioned entrypoint function not found ++ cudaError_t result = driver_entrypoint_fun( ++ symbol, &entry_point, cudaEnableDefault, &driver_result); ++ TORCH_CHECK(result == cudaSuccess, "Error calling cudaGetDriverEntryPoint"); ++ } ++ TORCH_CHECK( ++ driver_result == cudaDriverEntryPointSuccess, ++ "Could not find CUDA driver entry point for ", ++ symbol); ++ return entry_point; ++} ++ + } // namespace c10::cuda + + #endif +diff --git a/c10/cuda/driver_api.h b/c10/cuda/driver_api.h +index 65cbdfe878d..1a1f0108e69 100644 +--- a/c10/cuda/driver_api.h ++++ b/c10/cuda/driver_api.h +@@ -3,6 +3,12 @@ + #define NVML_NO_UNVERSIONED_FUNC_DEFS + #include + ++#if defined(CUDA_VERSION) ++#define DEFAULT_CUDA_VERSION CUDA_VERSION ++#else ++#define DEFAULT_CUDA_VERSION 11080 ++#endif ++ + #define C10_CUDA_DRIVER_CHECK(EXPR) \ + do { \ + CUresult __err = EXPR; \ +@@ -62,4 +68,7 @@ struct DriverAPI { + static void* get_nvml_handle(); + }; + ++/*! 
\brief Get pointer corresponding to symbol in CUDA driver library */ ++void* get_symbol(const char* symbol, int cuda_version = DEFAULT_CUDA_VERSION); ++ + } // namespace c10::cuda +diff --git a/torch/csrc/distributed/c10d/CUDASymmetricMemoryOps.cu b/torch/csrc/distributed/c10d/CUDASymmetricMemoryOps.cu +index 438624f4bc0..992e415db1b 100644 +--- a/torch/csrc/distributed/c10d/CUDASymmetricMemoryOps.cu ++++ b/torch/csrc/distributed/c10d/CUDASymmetricMemoryOps.cu +@@ -6,6 +6,7 @@ + + #if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED) + #include ++#include + #endif + + #if defined(CUDART_VERSION) && CUDART_VERSION >= 12030 diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_fix-nccl-test-env.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_fix-nccl-test-env.patch new file mode 100644 index 00000000000..92f5af4fecb --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_fix-nccl-test-env.patch @@ -0,0 +1,44 @@ +Avoid failures like +> Error : no algorithm/protocol available for function Broadcast with datatype ncclInt8. NCCL_ALGO was set to NVLS. + +See https://github.com/pytorch/pytorch/pull/163063 + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py +index f5a35c98d13..734afc17c69 100644 +--- a/test/distributed/test_c10d_nccl.py ++++ b/test/distributed/test_c10d_nccl.py +@@ -3163,19 +3163,24 @@ class NcclErrorHandlingTest(MultiProcessTestCase): + class NcclUserBufferRegistrationTest(MultiProcessTestCase): + def setUp(self): + super().setUp() +- # TORCH_NCCL_BLOCKING_WAIT overrides TORCH_NCCL_ASYNC_ERROR_HANDLING hence tests +- # that use TORCH_NCCL_BLOCKING_WAIT will test it as expected. 
+- os.environ["TORCH_NCCL_ASYNC_ERROR_HANDLING"] = "1" + nccl_debug_file = tempfile.NamedTemporaryFile() +- os.environ["NCCL_ALGO"] = "NVLS" +- os.environ["NCCL_DEBUG"] = "INFO" +- os.environ["NCCL_DEBUG_SUBSYS"] = "NVLS" ++ nccl_env = { ++ # TORCH_NCCL_BLOCKING_WAIT overrides TORCH_NCCL_ASYNC_ERROR_HANDLING hence tests ++ # that use TORCH_NCCL_BLOCKING_WAIT will test it as expected. ++ "TORCH_NCCL_ASYNC_ERROR_HANDLING": "1", ++ "NCCL_ALGO": "NVLS", ++ "NCCL_DEBUG": "INFO", ++ "NCCL_DEBUG_SUBSYS": "NVLS", ++ "NCCL_DEBUG_FILE": nccl_debug_file.name, ++ } + if torch.cuda.nccl.version() >= (2, 24, 3): +- os.environ["NCCL_DEBUG_SUBSYS"] = "REG" +- os.environ["NCCL_DEBUG_FILE"] = nccl_debug_file.name ++ nccl_env["NCCL_DEBUG_SUBSYS"] = "REG" ++ self.env_patcher = mock.patch.dict(os.environ, nccl_env) ++ self.env_patcher.start() + self._spawn_processes() + + def tearDown(self): ++ self.env_patcher.stop() + super().tearDown() + try: + os.remove(self.file_name) diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_fix-test_ck_blas_library_cpu.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_fix-test_ck_blas_library_cpu.patch new file mode 100644 index 00000000000..96a8cceaaae --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_fix-test_ck_blas_library_cpu.patch @@ -0,0 +1,58 @@ +From 8cdb9adc05d77bb1d65fc233b780860b893c8d17 Mon Sep 17 00:00:00 2001 +From: Yichen Yan +Date: Mon, 17 Mar 2025 17:45:45 +0000 +Subject: [PATCH] do not run `test_ck_blas_library` on cpu (#148316) + +Fix on non-rocm: + +``` +root@e01-tw-ue5g2g3sap6:~/pytorch/test# python test_linalg.py TestLinalgCPU.test_ck_blas_library_cpu +E +====================================================================== +ERROR: test_ck_blas_library_cpu (__main__.TestLinalgCPU) +---------------------------------------------------------------------- +Traceback (most recent call last): + File "/root/pytorch/torch/testing/_internal/common_utils.py", line 3108, in wrapper + method(*args, **kwargs) + 
File "/root/pytorch/torch/testing/_internal/common_device_type.py", line 480, in instantiated_test + raise rte + File "/root/pytorch/torch/testing/_internal/common_device_type.py", line 460, in instantiated_test + result = test(self, **param_kwargs) + File "/root/pytorch/torch/testing/_internal/common_device_type.py", line 1242, in dep_fn + return fn(slf, *args, **kwargs) + File "/root/pytorch/torch/testing/_internal/common_utils.py", line 1981, in _fn + fn(*args, **kwargs) + File "/root/pytorch/test/test_linalg.py", line 8621, in test_ck_blas_library + torch.backends.cuda.preferred_blas_library('ck') + File "/root/pytorch/torch/backends/cuda/__init__.py", line 258, in preferred_blas_library + torch._C._set_blas_preferred_backend(_BlasBackends[backend]) +RuntimeError: Cannot set preferred backend to Ck if PyTorch has not been compiled for ROCm. + +To execute this test, run the following from the base repo dir: + python test/test_linalg.py TestLinalgCPU.test_ck_blas_library_cpu + +This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 + +---------------------------------------------------------------------- +Ran 1 test in 0.346s + +FAILED (errors=1) +``` +Pull Request resolved: https://github.com/pytorch/pytorch/pull/148316 +Approved by: https://github.com/jeffdaily +--- + test/test_linalg.py | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/test/test_linalg.py b/test/test_linalg.py +index caa25cb13dde..0cece1a1e1a6 100644 +--- a/test/test_linalg.py ++++ b/test/test_linalg.py +@@ -9127,6 +9127,7 @@ def test_preferred_blas_library(self): + self.assertEqual(out1, out2) + self.assertEqual(out_ref, out2.cpu()) + ++ @onlyCUDA + @skipCUDAIfNotRocm + @unittest.skipIf(not blaslt_supported_device(), "blasLt not supported on current device") + @setBlasBackendsToDefaultFinally diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_fix-test_fsdp_ep.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_fix-test_fsdp_ep.patch new file mode 100644 
index 00000000000..4f3461fd289 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_fix-test_fsdp_ep.patch @@ -0,0 +1,38 @@ +From 9e07673deb212c87b1c6fea23799a97474c476ed Mon Sep 17 00:00:00 2001 +From: Kanya-Mo <167922169+Kanya-Mo@users.noreply.github.com> +Date: Fri, 8 Aug 2025 22:36:42 +0000 +Subject: [PATCH] Fix test_fsdp_ep.py due to _MeshEnv API change (#158695) + +#132339 changed parent/child mesh related APIs from _MeshEnv. UT TestFSDPWithEP.test_e2e still uses old APIs and will fail: +``` +File "/home/kanya/pytorch/test/distributed/checkpoint/e2e/test_fsdp_ep.py", line 77, in test_e2e + mesh_fsdp_ep = _mesh_resources.create_child_mesh(mesh_fsdp_tp, ("dp",)) +AttributeError: '_MeshEnv' object has no attribute 'create_child_mesh' + +To execute this test, run the following from the base repo dir: + python test/distributed/checkpoint/e2e/test_fsdp_ep.py TestFSDPWithEP.test_e2e + +This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0. Did you mean: 'create_sub_mesh'? +``` + +Pull Request resolved: https://github.com/pytorch/pytorch/pull/158695 +Approved by: https://github.com/Skylion007, https://github.com/nWEIdia +--- + test/distributed/checkpoint/e2e/test_fsdp_ep.py | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/test/distributed/checkpoint/e2e/test_fsdp_ep.py b/test/distributed/checkpoint/e2e/test_fsdp_ep.py +index 7489317035b9..51d4b3e99537 100644 +--- a/test/distributed/checkpoint/e2e/test_fsdp_ep.py ++++ b/test/distributed/checkpoint/e2e/test_fsdp_ep.py +@@ -73,8 +73,8 @@ def test_e2e(self): + self.device_type, (2, 4), mesh_dim_names=("dp", "tp") + ) + # TODO: we are using an internal API atm. Change to a publich API once it is ready. 
+- mesh_fsdp_ep = _mesh_resources.create_child_mesh(mesh_fsdp_tp, ("dp",)) +- del _mesh_resources.child_to_parent_mapping[mesh_fsdp_ep] ++ mesh_fsdp_ep = _mesh_resources.create_sub_mesh(mesh_fsdp_tp, ("dp",), [(0,)]) ++ del _mesh_resources.child_to_root_mapping[mesh_fsdp_ep] + + mesh_fsdp = init_device_mesh(self.device_type, (8,)) + for i, l in enumerate(model.second.ep_layers): diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_fix-test_ir_count.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_fix-test_ir_count.patch new file mode 100644 index 00000000000..b595cc05902 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_fix-test_ir_count.patch @@ -0,0 +1,26 @@ +From acd0873d3b3378420fd81dbf68b31f503219e524 Mon Sep 17 00:00:00 2001 +From: Nikita Shulga +Date: Fri, 23 May 2025 13:04:47 -0700 +Subject: [PATCH] [CI] Fix `TestDynamoTimed.test_ir_count` for 3.12 (#154268) + +Python-3.12 emits the same bytecode as 3.13 for code in question +Pull Request resolved: https://github.com/pytorch/pytorch/pull/154268 +Approved by: https://github.com/clee2000, https://github.com/atalman +ghstack dependencies: #154237 +--- + test/dynamo/test_utils.py | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/test/dynamo/test_utils.py b/test/dynamo/test_utils.py +index 595e9dc02fd3..b20713d9ecf4 100644 +--- a/test/dynamo/test_utils.py ++++ b/test/dynamo/test_utils.py +@@ -481,7 +481,7 @@ def test_ir_count(self): + (3, 9): (10, 6), + (3, 10): (10, 6), + (3, 11): (10, 6), +- (3, 12): (10, 6), ++ (3, 12): (11, 7), + (3, 13): (11, 7), + }[version] + diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_fix-test_torchinductor_dynamic-tests.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_fix-test_torchinductor_dynamic-tests.patch new file mode 100644 index 00000000000..2733bad2729 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_fix-test_torchinductor_dynamic-tests.patch @@ -0,0 +1,93 @@ +Fix failures in 
test_torchinductor_dynamic_shapes which disappear when running the test individually. +> RuntimeError: Tried to register an operator (test::foo(Tensor x) -> (Tensor, Tensor)) with the same name and overload name multiple times. Each overload's schema should only be registered with a single call to def(). Duplicate registration: registered at /dev/null:203. Original registration: registered at /dev/null:488 + +Last one doesn't only fail on ROCM with: +> AssertionError: expected to fail, but actually passed + + +See https://github.com/pytorch/pytorch/issues/154216 + +Author: Alexander Grund (TU Dresden) + +--- a/test/inductor/test_torchinductor_dynamic_shapes.py ++++ b/test/inductor/test_torchinductor_dynamic_shapes.py +@@ -367,7 +367,9 @@ class TestInductorDynamic(TestCase): + @torch._dynamo.config.patch(capture_scalar_outputs=True) + @torch._inductor.config.patch(implicit_fallbacks=True) + def test_item_to_inputs_kernel_nobreak(self, device): +- @torch.library.custom_op("test::foo", mutates_args=()) ++ @torch.library.custom_op( ++ "test_item_to_inputs_kernel_nobreak::foo", mutates_args=() ++ ) + def foo(x: torch.Tensor, y: int) -> torch.Tensor: + return x.clone() + +@@ -378,7 +380,7 @@ class TestInductorDynamic(TestCase): + @torch.compile(fullgraph=True) + def f(x, r): + y = x.item() +- return torch.ops.test.foo(r, y) ++ return torch.ops.test_item_to_inputs_kernel_nobreak.foo(r, y) + + f(torch.tensor([3], device=device), torch.randn(10, device=device)) + +@@ -440,11 +442,13 @@ class TestInductorDynamic(TestCase): + ) + @torch._inductor.config.patch(implicit_fallbacks=True) + def test_unbacked_save_for_backwards(self, device) -> None: +- @torch.library.custom_op("_test::_cat", mutates_args=()) ++ @torch.library.custom_op( ++ "test_unbacked_save_for_backwards::_cat", mutates_args=() ++ ) + def _cat(t: torch.Tensor, ds: list[int]) -> torch.Tensor: + return t * t.new_ones([sum(ds)]) + +- @torch.library.register_fake("_test::_cat") ++ 
@torch.library.register_fake("test_unbacked_save_for_backwards::_cat") + def _cat_fake(t: torch.Tensor, ds: list[int]) -> torch.Tensor: + [torch._check_is_size(d) for d in ds] + return t.new_empty([sum(ds)]) +@@ -456,13 +460,13 @@ class TestInductorDynamic(TestCase): + return grad.sum(), None + + torch.library.register_autograd( +- "_test::_cat", ++ "test_unbacked_save_for_backwards::_cat", + _cat_backward, + setup_context=_cat_setup_context, + ) + + def fn(t, sizes): +- r = torch.ops._test._cat(t, sizes.tolist()) ++ r = torch.ops.test_unbacked_save_for_backwards._cat(t, sizes.tolist()) + return r * t + + t = torch.randn((), requires_grad=True, device=device) +@@ -476,6 +476,7 @@ class TestInductorDynamic(TestCase): + ).sum().backward() + self.assertEqual(t.grad, expect) + ++ @unittest.skip("Fails on CPU") + @torch._dynamo.config.patch(capture_scalar_outputs=True) + def test_unbacked_reduction(self, device): + expect_fail = ( +@@ -591,7 +595,9 @@ class TestInductorDynamic(TestCase): + ) + @torch._inductor.config.patch(implicit_fallbacks=True) + def test_multi_output_unbacked_custom_op(self, device): +- @torch.library.custom_op("test::foo", mutates_args=()) ++ @torch.library.custom_op( ++ "test_multi_output_unbacked_custom_op::foo", mutates_args=() ++ ) + def foo(x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + return torch.empty(2, device=x.device), torch.empty(3, device=x.device) + +@@ -603,7 +609,7 @@ class TestInductorDynamic(TestCase): + + @torch.compile(fullgraph=True) + def f(x): +- a, b = torch.ops.test.foo(x) ++ a, b = torch.ops.test_multi_output_unbacked_custom_op.foo(x) + return a.sum() + b.sum() + + f(torch.tensor([3], device=device)) diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_increase-tolerance-for-sum-reduction-test.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_increase-tolerance-for-sum-reduction-test.patch new file mode 100644 index 00000000000..431c34aadb7 --- /dev/null +++ 
b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_increase-tolerance-for-sum-reduction-test.patch @@ -0,0 +1,53 @@ +Fix accuracy issues with at least A100 GPUs for sum reduction tests +See https://github.com/pytorch/pytorch/issues/164249 + +> FAIL [1.381s]: test_reduction_fns_name_sum_float16 (__main__.CooperativeReductionTests.test_reduction_fns_name_sum_float16) +> Greatest absolute difference: 0.125 at index (0,) (up to 1e-05 allowed) +> Greatest relative difference: 0.0017375946044921875 at index (0,) (up to 0.001 allowed) + +FAIL [1.290s]: test_reduction_fns_name_sum_float32 (__main__.CooperativeReductionTests.test_reduction_fns_name_sum_float32) +> Greatest absolute difference: 0.000213623046875 at index (0,) (up to 1e-05 allowed) +> Greatest relative difference: 2.9593741146527464e-06 at index (0,) (up to 1.3e-06 allowed) + + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/inductor/test_cooperative_reductions.py b/test/inductor/test_cooperative_reductions.py +index 469ceec2e1b..07adc0e7e7e 100644 +--- a/test/inductor/test_cooperative_reductions.py ++++ b/test/inductor/test_cooperative_reductions.py +@@ -57,11 +57,11 @@ class CooperativeReductionTests(TestCase): + torch._inductor.metrics.generated_kernel_count = 0 + torch._dynamo.reset() + +- def run_and_check(self, fn, args, *, expect_kernel_count=1): ++ def run_and_check(self, fn, args, *, expect_kernel_count=1, atol=None, rtol=None): + expected = fn(*args) + fn = torch.compile(fn, fullgraph=True) + result, (source_code,) = run_and_get_code(fn, *args) +- self.assertEqual(result, expected) ++ self.assertEqual(result, expected, atol=atol, rtol=rtol) + if "@triton_heuristics.fixed_config" in source_code: + self.assertIn("cooperative_reduction_grid", source_code) + else: +@@ -91,13 +91,19 @@ class CooperativeReductionTests(TestCase): + def test_reduction_fns(self, name, dtype): + if IS_SM89 and dtype == torch.float64 and name in ["std", "var_mean"]: + raise unittest.SkipTest("Timeouts on SM89") ++ if 
name == "sum" and dtype == torch.float16: ++ tol_args = {"atol": 0.125, "rtol": 1.8e-3} ++ elif name == "sum" and dtype == torch.float32: ++ tol_args = {"atol": 2.2e-4, "rtol": 3e-6} ++ else: ++ tol_args = {} + + def fn(x, y): + return reduction_fn(x + y, dim=-1) + + reduction_fn = getattr(torch, name) + args = [torch.randn(1, 1024**2, device="cuda", dtype=dtype) for _ in range(2)] +- self.run_and_check(fn, args) ++ self.run_and_check(fn, args, **tol_args) + + def test_bool_reduction_fns(self): + def fn(x, y): diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_increase-tolerance-for-test_freeze_conv_relu_fusion.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_increase-tolerance-for-test_freeze_conv_relu_fusion.patch new file mode 100644 index 00000000000..aee2499875f --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_increase-tolerance-for-test_freeze_conv_relu_fusion.patch @@ -0,0 +1,23 @@ +Avoid failures in test_jit, test_jit_legacy, test_jit_profiling + +> Mismatched elements: 7 / 30 (23.3%) +> Greatest absolute difference: 3.053247928619385e-05 at index (1, 1, 0, 0, 0) (up to 1e-05 allowed) +> Greatest relative difference: 0.0004548609140329063 at index (3, 1, 0, 0, 0) (up to 1.3e-06 allowed) + +See https://github.com/pytorch/pytorch/issues/164249 + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/jit/test_freezing.py b/test/jit/test_freezing.py +index 7da41f0cc71..e1b03524743 100644 +--- a/test/jit/test_freezing.py ++++ b/test/jit/test_freezing.py +@@ -3032,7 +3032,7 @@ class TestFrozenOptimizations(JitTestCase): + frozen_mod.graph + ) + +- self.assertEqual(mod_eager(inp), frozen_mod(inp)) ++ self.assertEqual(mod_eager(inp), frozen_mod(inp), atol=3.1e-5, rtol=5e-4) + + @unittest.skipIf(not (TEST_CUDNN or TEST_WITH_ROCM), "requires CUDNN") + def test_freeze_conv_relu_fusion_not_forward(self): diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_init-cutlass-include-dirs.patch 
b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_init-cutlass-include-dirs.patch new file mode 100644 index 00000000000..09482734247 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_init-cutlass-include-dirs.patch @@ -0,0 +1,24 @@ +Init cutlass when getting include directories. +Otherwise config.cuda.cutlass_dir will be empty. + +Author: Alexander Grund (TU Dresden) + +diff --git a/torch/_inductor/codecache.py b/torch/_inductor/codecache.py +index dc9f5c25365..17895251eb5 100644 +--- a/torch/_inductor/codecache.py ++++ b/torch/_inductor/codecache.py +@@ -2824,10 +2824,14 @@ def _cuda_compiler() -> Optional[str]: + + + def _cutlass_include_paths() -> list[str]: ++ from torch._inductor.codegen.cuda.cutlass_utils import try_import_cutlass + if config.is_fbcode(): + from libfb.py import parutil + + cutlass_path = parutil.get_dir_path("cutlass-3-headers") ++ elif not try_import_cutlass(): ++ log.warning("CUTLASS not available, not adding include paths") ++ return [] + else: + cutlass_path = config.cuda.cutlass_dir + return [ diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_keep-CMAKE_PREFIX_PATH-in-test.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_keep-CMAKE_PREFIX_PATH-in-test.patch new file mode 100644 index 00000000000..2234c5d9e76 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_keep-CMAKE_PREFIX_PATH-in-test.patch @@ -0,0 +1,27 @@ +> pytorch-v2.7.1/test/inductor/test_aot_inductor_package.py", line 242, in test_compile_after_package +> self.assertTrue(so_path.exists()) +> AssertionError: False is not true + +Caused by: +``` +/software/binutils/2.42-GCCcore-13.3.0/bin/ld: cannot find -labsl::utility: No such file or directory +/software/binutils/2.42-GCCcore-13.3.0/bin/ld: cannot find -labsl::variant: No such file or directory +collect2: error: ld returned 1 exit status +``` +See https://github.com/pytorch/pytorch/pull/161907 + +Author: Alexander Grund (TU Dresden) + +diff --git 
a/test/inductor/test_aot_inductor_package.py b/test/inductor/test_aot_inductor_package.py +index 28e01a40e9d..f281f2de938 100644 +--- a/test/inductor/test_aot_inductor_package.py ++++ b/test/inductor/test_aot_inductor_package.py +@@ -229,7 +229,7 @@ class TestAOTInductorPackage(TestCase): + # Create a build directory to run cmake + build_path.mkdir() + custom_env = os.environ.copy() +- custom_env["CMAKE_PREFIX_PATH"] = str(Path(torch.__file__).parent) ++ custom_env["CMAKE_PREFIX_PATH"] = ":".join([str(Path(torch.__file__).parent)] + os.environ.get("CMAKE_PREFIX_PATH", "").split(":")) + subprocess.run( + ["cmake", ".."], + cwd=build_path, diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_remove-faulty-close.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_remove-faulty-close.patch new file mode 100644 index 00000000000..c0a89a46d9a --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_remove-faulty-close.patch @@ -0,0 +1,29 @@ +Avoid closing random file handles in Inductor + +The `close` call closes random file handles. +In some tests this seems to close "fd=1", i.e. stdout. +Sebsequent writes/print then fails with +> OSError: [Errno 9] Bad file descriptor + +See https://github.com/pytorch/pytorch/pull/169065 + +Author: Alexander Grund (TU Dresden) + +diff --git a/torch/_inductor/autotune_process.py b/torch/_inductor/autotune_process.py +--- a/torch/_inductor/autotune_process.py ++++ b/torch/_inductor/autotune_process.py +@@ -926,14 +926,6 @@ class CppBenchmarkRequest(CPUDeviceBenchmarkMixin, BenchmarkRequest): + *self.extra_args, + ) + +- def cleanup_run_fn(self) -> None: +- if self.DLL is not None: +- """ +- Check close attr due to it crash on Windows. 
+- """ +- if hasattr(self.DLL, "close"): +- self.DLL.close() +- + def __str__(self) -> str: + return f"{self.kernel_name=}" + diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_remove-test_close_multi_pg_unordered.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_remove-test_close_multi_pg_unordered.patch new file mode 100644 index 00000000000..a664ff7d4ce --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_remove-test_close_multi_pg_unordered.patch @@ -0,0 +1,47 @@ +Remove a test that fails upstream too. + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py +index f5a35c98d13..c8db144b234 100644 +--- a/test/distributed/test_c10d_nccl.py ++++ b/test/distributed/test_c10d_nccl.py +@@ -758,38 +758,6 @@ class ProcessGroupNCCLGroupTest(MultiProcessTestCase): + with self.assertRaises(dist.DistBackendError): + pg.allreduce([t]) + +- @requires_nccl() +- @skip_but_pass_in_sandcastle_if( +- torch.cuda.device_count() < 2, "NCCL test requires 2+ GPUs" +- ) +- def test_close_multi_pg_unordered(self): +- store = c10d.FileStore(self.file_name, self.world_size) +- pg = self._create_process_group_nccl(store, self.opts()) +- device = self.rank_to_GPU[self.rank][0] +- t = torch.rand(10, 10, device=device) +- # First allreduce to initialize default PG's communicator. 
+- pg.allreduce(t).wait() +- new_pg1 = c10d.new_group([0, 1]) +- new_pg2 = c10d.new_group([0, 1]) +- if self.rank == 0 or self.rank == 1: +- t1 = torch.rand(10, 10, device=device) +- t2 = torch.rand(10, 10, device=device) +- new_pg1.allreduce(t1).wait() +- new_pg2.allreduce(t2).wait() +- if self.rank == 0: +- dist.destroy_process_group(new_pg2) +- # force destruction of pg2 first +- del new_pg2 +- dist.destroy_process_group(new_pg1) +- del new_pg1 +- if self.rank == 1: +- c10d.destroy_process_group(new_pg1) +- # force destruction of pg1 first +- del new_pg1 +- dist.destroy_process_group(new_pg2) +- del new_pg2 +- dist.destroy_process_group() +- + @requires_nccl() + @skip_but_pass_in_sandcastle_if( + torch.cuda.device_count() < 2, "NCCL test requires 2+ GPUs" diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_serialize-test_host_memory_stats.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_serialize-test_host_memory_stats.patch new file mode 100644 index 00000000000..259033e0d64 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_serialize-test_host_memory_stats.patch @@ -0,0 +1,30 @@ +Fix test_cuda.py TestCuda.test_host_memory_stats +> AssertionError: Scalars are not equal! +> Expected 50333384 but got 0. 
+ + +From 7abca8cebac9e399151af771233ee2f5d202c5e6 Mon Sep 17 00:00:00 2001 +From: eqy +Date: Thu, 1 May 2025 00:53:15 +0000 +Subject: [PATCH] Decorate `test_host_memory_stats` with `@serialTest` + (#152454) + +Seems to need it as it is expecting only its allocation behavior to be visible, to address #152422 +Pull Request resolved: https://github.com/pytorch/pytorch/pull/152454 +Approved by: https://github.com/Skylion007 +--- + test/test_cuda.py | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/test/test_cuda.py b/test/test_cuda.py +index 93a10072d832..c74f099358f3 100644 +--- a/test/test_cuda.py ++++ b/test/test_cuda.py +@@ -165,6 +165,7 @@ def test_pinned_memory_with_cudaregister_multithread(self): + for thread in threads: + thread.join() + ++ @serialTest + def test_host_memory_stats(self): + # Helper functions + def empty_stats(): diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-NCCL-tests-without-GPUs.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-NCCL-tests-without-GPUs.patch new file mode 100644 index 00000000000..f658ecc7b89 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-NCCL-tests-without-GPUs.patch @@ -0,0 +1,34 @@ +Some tests that require NCCL also use GPUs. Skip those tests when none are available. 
+ +Author: Alexander Grund (TU Dresden) + +diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py +index 2a8fc04265c..f62678656d0 100644 +--- a/torch/testing/_internal/common_distributed.py ++++ b/torch/testing/_internal/common_distributed.py +@@ -43,6 +43,7 @@ from torch.testing._internal.common_utils import ( + TEST_WITH_TSAN, + TestCase, + run_tests, ++ TEST_CUDA, + TEST_HPU, + TEST_XPU, + ) +@@ -327,6 +328,8 @@ def requires_gloo(): + + + def requires_nccl_version(version, msg): ++ if not TEST_CUDA: ++ return skip_but_pass_in_sandcastle(TEST_SKIPS["no_cuda"].message) + if not c10d.is_nccl_available(): + return skip_but_pass_in_sandcastle( + "c10d was not compiled with the NCCL backend", +@@ -339,6 +342,8 @@ def requires_nccl_version(version, msg): + + + def requires_nccl(): ++ if not TEST_CUDA: ++ return skip_but_pass_in_sandcastle(TEST_SKIPS["no_cuda"].message) + return skip_but_pass_in_sandcastle_if( + not c10d.is_nccl_available(), + "c10d was not compiled with the NCCL backend", diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-TestFP8Lowering.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-TestFP8Lowering.patch new file mode 100644 index 00000000000..b13c657caf4 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-TestFP8Lowering.patch @@ -0,0 +1,20 @@ +The test fails also with the official PyPI package: +> torch/_inductor/select_algorithm.py:1869] [0/0] AssertionError: Input shapes should have M >= 16, N >= 16 and K >= 32 +> ... 
+> torch._inductor.exc.InductorError: CompilationError: at 56:18: + + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/inductor/test_fp8.py b/test/inductor/test_fp8.py +--- a/test/inductor/test_fp8.py ++++ b/test/inductor/test_fp8.py +@@ -411,6 +411,7 @@ class TestFP8Types(TestCase): + ) + + ++@unittest.skip("Fails on H100s") + @instantiate_parametrized_tests + class TestFP8Lowering(TestCase): + @unittest.skipIf(TEST_WITH_ROCM, "FP8 is not supported on ROCM") +a \ No newline at end of file diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-failing-max_autotune-tests.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-failing-max_autotune-tests.patch new file mode 100644 index 00000000000..d17ceaa6c71 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-failing-max_autotune-tests.patch @@ -0,0 +1,60 @@ +Skip ~17 testcases in inductor/test_max_autotune.py which fail on H100 GPUs. +See https://github.com/pytorch/pytorch/issues/160305 + +> torch._inductor.exc.InductorError: TypeError: only integer tensors of a single element can be converted to an index +OR +> Mismatched elements: 41585 / 41664 (99.8%) +> Greatest absolute difference: 155.375 at index (9, 206) (up to 0.01 allowed) +> Greatest relative difference: 1913.0 at index (42, 58) (up to 0.01 allowed) + +Tests are generated and names look like +> test_max_autotune_addmm_persistent_tma_a_transposed_False_b_transposed_False_dynamic_True +> test_max_autotune_addmm_persistent_tma_a_transposed_False_b_transposed_True_dynamic_True + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/inductor/test_max_autotune.py b/test/inductor/test_max_autotune.py +index 741353fdbf5..49656bd2062 100644 +--- a/test/inductor/test_max_autotune.py ++++ b/test/inductor/test_max_autotune.py +@@ -26,7 +26,7 @@ from torch._inductor.select_algorithm import ( + AlgorithmSelectorCache, + TritonTemplateCaller, + ) +-from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_FP8 
++from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_FP8, SM90OrLater + from torch.testing._internal.common_utils import ( + instantiate_parametrized_tests, + IS_WINDOWS, +@@ -221,6 +221,7 @@ class TestMaxAutotune(TestCase): + with config.patch({"max_autotune": True, "autotune_in_subproc": True}): + torch.compile(mm, dynamic=dynamic)(a, b) + ++ @unittest.skipIf(SM90OrLater, "Fails on H100+") + @unittest.skipIf( + not has_triton_tma_device(), "Need device-side TMA support in Triton" + ) +@@ -394,6 +395,7 @@ class TestMaxAutotune(TestCase): + Y = addmm(x, a, b) + torch.testing.assert_close(Y_compiled, Y, atol=1e-2, rtol=1e-2) + ++ @unittest.skipIf(SM90OrLater, "Fails on H100+") + @unittest.skipIf( + not has_triton_tma_device(), "Need device-side TMA support in Triton" + ) +@@ -999,6 +1001,7 @@ class TestMaxAutotune(TestCase): + act = f(x, y) + torch.testing.assert_close(act, ref, atol=2e-2, rtol=1e-2) + ++ @unittest.skipIf(SM90OrLater, "Fails on H100+") + def test_non_contiguous_input_addmm(self): + b = torch.randn((768), dtype=torch.bfloat16, device=GPU_TYPE) + x = rand_strided( +@@ -1372,6 +1375,7 @@ class TestPrologueFusion(TestCase): + .run(code[0]) + ) + ++ @unittest.skip("Fails in various setups, see issue 154228") + @unittest.skipIf(TEST_WITH_ROCM, "FP8 is not supported on ROCM") + @unittest.skipIf( + not PLATFORM_SUPPORTS_FP8, diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-failing-schedule-test.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-failing-schedule-test.patch new file mode 100644 index 00000000000..be51dea09cc --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-failing-schedule-test.patch @@ -0,0 +1,21 @@ + +test_schedule_with_native_zero_bubble_ScheduleClass0 fails upstream: +https://github.com/pytorch/pytorch/issues/156088 + +Disable just this single test + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/distributed/pipelining/test_schedule_multiproc.py 
b/test/distributed/pipelining/test_schedule_multiproc.py +index 8491881f7fe..cbe63df2a77 100644 +--- a/test/distributed/pipelining/test_schedule_multiproc.py ++++ b/test/distributed/pipelining/test_schedule_multiproc.py +@@ -519,7 +519,7 @@ class ScheduleTest(MultiProcContinousTest): + + @requires_nccl() + @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs") +- @parametrize("ScheduleClass", [ScheduleWithW, ScheduleInterleavedZeroBubble]) ++ @parametrize("ScheduleClass", [ScheduleInterleavedZeroBubble]) + def test_schedule_with_native_zero_bubble(self, ScheduleClass): + print(ScheduleClass) + if ScheduleClass is ScheduleInterleavedZeroBubble: diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_benchmark_on_non_zero_device.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_benchmark_on_non_zero_device.patch new file mode 100644 index 00000000000..74029fdabc1 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_benchmark_on_non_zero_device.patch @@ -0,0 +1,21 @@ +inductor/test_benchmark_fusion.py BenchmarkingTest.test_benchmark_on_non_zero_device fails with +> self.assertTrue(hit_count > 0) +> AssertionError: False is not true + +Related: https://github.com/pytorch/pytorch/issues/160514 + +Author: Alexander Grund (TU Dresden) + +--- a/test/inductor/test_benchmark_fusion.py ++++ b/test/inductor/test_benchmark_fusion.py +@@ -190,9 +190,7 @@ if HAS_CUDA: + copy_tests(BenchmarkFusionTestTemplate, BenchmarkFusionCudaTest, "cuda") + + class BenchmarkingTest(TestCase): +- @unittest.skipIf( +- torch.cuda.device_count() < 2, "The test need at least 2 devices" +- ) ++ @unittest.skip("Mocking fails") + def test_benchmark_on_non_zero_device(self): + hit_count = 0 + with torch.cuda.device("cuda:0"): diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_data_parallel_rnn.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_data_parallel_rnn.patch new file mode 100644 index 
00000000000..5b81095e931 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_data_parallel_rnn.patch @@ -0,0 +1,28 @@ +Failing upstream too: https://github.com/pytorch/pytorch/issues/162745 +> /PyTorch/2.7.1/foss-2024a-CUDA-12.6.0/pytorch-v2.7.1/test/distributed/test_data_parallel.py", line 99, in test_data_parallel_rnn +> self.assertTrue(p1.allclose(p2)) +> AssertionError: False is not true + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/distributed/test_data_parallel.py b/test/distributed/test_data_parallel.py +index 26f64df90d9..c25cc6673c3 100644 +--- a/test/distributed/test_data_parallel.py ++++ b/test/distributed/test_data_parallel.py +@@ -6,6 +6,7 @@ import io + from collections import OrderedDict + from copy import deepcopy + from itertools import product ++import unittest + + import torch + import torch.nn.functional as F +@@ -63,7 +64,7 @@ class TestDataParallel(TestCase): + + gradcheck(fn, (m.t_rg,)) + +- @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "multi-GPU not supported") ++ @unittest.skip("Fails") + def test_data_parallel_rnn(self): + class TestModule(torch.nn.Module): + def __init__(self) -> None: diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_gds_fails_in_ci.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_gds_fails_in_ci.patch new file mode 100644 index 00000000000..bb10b104456 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_gds_fails_in_ci.patch @@ -0,0 +1,16 @@ +Skip a test meant for CI only. 
+ +Author: Alexander Grund (TU Dresden) + +diff --git a/test/test_cuda.py b/test/test_cuda.py +index 3726c377970..78b5e8c8af9 100644 +--- a/test/test_cuda.py ++++ b/test/test_cuda.py +@@ -3633,6 +3633,7 @@ print(f"{{r1}}, {{r2}}") + x = torch.cuda.device_count() + self.assertEqual(f"{x}, 1", r) + ++ @unittest.skip("Not applicable") + def test_gds_fails_in_ci(self): + if IS_WINDOWS or TEST_WITH_ROCM: + error_msg = "is not supported on this platform" diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_lowering_one_shot_all_reduce.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_lowering_one_shot_all_reduce.patch new file mode 100644 index 00000000000..214615bb589 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_lowering_one_shot_all_reduce.patch @@ -0,0 +1,26 @@ +Test fails also with PYPI version +> AssertionError: 'one_shot_all_reduce' not found in '# AOT ID[...] + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/distributed/test_symmetric_memory.py b/test/distributed/test_symmetric_memory.py +index 34b8ed5a7b1..bf0fbe0e7f2 100644 +--- a/test/distributed/test_symmetric_memory.py ++++ b/test/distributed/test_symmetric_memory.py +@@ -1,7 +1,7 @@ + # Owner(s): ["module: c10d"] + + import os +-from unittest import skipIf ++from unittest import skipIf, skip + + import torch + import torch.distributed as dist +@@ -981,6 +981,7 @@ class LoweringTest(MultiProcessTestCase): + + torch._inductor.config._collective.auto_select = True + ++ @skip("Fails with PyPI too") + @skipIfRocm # requires registered-buffer support + @skip_if_lt_x_gpu(2) + @fresh_inductor_cache() diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_mixed_mm_exhaustive_dtypes.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_mixed_mm_exhaustive_dtypes.patch new file mode 100644 index 00000000000..e745a728208 --- /dev/null +++ 
b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_mixed_mm_exhaustive_dtypes.patch @@ -0,0 +1,14 @@ +Test fails upstream too, see https://github.com/pytorch/pytorch/issues/147853 +> RuntimeError: Expected to find ".to(" but did not find it + +Author: Alexander Grund (TU Dresden) +--- a/test/inductor/test_pattern_matcher.py ++++ b/test/inductor/test_pattern_matcher.py +@@ -389,6 +389,7 @@ class TestPatternMatcher(TestCase): + } + ) + @unittest.skipIf(not IS_BIG_GPU, "templates require big gpu") ++ @unittest.skip("Fails") + def test_mixed_mm_exhaustive_dtypes(self): + def fn(a, b): + return torch.mm(a, b.to(a.dtype)) diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_outside_linear_module_free.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_outside_linear_module_free.patch new file mode 100644 index 00000000000..79bdea43a4d --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_outside_linear_module_free.patch @@ -0,0 +1,26 @@ +Test failing with PYPI package too: +> self.assertTrue(cleared) +> AssertionError: False is not true + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/dynamo/test_misc.py b/test/dynamo/test_misc.py +index 7541bd3b9d8..d0cb310bec6 100644 +--- a/test/dynamo/test_misc.py ++++ b/test/dynamo/test_misc.py +@@ -10992,6 +10992,7 @@ fn + lambda mod: mod, + ) + ++ @unittest.skip("Unreliable") + def test_outside_linear_module_free(self): + # Compared to test_linear_module_free, the linear + # layer is not the code object that is directly compiled. 
+@@ -11026,6 +11026,7 @@ fn + gc.collect() + self.assertTrue(cleared) + ++ @unittest.skip("Unreliable") + def test_parameter_free(self): + def model_inp_ctr(): + param = torch.nn.Parameter(torch.randn(100, 100)) diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_override-without-CUDA.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_override-without-CUDA.patch new file mode 100644 index 00000000000..4ec8ac594b7 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_override-without-CUDA.patch @@ -0,0 +1,33 @@ +This test fails during creation of the tests at startup: +> File "/var/lib/jenkins/workspace/test/test_overrides.py", line 683, in _simple_type_parser +> return torch.Stream() +> RuntimeError: CUDA error: CUDA driver version is insufficient for CUDA runtime version + +See https://github.com/pytorch/pytorch/pull/166625 + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/test_overrides.py b/test/test_overrides.py +--- a/test/test_overrides.py ++++ b/test/test_overrides.py +@@ -10,9 +10,8 @@ import pickle + import collections + import unittest + import contextlib +-import os + +-from torch.testing._internal.common_utils import TestCase, run_tests, TEST_WITH_CROSSREF, TEST_WITH_TORCHDYNAMO ++from torch.testing._internal.common_utils import TestCase, run_tests, TEST_CUDA, TEST_WITH_CROSSREF, TEST_WITH_TORCHDYNAMO + from torch.overrides import ( + handle_torch_function, + has_torch_function, +@@ -31,8 +30,7 @@ from torch.utils._pytree import tree_map + + Tensor = torch.Tensor + +-if os.getenv("ATEN_CPU_CAPABILITY") in ("default", "avx2"): +- # This test is not supported on ARM ++if not TEST_CUDA: + print( + "Skipping due to failing when cuda build runs on non cuda machine, " + + "see https://github.com/pytorch/pytorch/pull/150059 for example" diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-tests-requiring-SM90.patch 
b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-tests-requiring-SM90.patch new file mode 100644 index 00000000000..ee60c76ddbc --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-tests-requiring-SM90.patch @@ -0,0 +1,34 @@ +Avoid it failing on e.g. A100: + +> [rank1]:E1022 09:55:08.823000 3580472 torch/testing/_internal/common_distributed.py:721] RuntimeError: CUDA error: device-side assert triggered... +> [rank1]:E1022 09:55:08.823000 3580472 torch/testing/_internal/common_distributed.py:721] exiting process 1 with exit code: 10 +> ... +> :318: st_vec: block: [0,0,0], thread: [87,0,0] Assertion `false` failed. +> /pytorch-v2.7.1/torch/csrc/distributed/c10d/CUDASymmetricMemory-inl.h:318: st_vec: block: [0,0,0], thread: [88,0,0] Assertion `false` failed. + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py +index 7410255d27a..603ea0b375b 100644 +--- a/test/distributed/test_c10d_nccl.py ++++ b/test/distributed/test_c10d_nccl.py +@@ -3367,7 +3367,7 @@ class CommTest(test_c10d_common.AbstractCommTest, MultiProcessTestCase): + @skip_if_rocm_multiprocess + def test_intra_node_comm_all_reduce(self): + from torch._C._distributed_c10d import _get_intra_node_comm_usage_counter +- from torch.testing._internal.common_cuda import SM80OrLater ++ from torch.testing._internal.common_cuda import SM90OrLater + + for peer in range(self.world_size): + if peer == self.rank: +@@ -3375,8 +3375,8 @@ class CommTest(test_c10d_common.AbstractCommTest, MultiProcessTestCase): + if not torch._C._cuda_canDeviceAccessPeer(self.rank, peer): + raise SkipTest("Test requires p2p access") + +- if not SM80OrLater: +- raise SkipTest("Test requires sm>=80") ++ if not SM90OrLater: ++ raise SkipTest("Test requires sm>=90") + + store = c10d.FileStore(self.file_name, self.world_size) + os.environ["ENABLE_INTRA_NODE_COMM"] = "1" diff --git 
a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-tests-requiring-cc89.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-tests-requiring-cc89.patch new file mode 100644 index 00000000000..de560c007da --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-tests-requiring-cc89.patch @@ -0,0 +1,43 @@ +Avoid this error in a function called by those tests: +> RuntimeError: torch._scaled_mm is only supported on CUDA devices with compute capability >= 9.0 or 8.9, or ROCm MI300+ + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/distributed/test_symmetric_memory.py b/test/distributed/test_symmetric_memory.py +index 34b8ed5a7b1..04d3bb7a959 100644 +--- a/test/distributed/test_symmetric_memory.py ++++ b/test/distributed/test_symmetric_memory.py +@@ -1,7 +1,7 @@ + # Owner(s): ["module: c10d"] + + import os +-from unittest import skipIf, skip ++from unittest import skipIf, skip, skipUnless + + import torch + import torch.distributed as dist +@@ -19,7 +19,7 @@ from torch.distributed._symmetric_memory import ( + restride_A_for_fused_matmul_reduce_scatter, + restride_A_shard_for_fused_all_gather_matmul, + ) +-from torch.testing._internal.common_cuda import _get_torch_cuda_version, SM90OrLater ++from torch.testing._internal.common_cuda import _get_torch_cuda_version, SM89OrLater, SM90OrLater + from torch.testing._internal.common_distributed import ( + MultiProcessTestCase, + requires_multicast_support, +@@ -458,6 +458,7 @@ class SymmetricMemoryTest(MultiProcessTestCase): + + @skipIfRocm + @skip_if_lt_x_gpu(2) ++ @skipUnless(SM89OrLater, "compute capability >= 8.9") + @parametrize("gather_dim", [0, 1]) + @parametrize( + "scale_mode", ["tensor-wise", "row-wise-replicated", "row-wise-sharded"] +@@ -576,6 +577,7 @@ class SymmetricMemoryTest(MultiProcessTestCase): + + @skipIfRocm + @skip_if_lt_x_gpu(2) ++ @skipUnless(SM89OrLater, "compute capability >= 8.9") + @parametrize("scatter_dim", [0, 1]) + @parametrize("rowwise", [True, False]) + def 
test_fused_scaled_matmul_reduce_scatter( diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_suport-64bit-BARs.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_suport-64bit-BARs.patch new file mode 100644 index 00000000000..6e8cdfb2d36 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_suport-64bit-BARs.patch @@ -0,0 +1,27 @@ +When the GPUs use 64bit BARs the RPC module fails during the initialization with: +> E RuntimeError: In getBar1SizeOfGpu at tensorpipe/channel/cuda_gdr/context_impl.cc:242 "": No such file or directory + +This causes KeyboardInterrupt errors in distributed/rpc/test_share_memory + +See https://github.com/pytorch/pytorch/issues/159354 + +Author: Alexander Grund (TU Dresden) + +diff --git a/third_party/tensorpipe/tensorpipe/channel/cuda_gdr/context_impl.cc b/third_party/tensorpipe/tensorpipe/channel/cuda_gdr/context_impl.cc +index 182a04a..b26751e 100644 +--- a/third_party/tensorpipe/tensorpipe/channel/cuda_gdr/context_impl.cc ++++ b/third_party/tensorpipe/tensorpipe/channel/cuda_gdr/context_impl.cc +@@ -239,6 +239,13 @@ size_t getBar1SizeOfGpu(int gpuIdx) { + + struct stat bar1Stats; + int rv = ::stat(pciPath.c_str(), &bar1Stats); ++ if (rv < 0 && errno == ENOENT) { ++ // Some GPUs use 64 bit BARs using 2 slots each, ++ // so the BAR 0 spans slots 0 & 1 and BAR 1 is at slots 2 & 3 ++ TP_VLOG(5) << "GPU #" << gpuIdx << " might has 64 bit BARs"; ++ pciPath[pciPath.size() - 1] = '2'; ++ rv = ::stat(pciPath.c_str(), &bar1Stats); ++ } + TP_THROW_SYSTEM_IF(rv < 0, errno); + + return bar1Stats.st_size; diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_tolerance-test_partial_flat_weights.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_tolerance-test_partial_flat_weights.patch new file mode 100644 index 00000000000..c58d35aacaf --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_tolerance-test_partial_flat_weights.patch @@ -0,0 +1,23 @@ +Avoid failures in test_nn.py test_partial_flat_weights + +> 
Mismatched elements: 9 / 36 (25.0%) +> Greatest absolute difference: 3.013014793395996e-05 at index (2, 0, 4) (up to 1e-05 allowed) +> Greatest relative difference: 0.0030790010932832956 at index (2, 0, 4) (up to 1.3e-06 allowed) + +See https://github.com/pytorch/pytorch/issues/163072 + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/test_nn.py b/test/test_nn.py +index 30609247cb1..02a2d3a7f3a 100644 +--- a/test/test_nn.py ++++ b/test/test_nn.py +@@ -4299,7 +4299,7 @@ tensor(..., device='meta', size=(1,), requires_grad=True)""") + inp = inp.cuda() + # otherwise, subsequent warnings will be hidden, and further tests rely on them + warnings.simplefilter("always") +- self.assertEqual(m(inp)[0].cpu(), out_expected[0]) ++ self.assertEqual(m(inp)[0].cpu(), out_expected[0], atol=3.1e-5, rtol=3.1e-3) + + @unittest.skipIf(not TEST_CUDNN, "needs cudnn") + @set_default_dtype(torch.double) diff --git a/easybuild/easyconfigs/p/pyzstd/pyzstd-0.16.2-GCCcore-13.3.0.eb b/easybuild/easyconfigs/p/pyzstd/pyzstd-0.16.2-GCCcore-13.3.0.eb new file mode 100644 index 00000000000..bee4110b7b5 --- /dev/null +++ b/easybuild/easyconfigs/p/pyzstd/pyzstd-0.16.2-GCCcore-13.3.0.eb @@ -0,0 +1,29 @@ +easyblock = 'PythonPackage' + +name = 'pyzstd' +version = '0.16.2' + +homepage = 'https://github.com/Rogdham/pyzstd' +description = """Pyzstd module provides classes and functions for compressing and decompressing data, +using Facebook's Zstandard (or zstd as short name) algorithm. + +The API style is similar to Python's bz2/lzma/zlib modules.""" + +toolchain = {'name': 'GCCcore', 'version': '13.3.0'} + +sources = [SOURCE_TAR_GZ] +checksums = ['179c1a2ea1565abf09c5f2fd72f9ce7c54b2764cf7369e05c0bfd8f1f67f63d2'] + +builddependencies = [ + ('binutils', '2.42'), +] + +dependencies = [ + ('Python', '3.12.3'), + ('zstd', '1.5.6'), +] + +# Use preinstalled zstd library +buildopts = installopts = '--config-settings="--build-option=--dynamic-link-zstd"' + +moduleclass = 'tools'