Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions easybuild/easyconfigs/c/cuDNN/cuDNN-9.5.1.17-CUDA-12.6.0.eb
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# cuDNN 9.5.1.17 for CUDA 12.6.0; no 'easyblock' line, so EasyBuild selects the
# cuDNN-specific easyblock by name (which also provides the %(cudnnarch)s template).
name = 'cuDNN'
version = '9.5.1.17'
# CUDA version is part of the module name so builds against different CUDA versions can coexist
versionsuffix = '-CUDA-%(cudaver)s'
homepage = 'https://developer.nvidia.com/cudnn'
description = """The NVIDIA CUDA Deep Neural Network library (cuDNN) is
a GPU-accelerated library of primitives for deep neural networks."""

# prebuilt vendor binaries are repackaged as-is, so no compiler toolchain is needed
toolchain = SYSTEM

source_urls = [
'https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-%(cudnnarch)s/'
]
# note: cuDNN is tied to specific CUDA versions,
# see also https://docs.nvidia.com/deeplearning/cudnn/support-matrix/index.html#cudnn-cuda-hardware-versions
sources = ['%(namelower)s-linux-%(cudnnarch)s-%(version)s_cuda%(cudamajver)s-archive.tar.xz']
# one checksum entry per supported architecture; the dict is keyed by the resolved
# source filename ('sbsa' for 64-bit ARM, 'x86_64' otherwise), so the right one is picked
checksums = [{
'%(namelower)s-linux-sbsa-%(version)s_cuda%(cudamajver)s-archive.tar.xz':
'340c49b32c133b0321c5c5b00d14fb64887dcac83ee8fd24195d9191061f1ad7',
'%(namelower)s-linux-x86_64-%(version)s_cuda%(cudamajver)s-archive.tar.xz':
'35dd20b9c68324ae1288ac36f66ab1f318d2bfecfafb703a82617aa283272be4',
}]

# must match the CUDA major version baked into the source filename above
dependencies = [('CUDA', '12.6.0')]

# check for the main header plus the static libraries shipped in this release
sanity_check_paths = {
'files': [
'include/cudnn.h', 'lib64/libcudnn_adv_static.a', 'lib64/libcudnn_cnn_static.a',
'lib64/libcudnn_engines_precompiled_static.a', 'lib64/libcudnn_engines_runtime_compiled_static.a',
'lib64/libcudnn_graph_static.a', 'lib64/libcudnn_heuristic_static.a', 'lib64/libcudnn_ops_static.a',
],
'dirs': ['include', 'lib64'],
}

moduleclass = 'numlib'
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# cuSPARSELt ships as a plain archive of prebuilt binaries, so the generic
# Tarball easyblock (unpack + copy) is sufficient.
easyblock = 'Tarball'

name = 'cuSPARSELt'
version = '0.6.3.2'
# CUDA version is part of the module name so builds against different CUDA versions can coexist
versionsuffix = '-CUDA-%(cudaver)s'

homepage = 'https://docs.nvidia.com/cuda/cusparselt/index.html'
description = """NVIDIA cuSPARSELt is a high-performance CUDA library dedicated to general matrix-matrix operations in
which at least one operand is a sparse matrix"""

# prebuilt vendor binaries, so no compiler toolchain is needed
toolchain = SYSTEM

# map EasyBuild's ARCH value onto NVIDIA's download-directory naming:
# NVIDIA uses 'sbsa' for 64-bit ARM; every other architecture maps to itself
local_arch = {'arm64': 'sbsa', 'aarch64': 'sbsa'}.get(ARCH, ARCH)
source_urls = ['https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-%s/' % local_arch]
sources = ['libcusparse_lt-linux-%s-%%(version)s-archive.tar.xz' % local_arch]
# one checksum entry per supported architecture, keyed by the resolved source filename
checksums = [{
'libcusparse_lt-linux-x86_64-%(version)s-archive.tar.xz':
'a2f856e78943f5c538bdef1c9edc64a5ed30bf8bb7d5fcb615c684ffe776cc31',
'libcusparse_lt-linux-sbsa-%(version)s-archive.tar.xz':
'3e420ddbff4eb9ac603f57c7aa8b3d5271112816e244eb55ef9f30c4eb6a04b7',
}]

dependencies = [('CUDA', '12.6.0')]

# check for the public header plus both the shared and static libraries
sanity_check_paths = {
'files': ['include/cusparseLt.h',
'lib/libcusparseLt.%s' % SHLIB_EXT,
'lib/libcusparseLt_static.a'],
'dirs': [],
}

moduleclass = 'lib'
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# NCCL 2.26.2 built from source with GCCcore 13.3.0; no 'easyblock' line,
# so EasyBuild selects the NCCL-specific easyblock by name.
name = 'NCCL'
version = '2.26.2'
# CUDA version is part of the module name so builds against different CUDA versions can coexist
versionsuffix = '-CUDA-%(cudaver)s'

homepage = 'https://developer.nvidia.com/nccl'
description = """The NVIDIA Collective Communications Library (NCCL) implements multi-GPU and multi-node collective
communication primitives that are performance optimized for NVIDIA GPUs."""

toolchain = {'name': 'GCCcore', 'version': '13.3.0'}

# GITHUB_SOURCE resolves to the release download URL for the NVIDIA/nccl repository
github_account = 'NVIDIA'
source_urls = [GITHUB_SOURCE]
# upstream tags releases as v<version>-<patchlevel>
sources = ['v%(version)s-1.tar.gz']
checksums = ['74c6ab40c864d79c2139508e9419de5970cb406ec85f001d5f834d5f5c0c4f3b']

builddependencies = [('binutils', '2.42')]

dependencies = [
('CUDA', '12.6.0', '', SYSTEM),
('UCX-CUDA', '1.16.0', versionsuffix),
]

# default CUDA compute capabilities to use (override via --cuda-compute-capabilities)
cuda_compute_capabilities = ['5.0', '6.0', '7.0', '7.5', '8.0', '8.6', '9.0']

moduleclass = 'lib'
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
Avoid tripping on //caffe2/test/cpp/jit:test_custom_class_registrations with IS_SANDCASTLE

Author: Alexander Grund (TU Dresden)
--- a/torch/testing/_internal/torchbind_impls.py
+++ b/torch/testing/_internal/torchbind_impls.py
@@ -116,8 +116,6 @@ def load_torchbind_test_lib():

if IS_MACOS:
raise unittest.SkipTest("non-portable load_library call used in test")
- elif IS_SANDCASTLE or IS_FBCODE:
- lib_file_path = Path("//caffe2/test/cpp/jit:test_custom_class_registrations")
elif IS_WINDOWS:
lib_file_path = find_library_location("torchbind_test.dll")
else:
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
Don't checkout NCCL when using system NCCL

Author: Alexander Grund (TU Dresden)

diff --git a/tools/build_pytorch_libs.py b/tools/build_pytorch_libs.py
index 5dd5a221975..2b8b868eaa8 100644
--- a/tools/build_pytorch_libs.py
+++ b/tools/build_pytorch_libs.py
@@ -7,7 +7,12 @@ from glob import glob
from pathlib import Path

from .setup_helpers.cmake import CMake, USE_NINJA
-from .setup_helpers.env import check_negative_env_flag, IS_64BIT, IS_WINDOWS
+from .setup_helpers.env import (
+ check_env_flag,
+ check_negative_env_flag,
+ IS_64BIT,
+ IS_WINDOWS,
+)


repo_root = Path(__file__).absolute().parent.parent
@@ -119,7 +124,12 @@ def build_pytorch(
cmake: CMake,
) -> None:
my_env = _create_build_env()
- checkout_nccl()
+ if (
+ not check_negative_env_flag("USE_CUDA")
+ and not check_negative_env_flag("USE_NCCL")
+ and not check_env_flag("USE_SYSTEM_NCCL")
+ ):
+ checkout_nccl()
build_test = not check_negative_env_flag("BUILD_TEST")
cmake.generate(
version, cmake_python_library, build_python, build_test, my_env, rerun_cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
If there are no GPUs, WORLD_SIZE would be set to 0, which doesn't work.
Use a positive number for the NCCL/GLOO tests in that case.

See https://github.com/pytorch/pytorch/pull/150764

Author: Alexander Grund (TU Dresden)
diff --git a/test/run_test.py b/test/run_test.py
index a508d8db4d2..e7bbe6ea086 100755
--- a/test/run_test.py
+++ b/test/run_test.py
@@ -610,18 +610,19 @@ DISTRIBUTED_TESTS_CONFIG = {}


if dist.is_available():
+ num_gpus = torch.cuda.device_count()
DISTRIBUTED_TESTS_CONFIG["test"] = {"WORLD_SIZE": "1"}
if not TEST_WITH_ROCM and dist.is_mpi_available():
DISTRIBUTED_TESTS_CONFIG["mpi"] = {
"WORLD_SIZE": "3",
}
- if dist.is_nccl_available():
+ if dist.is_nccl_available() and num_gpus > 0:
DISTRIBUTED_TESTS_CONFIG["nccl"] = {
- "WORLD_SIZE": f"{torch.cuda.device_count()}",
+ "WORLD_SIZE": f"{num_gpus}",
}
- if dist.is_gloo_available():
+ if dist.is_gloo_available() and num_gpus > 0:
DISTRIBUTED_TESTS_CONFIG["gloo"] = {
# TODO: retire testing gloo with CUDA
- "WORLD_SIZE": f"{torch.cuda.device_count()}",
+ "WORLD_SIZE": f"{num_gpus}",
}
# Test with UCC backend is deprecated.
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
The decorators are implemented to run when the test function is called, which is after
the test `setup` method has spawned subprocesses; those subprocesses may use NCCL to sync
and fail when there are not enough GPUs available.
So replace the custom code with calls to the `unittest` skip decorators.
See https://github.com/pytorch/pytorch/pull/109491

Author: Alexander Grund (TU Dresden)
diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py
index d34b1ffdb0a..8f9628f209b 100644
--- a/torch/testing/_internal/common_distributed.py
+++ b/torch/testing/_internal/common_distributed.py
@@ -155,17 +155,7 @@ def skip_if_odd_worldsize(func):


def require_n_gpus_for_nccl_backend(n, backend):
- def decorator(func):
- @wraps(func)
- def wrapper(*args, **kwargs):
- if backend == "nccl" and torch.cuda.device_count() < n:
- sys.exit(TEST_SKIPS[f"multi-gpu-{n}"].exit_code)
- else:
- return func(*args, **kwargs)
-
- return wrapper
-
- return decorator
+ return unittest.skipUnless(at_least_x_gpu(n), TEST_SKIPS[f"multi-gpu-{n}"].message) if backend == "nccl" else unittest.skipIf(False, None)


def import_transformers_or_skip():
@@ -197,20 +187,10 @@ def at_least_x_gpu(x):


def skip_if_lt_x_gpu(x):
- def decorator(func):
- @wraps(func)
- def wrapper(*args, **kwargs):
- if torch.cuda.is_available() and torch.cuda.device_count() >= x:
- return func(*args, **kwargs)
- if TEST_HPU and torch.hpu.device_count() >= x:
- return func(*args, **kwargs)
- if TEST_XPU and torch.xpu.device_count() >= x:
- return func(*args, **kwargs)
- sys.exit(TEST_SKIPS[f"multi-gpu-{x}"].exit_code)
-
- return wrapper
-
- return decorator
+ return unittest.skipUnless(torch.cuda.device_count() >= x or (
+ TEST_HPU and torch.hpu.device_count() >= x) or (
+ TEST_XPU and torch.xpu.device_count() >= x),
+ TEST_SKIPS[f"multi-gpu-{x}"].message)


# This decorator helps avoiding initializing cuda while testing other backends
diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py
index a4d6d53b975..0da1d9baddf 100644
--- a/torch/testing/_internal/distributed/distributed_test.py
+++ b/torch/testing/_internal/distributed/distributed_test.py
@@ -66,7 +66,6 @@ from torch.testing._internal.common_distributed import (
skip_if_small_worldsize,
skip_if_odd_worldsize,
skip_if_lt_x_gpu,
- nccl_skip_if_lt_x_gpu,
skip_if_no_gpu,
require_n_gpus_for_nccl_backend,
requires_nccl_version,
@@ -5299,7 +5298,7 @@ class DistributedTest:
BACKEND != "mpi" and BACKEND != "nccl" and BACKEND != "gloo",
"get_future is only supported on mpi, nccl and gloo",
)
- @nccl_skip_if_lt_x_gpu(BACKEND, 2)
+ @require_n_gpus_for_nccl_backend(2, BACKEND)
def test_accumulate_gradients_no_sync(self):
"""
Runs _test_accumulate_gradients_no_sync using default inputs
@@ -5310,7 +5309,7 @@ class DistributedTest:
BACKEND != "mpi" and BACKEND != "nccl" and BACKEND != "gloo",
"get_future is only supported on mpi, nccl and gloo",
)
- @nccl_skip_if_lt_x_gpu(BACKEND, 2)
+ @require_n_gpus_for_nccl_backend(2, BACKEND)
def test_accumulate_gradients_no_sync_grad_is_view(self):
"""
Runs _test_accumulate_gradients_no_sync using default inputs
@@ -5321,7 +5320,7 @@ class DistributedTest:
BACKEND != "mpi" and BACKEND != "nccl" and BACKEND != "gloo",
"get_future is only supported on mpi, nccl and gloo",
)
- @nccl_skip_if_lt_x_gpu(BACKEND, 2)
+ @require_n_gpus_for_nccl_backend(2, BACKEND)
def test_accumulate_gradients_no_sync_allreduce_hook(self):
"""
Runs multiple iterations on _test_accumulate_gradients_no_sync
@@ -5349,7 +5348,7 @@ class DistributedTest:
BACKEND != "mpi" and BACKEND != "nccl" and BACKEND != "gloo",
"get_future is only supported on mpi, nccl and gloo",
)
- @nccl_skip_if_lt_x_gpu(BACKEND, 2)
+ @require_n_gpus_for_nccl_backend(2, BACKEND)
def test_accumulate_gradients_no_sync_allreduce_with_then_hook(self):
"""
Runs multiple iterations on _test_accumulate_gradients_no_sync using allreduce
@@ -5383,7 +5382,7 @@ class DistributedTest:
BACKEND != "mpi" and BACKEND != "nccl" and BACKEND != "gloo",
"get_future is only supported on mpi, nccl and gloo",
)
- @nccl_skip_if_lt_x_gpu(BACKEND, 2)
+ @require_n_gpus_for_nccl_backend(2, BACKEND)
def test_get_future(self):
def mult(fut):
return [t * 3 for t in fut.wait()]
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
TestSelectAlgorithmCPU.test_linear_with_embedding fails when the CPU does not support BF16:
> torch._inductor.exc.InductorError: LoweringException: RuntimeError: self and mat2 must have the same dtype, but got Float and BFloat16
See https://github.com/pytorch/pytorch/issues/147104

Convert the embedding layer to avoid it using "Float" and adapt the check for this change.

Author: Alexander Grund (TU Dresden)
--- a/test/inductor/test_cpu_select_algorithm.py
+++ b/test/inductor/test_cpu_select_algorithm.py
@@ -932,6 +932,7 @@ class TestSelectAlgorithm(BaseTestSelectAlgorithm):
def test_linear_with_embedding(
self, batch_size, in_features, out_features, bias, dtype
):
+ has_bf16 = torch.ops.mkldnn._is_mkldnn_bf16_supported()
class M(torch.nn.Module):
def __init__(self, bias):
super().__init__()
@@ -939,6 +940,9 @@ class TestSelectAlgorithm(BaseTestSelectAlgorithm):
dtype=dtype
)
self.emb = torch.nn.Embedding(64, out_features)
+ if not has_bf16:
+ self.emb = self.emb.to(dtype=dtype)
+

def forward(self, idx, x):
return self.emb(idx) + self.linear(x)
@@ -953,7 +957,7 @@ class TestSelectAlgorithm(BaseTestSelectAlgorithm):
with verify(dtype) as (atol, rtol):
self.common(mod, (idx, x), atol=atol, rtol=rtol)
self.assertEqual(counters["inductor"]["cpp_templated_kernel_counter"], 1)
- self.assertEqual(counters["inductor"]["cpp_epilogue_fusion_counter"], 1)
+ self.assertEqual(counters["inductor"]["cpp_epilogue_fusion_counter"], 1 if has_bf16 else 0)

@inductor_config.patch({"freezing": True})
@patches
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
This test fails when FlexiBLAS is used instead of MKL.
Adjust the expected count.
See https://github.com/pytorch/pytorch/pull/151548

Author: Alexander Grund (TU Dresden)
--- a/test/inductor/test_cpu_select_algorithm.py
+++ b/test/inductor/test_cpu_select_algorithm.py
@@ -1300,7 +1304,7 @@ class TestSelectAlgorithm(BaseTestSelectAlgorithm):
rtol=rtol,
)
self.assertEqual(counters["inductor"]["cpp_templated_kernel_counter"], 2)
- self.assertEqual(counters["inductor"]["cpp_epilogue_fusion_counter"], 2)
+ self.assertEqual(counters["inductor"]["cpp_epilogue_fusion_counter"], 2 if TEST_MKL else 1)

@inductor_config.patch({"freezing": True})
@patches

Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
The test often times out and seems to be considered flaky by PyTorch:
https://github.com/pytorch/pytorch/issues/78068

Author: Alexander Grund (TU Dresden)
diff --git a/test/distributed/_shard/sharded_tensor/test_sharded_tensor.py b/test/distributed/_shard/sharded_tensor/test_sharded_tensor.py
index 730b2c2c0ac..5f9b9545700 100644
--- a/test/distributed/_shard/sharded_tensor/test_sharded_tensor.py
+++ b/test/distributed/_shard/sharded_tensor/test_sharded_tensor.py
@@ -6,6 +6,7 @@ import itertools
import math
import pickle
import sys
+from unittest import skip

import torch
import torch.distributed as dist
@@ -2432,6 +2432,7 @@ class TestShardedTensorFromLocalShards(ShardedTensorTestBase):
@with_comms
@skip_if_lt_x_gpu(4)
@requires_nccl()
+ @skip("Times out often")
def test_init_from_local_shards(self):
local_shard_metadata = ShardMetadata(
shard_offsets=[(self.rank // 2) * 5, (self.rank % 2) * 5],
Loading