34 changes: 34 additions & 0 deletions easybuild/easyconfigs/c/cuDNN/cuDNN-9.10.2.21-CUDA-12.6.0.eb
@@ -0,0 +1,34 @@
name = 'cuDNN'
version = '9.10.2.21'
versionsuffix = '-CUDA-%(cudaver)s'
homepage = 'https://developer.nvidia.com/cudnn'
description = """The NVIDIA CUDA Deep Neural Network library (cuDNN) is
a GPU-accelerated library of primitives for deep neural networks."""

toolchain = SYSTEM

source_urls = [
    'https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-%(cudnnarch)s/'
]
# note: cuDNN is tied to specific CUDA versions,
# see also https://docs.nvidia.com/deeplearning/cudnn/support-matrix/index.html#cudnn-cuda-hardware-versions
sources = ['%(namelower)s-linux-%(cudnnarch)s-%(version)s_cuda%(cudamajver)s-archive.tar.xz']
checksums = [{
    '%(namelower)s-linux-sbsa-%(version)s_cuda%(cudamajver)s-archive.tar.xz':
        '4d57dceba3be27a68b078ce8630525bf40ab7f1b546eb45d0b363c3eeb55f8fa',
    '%(namelower)s-linux-x86_64-%(version)s_cuda%(cudamajver)s-archive.tar.xz':
        'd0defcbc4c6dad711ff4cb66d254036a300c9071b07c7b64199aacab534313c1',
}]

dependencies = [('CUDA', '12.6.0')]

sanity_check_paths = {
    'files': [
        'include/cudnn.h', 'lib64/libcudnn_adv_static.a', 'lib64/libcudnn_cnn_static.a',
        'lib64/libcudnn_engines_precompiled_static.a', 'lib64/libcudnn_engines_runtime_compiled_static.a',
        'lib64/libcudnn_graph_static.a', 'lib64/libcudnn_heuristic_static.a', 'lib64/libcudnn_ops_static.a',
    ],
    'dirs': ['include', 'lib64'],
}

moduleclass = 'numlib'
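
The checksums entry is a dict keyed by the fully templated source filename, so the sbsa (Arm) and x86_64 tarballs are each verified against their own SHA-256 sum. As a rough, hypothetical sketch of what that verification amounts to (not EasyBuild's actual code; the expanded filename and hash follow from the templates and values in the easyconfig above):

    import hashlib
    import os

    # Assumed expansion of the x86_64 template from the easyconfig above
    expected = {
        'cudnn-linux-x86_64-9.10.2.21_cuda12-archive.tar.xz':
            'd0defcbc4c6dad711ff4cb66d254036a300c9071b07c7b64199aacab534313c1',
    }

    def verify_checksum(path):
        """Raise if the file's SHA-256 does not match the expected value."""
        with open(path, 'rb') as f:
            digest = hashlib.sha256(f.read()).hexdigest()
        if digest != expected[os.path.basename(path)]:
            raise ValueError(f'checksum mismatch for {path}')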
@@ -0,0 +1,26 @@
name = 'NCCL'
version = '2.27.5'
versionsuffix = '-CUDA-%(cudaver)s'

homepage = 'https://developer.nvidia.com/nccl'
description = """The NVIDIA Collective Communications Library (NCCL) implements multi-GPU and multi-node collective
communication primitives that are performance optimized for NVIDIA GPUs."""

toolchain = {'name': 'GCCcore', 'version': '13.3.0'}

github_account = 'NVIDIA'
source_urls = [GITHUB_SOURCE]
sources = ['v%(version)s-1.tar.gz']
checksums = ['e8a8972fc7f7517703510ef23608d41f6484db5331fca37827b4af3f66995344']

builddependencies = [('binutils', '2.42')]

dependencies = [
    ('CUDA', '12.6.0', '', SYSTEM),
    ('UCX-CUDA', '1.16.0', versionsuffix),
]

# default CUDA compute capabilities to use (override via --cuda-compute-capabilities)
cuda_compute_capabilities = ['5.0', '6.0', '7.0', '7.5', '8.0', '8.6', '9.0']

moduleclass = 'lib'
@@ -0,0 +1,57 @@
Disable a test that has an incomplete skip condition.
See https://github.com/pytorch/pytorch/pull/167971

Author: Alexander Grund (TU Dresden)

diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py
index 0a0f3ee4ca2..aff8ba0156f 100644
--- a/test/distributed/test_c10d_nccl.py
+++ b/test/distributed/test_c10d_nccl.py
@@ -11,6 +11,7 @@ import sys
import tempfile
import threading
import time
+import unittest
import warnings
from contextlib import contextmanager
from datetime import datetime, timedelta
@@ -295,12 +296,7 @@ class ProcessGroupNCCLGroupTest(MultiProcessTestCase):
# But if we are in Sandcastle, `skip_but_pass_in_sandcastle` would return 0.
TEST_NAN_ASSERT_RETURN = 0 if IS_SANDCASTLE else signal.SIGABRT
self.special_return_code_checks = {
- self.test_nan_assert_float16.__wrapped__: TEST_NAN_ASSERT_RETURN,
- self.test_nan_assert_float32.__wrapped__: TEST_NAN_ASSERT_RETURN,
- self.test_nan_assert_float64.__wrapped__: TEST_NAN_ASSERT_RETURN,
- self.test_nan_assert_bfloat16.__wrapped__: TEST_NAN_ASSERT_RETURN,
- self.test_nan_assert_float8_e4m3fn.__wrapped__: TEST_NAN_ASSERT_RETURN,
- self.test_nan_assert_float8_e5m2.__wrapped__: TEST_NAN_ASSERT_RETURN,
+
}

# TORCH_NCCL_BLOCKING_WAIT overrides TORCH_NCCL_ASYNC_ERROR_HANDLING hence tests
@@ -489,24 +485,7 @@ class ProcessGroupNCCLGroupTest(MultiProcessTestCase):
torch.version.cuda is not None and int(torch.version.cuda.split(".")[0]) >= 12
)

- @requires_nccl()
- @skip_but_pass_in_sandcastle_if(
- # skip for cu126 as well due to https://github.com/pytorch/pytorch/issues/153479
- not (TEST_MULTIGPU and CUDA_12_AND_ABOVE),
- "NCCL test requires 2+ GPUs and Device side assert could cause unexpected errors in lower versions of CUDA",
- )
- @parametrize(
- "type",
- [
- torch.float16,
- torch.float32,
- torch.float64,
- torch.bfloat16,
- torch.float8_e4m3fn,
- torch.float8_e5m2,
- ],
- )
- @skip_if_rocm_multiprocess
+ @unittest.skip("Wrong conditions")
def test_nan_assert(self, type):
# Expecting a device-side error when NaN is detected
os.environ["TORCH_NCCL_NAN_CHECK"] = "1"
@@ -0,0 +1,28 @@
CudaGraphTreeTests.test_workspace_allocation_error fails if TORCH_DISABLE_ADDR2LINE=1 is set
> File "/pytorch-v2.9.0/test/inductor/test_cudagraph_trees.py", line 1568, in test_workspace_allocation_error
> self.assertTrue(
> AssertionError: False is not true

See https://github.com/pytorch/pytorch/issues/103369

Author: Alexander Grund (TU Dresden)

diff --git a/test/inductor/test_cudagraph_trees.py b/test/inductor/test_cudagraph_trees.py
--- a/test/inductor/test_cudagraph_trees.py
+++ b/test/inductor/test_cudagraph_trees.py
@@ -5,6 +5,7 @@ import functools
import gc
import importlib
import itertools
+import os
import re
import sys
import unittest
@@ -1543,6 +1544,7 @@ if HAS_CUDA_AND_TRITON:
@skipIfRocm
@unittest.skipUnless(IS_X86 and IS_LINUX, "cpp contexts are linux only")
@torch._inductor.config.patch("triton.cudagraph_trees_history_recording", True)
+ @unittest.mock.patch.dict(os.environ, {"TORCH_DISABLE_ADDR2LINE": "0"})
def test_workspace_allocation_error(self):
torch._C._cuda_clearCublasWorkspaces()

@@ -0,0 +1,28 @@
Many tests using Float16 on CPU fail with reference_in_float=False
See https://github.com/pytorch/pytorch/issues/169809

E.g.:
> TestInductorOpInfoCPU.test_comprehensive_grid_sampler_2d_cpu_float16
> [...]
> Mismatched elements: 125 / 780 (16.0%)
> Greatest absolute difference: 0.02001953125 at index (0, 1, 3, 2) (up to 1e-05 allowed)
> Greatest relative difference: 2.34375 at index (1, 1, 2, 4) (up to 0.001 allowed)

Author: Alexander Grund (TU Dresden)

diff --git a/test/inductor/test_torchinductor_opinfo.py b/test/inductor/test_torchinductor_opinfo.py
index 807ccb48a79..7e5740e0177 100644
--- a/test/inductor/test_torchinductor_opinfo.py
+++ b/test/inductor/test_torchinductor_opinfo.py
@@ -1329,8 +1329,10 @@ class TestInductorOpInfo(TestCase):
# Triton
if has_triton():
adjusted_kwargs.update(
- copy_to_gpu=False, reference_in_float=False
+ copy_to_gpu=False,
)
+ if device_type == GPU_TYPE:
+ adjusted_kwargs['reference_in_float'] = False

# skip checking gradient on CPU for now
if device_type == GPU_TYPE:
@@ -0,0 +1,59 @@
From d55c9d52cda889850484968fc55ee73bf40540ec Mon Sep 17 00:00:00 2001
From: Chien-Chin Huang <chienchin@fb.com>
Date: Wed, 17 Sep 2025 18:14:51 -0700
Subject: [PATCH] [CP] Fix cuDNN CP LSE dimension bug (#163231)

We should only unsqueeze if necessary.

Fix https://github.com/pytorch/pytorch/issues/162743

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163231
Approved by: https://github.com/eqy
ghstack dependencies: #162539, #162540, #162541, #163115, #163131
---
.../tensor/experimental/_attention.py | 18 +++++++++++++++---
1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/torch/distributed/tensor/experimental/_attention.py b/torch/distributed/tensor/experimental/_attention.py
index 6336967582429..a3345f37a170d 100644
--- a/torch/distributed/tensor/experimental/_attention.py
+++ b/torch/distributed/tensor/experimental/_attention.py
@@ -134,6 +134,7 @@ def __init__(self, convert_to_f32: bool, seq_dim: int):
self._seq_dim = seq_dim
self._out: Optional[torch.Tensor] = None
self._lse: Optional[torch.Tensor] = None
+ self._should_lse_squeeze = False
self._convert_to_f32 = convert_to_f32
self._out_dtype = torch.float32
self._lse_dtype = torch.float32
@@ -141,7 +142,14 @@ def __init__(self, convert_to_f32: bool, seq_dim: int):
def _merge_one(
self, block_out: torch.Tensor, block_lse: torch.Tensor, partial: bool
) -> None:
- block_lse = block_lse.unsqueeze(dim=-1)
+ # The cuDNN backend preserves the last dimension for LSE.
+ # Apply unsqueeze only if the input does not already have
+ # the required dimensionality.
+ if len(block_lse.shape) < len(block_out.shape):
+ block_lse = block_lse.unsqueeze(dim=-1)
+ self._should_lse_squeeze = True
+ assert len(block_lse.shape) == len(block_out.shape)
+
if self._lse is None:
self._lse = block_lse
self._out = block_out
@@ -199,8 +207,12 @@ def step(self, out: torch.Tensor, lse: torch.Tensor, partial: bool) -> None:
def results(self) -> tuple[torch.Tensor, torch.Tensor]:
assert self._out is not None
assert self._lse is not None
- out, lse = self._out, self._lse.squeeze(-1)
- return out.to(self._out_dtype), lse.to(self._lse_dtype)
+ out = self._out.to(self._out_dtype)
+ if self._should_lse_squeeze:
+ lse = self._lse.squeeze(-1).to(self._lse_dtype)
+ else:
+ lse = self._lse.to(self._lse_dtype)
+ return out, lse


class _AttentionOp(Protocol):
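
To make the dimension handling concrete: a flash-style LSE without the trailing dimension needs the unsqueeze before it can be combined with the attention output, while a cuDNN-style LSE that already carries a trailing singleton dimension must be left untouched. A small illustration with assumed shapes (B, H, S, D values below are placeholders, not taken from the patch):

    import torch

    out = torch.randn(2, 4, 8, 16)        # attention output, e.g. [B, H, S, D]
    lse_flash = torch.randn(2, 4, 8)      # LSE without trailing dim -> needs unsqueeze
    lse_cudnn = torch.randn(2, 4, 8, 1)   # LSE whose last dim is preserved -> leave as-is

    for lse in (lse_flash, lse_cudnn):
        if lse.dim() < out.dim():
            lse = lse.unsqueeze(-1)       # only add the dimension when it is missing
        assert lse.dim() == out.dim()     # both paths end up broadcast-compatible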
@@ -0,0 +1,55 @@
From 6702f545d880fd82700811e4a3508cdd76da9a69 Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Tue, 16 Sep 2025 17:37:06 +0000
Subject: [PATCH] Restore environment after NcclUserBufferRegistrationTest
(#163063)

This test sets "NCCL_ALGO=NVLS" in NcclUserBufferRegistrationTest, which affects tests run later in the same process, such as `test_on_completion_hook_*`, which then fail with
> invalid usage (run with NCCL_DEBUG=WARN for details), NCCL version 2.26.2
> ncclInvalidUsage: This usually reflects invalid usage of NCCL library.
> Last error:
> Error : no algorithm/protocol available for function Broadcast with datatype ncclInt8. NCCL_ALGO was set to NVLS.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163063
Approved by: https://github.com/ezyang
---
test/distributed/test_c10d_nccl.py | 21 +++++++++++++--------
1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py
index 0d55845228da..f44394e3148c 100644
--- a/test/distributed/test_c10d_nccl.py
+++ b/test/distributed/test_c10d_nccl.py
@@ -3145,19 +3145,24 @@ def test_invalid_nccl_blocking_wait_env(self):
class NcclUserBufferRegistrationTest(MultiProcessTestCase):
def setUp(self):
super().setUp()
- # TORCH_NCCL_BLOCKING_WAIT overrides TORCH_NCCL_ASYNC_ERROR_HANDLING hence tests
- # that use TORCH_NCCL_BLOCKING_WAIT will test it as expected.
- os.environ["TORCH_NCCL_ASYNC_ERROR_HANDLING"] = "1"
nccl_debug_file = tempfile.NamedTemporaryFile()
- os.environ["NCCL_ALGO"] = "NVLS"
- os.environ["NCCL_DEBUG"] = "INFO"
- os.environ["NCCL_DEBUG_SUBSYS"] = "NVLS"
+ nccl_env = {
+ # TORCH_NCCL_BLOCKING_WAIT overrides TORCH_NCCL_ASYNC_ERROR_HANDLING hence tests
+ # that use TORCH_NCCL_BLOCKING_WAIT will test it as expected.
+ "TORCH_NCCL_ASYNC_ERROR_HANDLING": "1",
+ "NCCL_ALGO": "NVLS",
+ "NCCL_DEBUG": "INFO",
+ "NCCL_DEBUG_SUBSYS": "NVLS",
+ "NCCL_DEBUG_FILE": nccl_debug_file.name,
+ }
if torch.cuda.nccl.version() >= (2, 24, 3):
- os.environ["NCCL_DEBUG_SUBSYS"] = "REG,TUNING"
- os.environ["NCCL_DEBUG_FILE"] = nccl_debug_file.name
+ nccl_env["NCCL_DEBUG_SUBSYS"] = "REG,TUNING"
+ self.env_patcher = mock.patch.dict(os.environ, nccl_env)
+ self.env_patcher.start()
self._spawn_processes()

def tearDown(self):
+ self.env_patcher.stop()
super().tearDown()
try:
os.remove(self.file_name)
@@ -0,0 +1,33 @@
PadMMTest.test_exclude_padding fails on H100 with
> self.assertTrue(len(local_cache) == 2)
> AssertionError: False is not true

Increasing the size triggers the intended code path.
See https://github.com/pytorch/pytorch/pull/169177

Author: Alexander Grund (TU Dresden)

diff --git a/test/inductor/test_pad_mm.py b/test/inductor/test_pad_mm.py
--- a/test/inductor/test_pad_mm.py
+++ b/test/inductor/test_pad_mm.py
@@ -425,7 +426,10 @@ class PadMMTest(TestCase):
def mm(a, b):
return a @ b

- mm(torch.rand([25, 25], device="cuda"), torch.rand([25, 25], device="cuda"))
+ # Size must be big enough such that `is_mm_compute_bound` returns True and we need padding to 4 elements
+ # machine balance is ~8.3 (A100), 14.1 (H100), size must be 3x that, see arithmetic_intensity for M=N=K
+ size = [59, 59]
+ mm(torch.rand(size, device="cuda"), torch.rand(size, device="cuda"))
local_cache = get_pad_cache().get_local_cache()
self.assertTrue(len(local_cache) == 2)
FileCheck().check_count("exclude_pad:False", 2, exactly=True).run(
@@ -436,7 +440,7 @@ class PadMMTest(TestCase):
def mm(a, b):
return (a + 1) @ b

- mm(torch.rand([25, 25], device="cuda"), torch.rand([25, 25], device="cuda"))
+ mm(torch.rand(size, device="cuda"), torch.rand(size, device="cuda"))
local_cache = get_pad_cache().get_local_cache()
# reuse original base timing
self.assertTrue(len(local_cache) == 3)
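
A hedged reading of the size comment above, assuming the heuristic's arithmetic intensity for an M = N = K = n matmul reduces to n^3 / (3 * n^2) = n / 3 (this is an interpretation consistent with the quoted machine balances, not the verbatim `is_mm_compute_bound` code):

    # Illustrative only: why 25 was too small on H100 while 59 is large enough,
    # under the assumed n/3 model for a square matmul.
    def arithmetic_intensity(n):
        return (n * n * n) / (3 * n * n)   # = n / 3

    print(arithmetic_intensity(25))  # ~8.3  -> around the A100 balance, below H100's ~14.1
    print(arithmetic_intensity(59))  # ~19.7 -> above both, so the matmul counts as compute-bound

59 is also not a multiple of 4, so the padding behaviour the test checks is still exercised.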
@@ -0,0 +1,27 @@
TestSaveLoad.test_version_error fails because TMPDIR is set by EasyBuild:

> Ran into the following error when deserializing: [enforce fail at inline_container.cc:332] . file in archive is not in a subdirectory tmpi40i4vmn/: easybuild-tmp/archive_version

Fix the code to handle that; see https://github.com/pytorch/pytorch/pull/169936

diff --git a/test/export/test_serialize.py b/test/export/test_serialize.py
index faef9b455a0..e3a463014fb 100644
--- a/test/export/test_serialize.py
+++ b/test/export/test_serialize.py
@@ -7,6 +7,7 @@ with test_sym_bool)
import copy
import io
import math
+import os
import tempfile
import unittest
import zipfile
@@ -1915,7 +1916,7 @@ class TestSaveLoad(TestCase):
with tempfile.NamedTemporaryFile(suffix=".pt2") as f:
save(ep, f.name)
f.seek(0)
- file_prefix = f.name.split("/")[2].split(".")[0]
+ file_prefix = os.path.splitext(os.path.basename(f.name))[0]

# Create a new file and copy things over, but modify the
# archive version
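
The index-based parsing that the patch replaces only works when the temporary file lives directly under a two-component path such as /tmp/; with EasyBuild's deeper temporary directory the hard-coded index picks a directory name instead of the file prefix. A quick standalone comparison (the path below is made up for illustration):

    import os

    name = "/tmp/easybuild-tmp/tmpi40i4vmn/archive.pt2"    # hypothetical deep temp path
    print(name.split("/")[2].split(".")[0])                # 'easybuild-tmp' -> wrong prefix
    print(os.path.splitext(os.path.basename(name))[0])     # 'archive'       -> intended prefix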
@@ -0,0 +1,29 @@
Avoid PyTorch trying to use $HOME if XDG_CACHE_HOME is set.
See https://github.com/pytorch/pytorch/pull/168232

Author: Alexander Grund (TU Dresden)

diff --git a/torch/csrc/distributed/c10d/FlightRecorder.cpp b/torch/csrc/distributed/c10d/FlightRecorder.cpp
--- a/torch/csrc/distributed/c10d/FlightRecorder.cpp
+++ b/torch/csrc/distributed/c10d/FlightRecorder.cpp
@@ -36,8 +36,18 @@ DebugInfoWriter& DebugInfoWriter::getWriter(int rank) {
if (writer_ == nullptr) {
// Attempt to write to running user's HOME directory cache folder - if it
// exists.
- auto homeDir = getCvarString({"HOME"}, "/tmp");
- auto cacheDirPath = std::filesystem::path(homeDir + "/.cache/torch");
+ #ifdef _WIN32
+ const char* cacheHome = nullptr;
+ #else
+ // Uses XDG_CACHE_HOME if it's set
+ const char* cacheHome = std::getenv("XDG_CACHE_HOME");
+ #endif
+ std::string cacheRoot;
+ if (cacheHome)
+ cacheRoot = cacheHome;
+ else
+ cacheRoot = getCvarString({"HOME"}, "/tmp") + "/.cache";
+ auto cacheDirPath = std::filesystem::path(cacheRoot + "/torch");
// Create the .cache directory if it doesn't exist
std::filesystem::create_directories(cacheDirPath);
auto defaultLocation = cacheDirPath / "comm_lib_trace_rank_";