34 changes: 34 additions & 0 deletions easybuild/easyconfigs/c/cuDNN/cuDNN-9.10.2.21-CUDA-12.6.0.eb
@@ -0,0 +1,34 @@
name = 'cuDNN'
version = '9.10.2.21'
versionsuffix = '-CUDA-%(cudaver)s'
homepage = 'https://developer.nvidia.com/cudnn'
description = """The NVIDIA CUDA Deep Neural Network library (cuDNN) is
a GPU-accelerated library of primitives for deep neural networks."""

toolchain = SYSTEM

source_urls = [
    'https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-%(cudnnarch)s/'
]
# note: cuDNN is tied to specific CUDA versions,
# see also https://docs.nvidia.com/deeplearning/cudnn/support-matrix/index.html#cudnn-cuda-hardware-versions
sources = ['%(namelower)s-linux-%(cudnnarch)s-%(version)s_cuda%(cudamajver)s-archive.tar.xz']
checksums = [{
    '%(namelower)s-linux-sbsa-%(version)s_cuda%(cudamajver)s-archive.tar.xz':
        '4d57dceba3be27a68b078ce8630525bf40ab7f1b546eb45d0b363c3eeb55f8fa',
    '%(namelower)s-linux-x86_64-%(version)s_cuda%(cudamajver)s-archive.tar.xz':
        'd0defcbc4c6dad711ff4cb66d254036a300c9071b07c7b64199aacab534313c1',
}]

dependencies = [('CUDA', '12.6.0')]

sanity_check_paths = {
    'files': [
        'include/cudnn.h', 'lib64/libcudnn_adv_static.a', 'lib64/libcudnn_cnn_static.a',
        'lib64/libcudnn_engines_precompiled_static.a', 'lib64/libcudnn_engines_runtime_compiled_static.a',
        'lib64/libcudnn_graph_static.a', 'lib64/libcudnn_heuristic_static.a', 'lib64/libcudnn_ops_static.a',
    ],
    'dirs': ['include', 'lib64'],
}

moduleclass = 'numlib'
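
The checksums entry is a dict keyed by the fully templated source filename, so the sbsa (Arm) and x86_64 tarballs are each verified against their own SHA-256 sum. As a rough, hypothetical sketch of what that verification amounts to (not EasyBuild's actual code; the expanded filename and hash follow from the templates and values in the easyconfig above):

    import hashlib
    import os

    # Assumed expansion of the x86_64 template from the easyconfig above
    expected = {
        'cudnn-linux-x86_64-9.10.2.21_cuda12-archive.tar.xz':
            'd0defcbc4c6dad711ff4cb66d254036a300c9071b07c7b64199aacab534313c1',
    }

    def verify_checksum(path):
        """Raise if the file's SHA-256 does not match the expected value."""
        with open(path, 'rb') as f:
            digest = hashlib.sha256(f.read()).hexdigest()
        if digest != expected[os.path.basename(path)]:
            raise ValueError(f'checksum mismatch for {path}')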
@@ -0,0 +1,26 @@
name = 'NCCL'
version = '2.27.5'
versionsuffix = '-CUDA-%(cudaver)s'

homepage = 'https://developer.nvidia.com/nccl'
description = """The NVIDIA Collective Communications Library (NCCL) implements multi-GPU and multi-node collective
communication primitives that are performance optimized for NVIDIA GPUs."""

toolchain = {'name': 'GCCcore', 'version': '13.3.0'}

github_account = 'NVIDIA'
source_urls = [GITHUB_SOURCE]
sources = ['v%(version)s-1.tar.gz']
checksums = ['e8a8972fc7f7517703510ef23608d41f6484db5331fca37827b4af3f66995344']

builddependencies = [('binutils', '2.42')]

dependencies = [
    ('CUDA', '12.6.0', '', SYSTEM),
    ('UCX-CUDA', '1.16.0', versionsuffix),
]

# default CUDA compute capabilities to use (override via --cuda-compute-capabilities)
cuda_compute_capabilities = ['5.0', '6.0', '7.0', '7.5', '8.0', '8.6', '9.0']

moduleclass = 'lib'
@@ -0,0 +1,57 @@
Disable a test that has an incomplete skip condition.
See https://github.com/pytorch/pytorch/pull/167971

Author: Alexander Grund (TU Dresden)

diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py
index 0a0f3ee4ca2..aff8ba0156f 100644
--- a/test/distributed/test_c10d_nccl.py
+++ b/test/distributed/test_c10d_nccl.py
@@ -11,6 +11,7 @@ import sys
import tempfile
import threading
import time
+import unittest
import warnings
from contextlib import contextmanager
from datetime import datetime, timedelta
@@ -295,12 +296,7 @@ class ProcessGroupNCCLGroupTest(MultiProcessTestCase):
# But if we are in Sandcastle, `skip_but_pass_in_sandcastle` would return 0.
TEST_NAN_ASSERT_RETURN = 0 if IS_SANDCASTLE else signal.SIGABRT
self.special_return_code_checks = {
- self.test_nan_assert_float16.__wrapped__: TEST_NAN_ASSERT_RETURN,
- self.test_nan_assert_float32.__wrapped__: TEST_NAN_ASSERT_RETURN,
- self.test_nan_assert_float64.__wrapped__: TEST_NAN_ASSERT_RETURN,
- self.test_nan_assert_bfloat16.__wrapped__: TEST_NAN_ASSERT_RETURN,
- self.test_nan_assert_float8_e4m3fn.__wrapped__: TEST_NAN_ASSERT_RETURN,
- self.test_nan_assert_float8_e5m2.__wrapped__: TEST_NAN_ASSERT_RETURN,
+
}

# TORCH_NCCL_BLOCKING_WAIT overrides TORCH_NCCL_ASYNC_ERROR_HANDLING hence tests
@@ -489,24 +485,7 @@ class ProcessGroupNCCLGroupTest(MultiProcessTestCase):
torch.version.cuda is not None and int(torch.version.cuda.split(".")[0]) >= 12
)

- @requires_nccl()
- @skip_but_pass_in_sandcastle_if(
- # skip for cu126 as well due to https://github.com/pytorch/pytorch/issues/153479
- not (TEST_MULTIGPU and CUDA_12_AND_ABOVE),
- "NCCL test requires 2+ GPUs and Device side assert could cause unexpected errors in lower versions of CUDA",
- )
- @parametrize(
- "type",
- [
- torch.float16,
- torch.float32,
- torch.float64,
- torch.bfloat16,
- torch.float8_e4m3fn,
- torch.float8_e5m2,
- ],
- )
- @skip_if_rocm_multiprocess
+ @unittest.skip("Wrong conditions")
def test_nan_assert(self, type):
# Expecting a device-side error when NaN is detected
os.environ["TORCH_NCCL_NAN_CHECK"] = "1"
@@ -0,0 +1,28 @@
CudaGraphTreeTests.test_workspace_allocation_error fails if TORCH_DISABLE_ADDR2LINE=1 is set
> File "/pytorch-v2.9.0/test/inductor/test_cudagraph_trees.py", line 1568, in test_workspace_allocation_error
> self.assertTrue(
> AssertionError: False is not true

See https://github.com/pytorch/pytorch/issues/103369

Author: Alexander Grund (TU Dresden)

diff --git a/test/inductor/test_cudagraph_trees.py b/test/inductor/test_cudagraph_trees.py
--- a/test/inductor/test_cudagraph_trees.py
+++ b/test/inductor/test_cudagraph_trees.py
@@ -5,6 +5,7 @@ import functools
import gc
import importlib
import itertools
+import os
import re
import sys
import unittest
@@ -1543,6 +1544,7 @@ if HAS_CUDA_AND_TRITON:
@skipIfRocm
@unittest.skipUnless(IS_X86 and IS_LINUX, "cpp contexts are linux only")
@torch._inductor.config.patch("triton.cudagraph_trees_history_recording", True)
+ @unittest.mock.patch.dict(os.environ, {"TORCH_DISABLE_ADDR2LINE": "0"})
def test_workspace_allocation_error(self):
torch._C._cuda_clearCublasWorkspaces()

@@ -0,0 +1,28 @@
Many tests using Float16 on CPU fail with reference_in_float=False
See https://github.com/pytorch/pytorch/issues/169809

E.g.:
> TestInductorOpInfoCPU.test_comprehensive_grid_sampler_2d_cpu_float16
> [...]
> Mismatched elements: 125 / 780 (16.0%)
> Greatest absolute difference: 0.02001953125 at index (0, 1, 3, 2) (up to 1e-05 allowed)
> Greatest relative difference: 2.34375 at index (1, 1, 2, 4) (up to 0.001 allowed)

Author: Alexander Grund (TU Dresden)

diff --git a/test/inductor/test_torchinductor_opinfo.py b/test/inductor/test_torchinductor_opinfo.py
index 807ccb48a79..7e5740e0177 100644
--- a/test/inductor/test_torchinductor_opinfo.py
+++ b/test/inductor/test_torchinductor_opinfo.py
@@ -1329,8 +1329,10 @@ class TestInductorOpInfo(TestCase):
# Triton
if has_triton():
adjusted_kwargs.update(
- copy_to_gpu=False, reference_in_float=False
+ copy_to_gpu=False,
)
+ if device_type == GPU_TYPE:
+ adjusted_kwargs['reference_in_float'] = False

# skip checking gradient on CPU for now
if device_type == GPU_TYPE:
@@ -0,0 +1,59 @@
From d55c9d52cda889850484968fc55ee73bf40540ec Mon Sep 17 00:00:00 2001
From: Chien-Chin Huang <chienchin@fb.com>
Date: Wed, 17 Sep 2025 18:14:51 -0700
Subject: [PATCH] [CP] Fix cuDNN CP LSE dimension bug (#163231)

We should only unsqueeze if necessary.

Fix https://github.com/pytorch/pytorch/issues/162743

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163231
Approved by: https://github.com/eqy
ghstack dependencies: #162539, #162540, #162541, #163115, #163131
---
.../tensor/experimental/_attention.py | 18 +++++++++++++++---
1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/torch/distributed/tensor/experimental/_attention.py b/torch/distributed/tensor/experimental/_attention.py
index 6336967582429..a3345f37a170d 100644
--- a/torch/distributed/tensor/experimental/_attention.py
+++ b/torch/distributed/tensor/experimental/_attention.py
@@ -134,6 +134,7 @@ def __init__(self, convert_to_f32: bool, seq_dim: int):
self._seq_dim = seq_dim
self._out: Optional[torch.Tensor] = None
self._lse: Optional[torch.Tensor] = None
+ self._should_lse_squeeze = False
self._convert_to_f32 = convert_to_f32
self._out_dtype = torch.float32
self._lse_dtype = torch.float32
@@ -141,7 +142,14 @@ def __init__(self, convert_to_f32: bool, seq_dim: int):
def _merge_one(
self, block_out: torch.Tensor, block_lse: torch.Tensor, partial: bool
) -> None:
- block_lse = block_lse.unsqueeze(dim=-1)
+ # The cuDNN backend preserves the last dimension for LSE.
+ # Apply unsqueeze only if the input does not already have
+ # the required dimensionality.
+ if len(block_lse.shape) < len(block_out.shape):
+ block_lse = block_lse.unsqueeze(dim=-1)
+ self._should_lse_squeeze = True
+ assert len(block_lse.shape) == len(block_out.shape)
+
if self._lse is None:
self._lse = block_lse
self._out = block_out
@@ -199,8 +207,12 @@ def step(self, out: torch.Tensor, lse: torch.Tensor, partial: bool) -> None:
def results(self) -> tuple[torch.Tensor, torch.Tensor]:
assert self._out is not None
assert self._lse is not None
- out, lse = self._out, self._lse.squeeze(-1)
- return out.to(self._out_dtype), lse.to(self._lse_dtype)
+ out = self._out.to(self._out_dtype)
+ if self._should_lse_squeeze:
+ lse = self._lse.squeeze(-1).to(self._lse_dtype)
+ else:
+ lse = self._lse.to(self._lse_dtype)
+ return out, lse


class _AttentionOp(Protocol):
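
To make the dimension handling concrete: a flash-style LSE without the trailing dimension needs the unsqueeze before it can be combined with the attention output, while a cuDNN-style LSE that already carries a trailing singleton dimension must be left untouched. A small illustration with assumed shapes (B, H, S, D values below are placeholders, not taken from the patch):

    import torch

    out = torch.randn(2, 4, 8, 16)        # attention output, e.g. [B, H, S, D]
    lse_flash = torch.randn(2, 4, 8)      # LSE without trailing dim -> needs unsqueeze
    lse_cudnn = torch.randn(2, 4, 8, 1)   # LSE whose last dim is preserved -> leave as-is

    for lse in (lse_flash, lse_cudnn):
        if lse.dim() < out.dim():
            lse = lse.unsqueeze(-1)       # only add the dimension when it is missing
        assert lse.dim() == out.dim()     # both paths end up broadcast-compatible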
@@ -0,0 +1,55 @@
From 6702f545d880fd82700811e4a3508cdd76da9a69 Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Tue, 16 Sep 2025 17:37:06 +0000
Subject: [PATCH] Restore environment after NcclUserBufferRegistrationTest
(#163063)

This test sets "NCCL_ALGO=NVLS" in NcclUserBufferRegistrationTest, which affects tests run later in the same process, such as `test_on_completion_hook_*`, which then fail with
> invalid usage (run with NCCL_DEBUG=WARN for details), NCCL version 2.26.2
> ncclInvalidUsage: This usually reflects invalid usage of NCCL library.
> Last error:
> Error : no algorithm/protocol available for function Broadcast with datatype ncclInt8. NCCL_ALGO was set to NVLS.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163063
Approved by: https://github.com/ezyang
---
test/distributed/test_c10d_nccl.py | 21 +++++++++++++--------
1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py
index 0d55845228da..f44394e3148c 100644
--- a/test/distributed/test_c10d_nccl.py
+++ b/test/distributed/test_c10d_nccl.py
@@ -3145,19 +3145,24 @@ def test_invalid_nccl_blocking_wait_env(self):
class NcclUserBufferRegistrationTest(MultiProcessTestCase):
def setUp(self):
super().setUp()
- # TORCH_NCCL_BLOCKING_WAIT overrides TORCH_NCCL_ASYNC_ERROR_HANDLING hence tests
- # that use TORCH_NCCL_BLOCKING_WAIT will test it as expected.
- os.environ["TORCH_NCCL_ASYNC_ERROR_HANDLING"] = "1"
nccl_debug_file = tempfile.NamedTemporaryFile()
- os.environ["NCCL_ALGO"] = "NVLS"
- os.environ["NCCL_DEBUG"] = "INFO"
- os.environ["NCCL_DEBUG_SUBSYS"] = "NVLS"
+ nccl_env = {
+ # TORCH_NCCL_BLOCKING_WAIT overrides TORCH_NCCL_ASYNC_ERROR_HANDLING hence tests
+ # that use TORCH_NCCL_BLOCKING_WAIT will test it as expected.
+ "TORCH_NCCL_ASYNC_ERROR_HANDLING": "1",
+ "NCCL_ALGO": "NVLS",
+ "NCCL_DEBUG": "INFO",
+ "NCCL_DEBUG_SUBSYS": "NVLS",
+ "NCCL_DEBUG_FILE": nccl_debug_file.name,
+ }
if torch.cuda.nccl.version() >= (2, 24, 3):
- os.environ["NCCL_DEBUG_SUBSYS"] = "REG,TUNING"
- os.environ["NCCL_DEBUG_FILE"] = nccl_debug_file.name
+ nccl_env["NCCL_DEBUG_SUBSYS"] = "REG,TUNING"
+ self.env_patcher = mock.patch.dict(os.environ, nccl_env)
+ self.env_patcher.start()
self._spawn_processes()

def tearDown(self):
+ self.env_patcher.stop()
super().tearDown()
try:
os.remove(self.file_name)
@@ -0,0 +1,33 @@
PadMMTest.test_exclude_padding fails on H100 with
> self.assertTrue(len(local_cache) == 2)
> AssertionError: False is not true

Increasing the size triggers the intended code path.
See https://github.com/pytorch/pytorch/pull/169177

Author: Alexander Grund (TU Dresden)

diff --git a/test/inductor/test_pad_mm.py b/test/inductor/test_pad_mm.py
--- a/test/inductor/test_pad_mm.py
+++ b/test/inductor/test_pad_mm.py
@@ -425,7 +426,10 @@ class PadMMTest(TestCase):
def mm(a, b):
return a @ b

- mm(torch.rand([25, 25], device="cuda"), torch.rand([25, 25], device="cuda"))
+ # Size must be big enough such that `is_mm_compute_bound` returns True and we need padding to 4 elements
+ # machine balance is ~8.3 (A100), 14.1 (H100), size must be 3x that, see arithmetic_intensity for M=N=K
+ size = [59, 59]
+ mm(torch.rand(size, device="cuda"), torch.rand(size, device="cuda"))
local_cache = get_pad_cache().get_local_cache()
self.assertTrue(len(local_cache) == 2)
FileCheck().check_count("exclude_pad:False", 2, exactly=True).run(
@@ -436,7 +440,7 @@ class PadMMTest(TestCase):
def mm(a, b):
return (a + 1) @ b

- mm(torch.rand([25, 25], device="cuda"), torch.rand([25, 25], device="cuda"))
+ mm(torch.rand(size, device="cuda"), torch.rand(size, device="cuda"))
local_cache = get_pad_cache().get_local_cache()
# reuse original base timing
self.assertTrue(len(local_cache) == 3)
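
A hedged reading of the size comment above, assuming the heuristic's arithmetic intensity for an M = N = K = n matmul reduces to n^3 / (3 * n^2) = n / 3 (this is an interpretation consistent with the quoted machine balances, not the verbatim `is_mm_compute_bound` code):

    # Illustrative only: why 25 was too small on H100 while 59 is large enough,
    # under the assumed n/3 model for a square matmul.
    def arithmetic_intensity(n):
        return (n * n * n) / (3 * n * n)   # = n / 3

    print(arithmetic_intensity(25))  # ~8.3  -> around the A100 balance, below H100's ~14.1
    print(arithmetic_intensity(59))  # ~19.7 -> above both, so the matmul counts as compute-bound

59 is also not a multiple of 4, so the padding behaviour the test checks is still exercised.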
@@ -0,0 +1,27 @@
TestSaveLoad.test_version_error fails because TMPDIR is set by EasyBuild:

> Ran into the following error when deserializing: [enforce fail at inline_container.cc:332] . file in archive is not in a subdirectory tmpi40i4vmn/: easybuild-tmp/archive_version

Fix the code to handle that; see https://github.com/pytorch/pytorch/pull/169936

diff --git a/test/export/test_serialize.py b/test/export/test_serialize.py
index faef9b455a0..e3a463014fb 100644
--- a/test/export/test_serialize.py
+++ b/test/export/test_serialize.py
@@ -7,6 +7,7 @@ with test_sym_bool)
import copy
import io
import math
+import os
import tempfile
import unittest
import zipfile
@@ -1915,7 +1916,7 @@ class TestSaveLoad(TestCase):
with tempfile.NamedTemporaryFile(suffix=".pt2") as f:
save(ep, f.name)
f.seek(0)
- file_prefix = f.name.split("/")[2].split(".")[0]
+ file_prefix = os.path.splitext(os.path.basename(f.name))[0]

# Create a new file and copy things over, but modify the
# archive version
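
The index-based parsing that the patch replaces only works when the temporary file lives directly under a two-component path such as /tmp/; with EasyBuild's deeper temporary directory the hard-coded index picks a directory name instead of the file prefix. A quick standalone comparison (the path below is made up for illustration):

    import os

    name = "/tmp/easybuild-tmp/tmpi40i4vmn/archive.pt2"    # hypothetical deep temp path
    print(name.split("/")[2].split(".")[0])                # 'easybuild-tmp' -> wrong prefix
    print(os.path.splitext(os.path.basename(name))[0])     # 'archive'       -> intended prefix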
@@ -0,0 +1,29 @@
Avoid PyTorch trying to use $HOME if XDG_CACHE_HOME is set.
See https://github.com/pytorch/pytorch/pull/168232

Author: Alexander Grund (TU Dresden)

diff --git a/torch/csrc/distributed/c10d/FlightRecorder.cpp b/torch/csrc/distributed/c10d/FlightRecorder.cpp
--- a/torch/csrc/distributed/c10d/FlightRecorder.cpp
+++ b/torch/csrc/distributed/c10d/FlightRecorder.cpp
@@ -36,8 +36,18 @@ DebugInfoWriter& DebugInfoWriter::getWriter(int rank) {
if (writer_ == nullptr) {
// Attempt to write to running user's HOME directory cache folder - if it
// exists.
- auto homeDir = getCvarString({"HOME"}, "/tmp");
- auto cacheDirPath = std::filesystem::path(homeDir + "/.cache/torch");
+ #ifdef _WIN32
+ const char* cacheHome = nullptr;
+ #else
+ // Uses XDG_CACHE_HOME if it's set
+ const char* cacheHome = std::getenv("XDG_CACHE_HOME");
+ #endif
+ std::string cacheRoot;
+ if (cacheHome)
+ cacheRoot = cacheHome;
+ else
+ cacheRoot = getCvarString({"HOME"}, "/tmp") + "/.cache";
+ auto cacheDirPath = std::filesystem::path(cacheRoot + "/torch");
// Create the .cache directory if it doesn't exist
std::filesystem::create_directories(cacheDirPath);
auto defaultLocation = cacheDirPath / "comm_lib_trace_rank_";