Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions easybuild/easyconfigs/c/cuDNN/cuDNN-9.5.1.17-CUDA-12.6.0.eb
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# cuDNN 9.5.1.17 for CUDA 12.6.0; no 'easyblock' line, so EasyBuild selects the
# cuDNN-specific easyblock by name (which also provides the %(cudnnarch)s template).
name = 'cuDNN'
version = '9.5.1.17'
# CUDA version is part of the module name so builds against different CUDA versions can coexist
versionsuffix = '-CUDA-%(cudaver)s'
homepage = 'https://developer.nvidia.com/cudnn'
description = """The NVIDIA CUDA Deep Neural Network library (cuDNN) is
a GPU-accelerated library of primitives for deep neural networks."""

# prebuilt vendor binaries are repackaged as-is, so no compiler toolchain is needed
toolchain = SYSTEM

source_urls = [
'https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-%(cudnnarch)s/'
]
# note: cuDNN is tied to specific CUDA versions,
# see also https://docs.nvidia.com/deeplearning/cudnn/support-matrix/index.html#cudnn-cuda-hardware-versions
sources = ['%(namelower)s-linux-%(cudnnarch)s-%(version)s_cuda%(cudamajver)s-archive.tar.xz']
# one checksum entry per supported architecture; the dict is keyed by the resolved
# source filename ('sbsa' for 64-bit ARM, 'x86_64' otherwise), so the right one is picked
checksums = [{
'%(namelower)s-linux-sbsa-%(version)s_cuda%(cudamajver)s-archive.tar.xz':
'340c49b32c133b0321c5c5b00d14fb64887dcac83ee8fd24195d9191061f1ad7',
'%(namelower)s-linux-x86_64-%(version)s_cuda%(cudamajver)s-archive.tar.xz':
'35dd20b9c68324ae1288ac36f66ab1f318d2bfecfafb703a82617aa283272be4',
}]

# must match the CUDA major version baked into the source filename above
dependencies = [('CUDA', '12.6.0')]

# check for the main header plus the static libraries shipped in this release
sanity_check_paths = {
'files': [
'include/cudnn.h', 'lib64/libcudnn_adv_static.a', 'lib64/libcudnn_cnn_static.a',
'lib64/libcudnn_engines_precompiled_static.a', 'lib64/libcudnn_engines_runtime_compiled_static.a',
'lib64/libcudnn_graph_static.a', 'lib64/libcudnn_heuristic_static.a', 'lib64/libcudnn_ops_static.a',
],
'dirs': ['include', 'lib64'],
}

moduleclass = 'numlib'
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# cuSPARSELt ships as a plain archive of prebuilt binaries, so the generic
# Tarball easyblock (unpack + copy) is sufficient.
easyblock = 'Tarball'

name = 'cuSPARSELt'
version = '0.6.3.2'
# CUDA version is part of the module name so builds against different CUDA versions can coexist
versionsuffix = '-CUDA-%(cudaver)s'

homepage = 'https://docs.nvidia.com/cuda/cusparselt/index.html'
description = """NVIDIA cuSPARSELt is a high-performance CUDA library dedicated to general matrix-matrix operations in
which at least one operand is a sparse matrix"""

# prebuilt vendor binaries, so no compiler toolchain is needed
toolchain = SYSTEM

# map EasyBuild's ARCH value onto NVIDIA's download-directory naming:
# NVIDIA uses 'sbsa' for 64-bit ARM; every other architecture maps to itself
local_arch = {'arm64': 'sbsa', 'aarch64': 'sbsa'}.get(ARCH, ARCH)
source_urls = ['https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-%s/' % local_arch]
sources = ['libcusparse_lt-linux-%s-%%(version)s-archive.tar.xz' % local_arch]
# one checksum entry per supported architecture, keyed by the resolved source filename
checksums = [{
'libcusparse_lt-linux-x86_64-%(version)s-archive.tar.xz':
'a2f856e78943f5c538bdef1c9edc64a5ed30bf8bb7d5fcb615c684ffe776cc31',
'libcusparse_lt-linux-sbsa-%(version)s-archive.tar.xz':
'3e420ddbff4eb9ac603f57c7aa8b3d5271112816e244eb55ef9f30c4eb6a04b7',
}]

dependencies = [('CUDA', '12.6.0')]

# check for the public header plus both the shared and static libraries
sanity_check_paths = {
'files': ['include/cusparseLt.h',
'lib/libcusparseLt.%s' % SHLIB_EXT,
'lib/libcusparseLt_static.a'],
'dirs': [],
}

moduleclass = 'lib'
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# NCCL 2.26.2 built from source with GCCcore 13.3.0; no 'easyblock' line,
# so EasyBuild selects the NCCL-specific easyblock by name.
name = 'NCCL'
version = '2.26.2'
# CUDA version is part of the module name so builds against different CUDA versions can coexist
versionsuffix = '-CUDA-%(cudaver)s'

homepage = 'https://developer.nvidia.com/nccl'
description = """The NVIDIA Collective Communications Library (NCCL) implements multi-GPU and multi-node collective
communication primitives that are performance optimized for NVIDIA GPUs."""

toolchain = {'name': 'GCCcore', 'version': '13.3.0'}

# GITHUB_SOURCE resolves to the release download URL for the NVIDIA/nccl repository
github_account = 'NVIDIA'
source_urls = [GITHUB_SOURCE]
# upstream tags releases as v<version>-<patchlevel>
sources = ['v%(version)s-1.tar.gz']
checksums = ['74c6ab40c864d79c2139508e9419de5970cb406ec85f001d5f834d5f5c0c4f3b']

builddependencies = [('binutils', '2.42')]

dependencies = [
('CUDA', '12.6.0', '', SYSTEM),
('UCX-CUDA', '1.16.0', versionsuffix),
]

# default CUDA compute capabilities to use (override via --cuda-compute-capabilities)
cuda_compute_capabilities = ['5.0', '6.0', '7.0', '7.5', '8.0', '8.6', '9.0']

moduleclass = 'lib'
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
Avoid tripping on //caffe2/test/cpp/jit:test_custom_class_registrations with IS_SANDCASTLE

Author: Alexander Grund (TU Dresden)
--- a/torch/testing/_internal/torchbind_impls.py
+++ b/torch/testing/_internal/torchbind_impls.py
@@ -116,8 +116,6 @@ def load_torchbind_test_lib():

if IS_MACOS:
raise unittest.SkipTest("non-portable load_library call used in test")
- elif IS_SANDCASTLE or IS_FBCODE:
- lib_file_path = Path("//caffe2/test/cpp/jit:test_custom_class_registrations")
elif IS_WINDOWS:
lib_file_path = find_library_location("torchbind_test.dll")
else:
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
Don't checkout NCCL when using system NCCL

Author: Alexander Grund (TU Dresden)

diff --git a/tools/build_pytorch_libs.py b/tools/build_pytorch_libs.py
index 5dd5a221975..2b8b868eaa8 100644
--- a/tools/build_pytorch_libs.py
+++ b/tools/build_pytorch_libs.py
@@ -7,7 +7,12 @@ from glob import glob
from pathlib import Path

from .setup_helpers.cmake import CMake, USE_NINJA
-from .setup_helpers.env import check_negative_env_flag, IS_64BIT, IS_WINDOWS
+from .setup_helpers.env import (
+ check_env_flag,
+ check_negative_env_flag,
+ IS_64BIT,
+ IS_WINDOWS,
+)


repo_root = Path(__file__).absolute().parent.parent
@@ -119,7 +124,12 @@ def build_pytorch(
cmake: CMake,
) -> None:
my_env = _create_build_env()
- checkout_nccl()
+ if (
+ not check_negative_env_flag("USE_CUDA")
+ and not check_negative_env_flag("USE_NCCL")
+ and not check_env_flag("USE_SYSTEM_NCCL")
+ ):
+ checkout_nccl()
build_test = not check_negative_env_flag("BUILD_TEST")
cmake.generate(
version, cmake_python_library, build_python, build_test, my_env, rerun_cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
If there are no GPUs, WORLD_SIZE would be set to 0, which doesn't work.
Use a positive number for the NCCL/GLOO tests in that case.

See https://github.com/pytorch/pytorch/pull/150764

Author: Alexander Grund (TU Dresden)
diff --git a/test/run_test.py b/test/run_test.py
index a508d8db4d2..e7bbe6ea086 100755
--- a/test/run_test.py
+++ b/test/run_test.py
@@ -610,18 +610,19 @@ DISTRIBUTED_TESTS_CONFIG = {}


if dist.is_available():
+ num_gpus = torch.cuda.device_count()
DISTRIBUTED_TESTS_CONFIG["test"] = {"WORLD_SIZE": "1"}
if not TEST_WITH_ROCM and dist.is_mpi_available():
DISTRIBUTED_TESTS_CONFIG["mpi"] = {
"WORLD_SIZE": "3",
}
- if dist.is_nccl_available():
+ if dist.is_nccl_available() and num_gpus > 0:
DISTRIBUTED_TESTS_CONFIG["nccl"] = {
- "WORLD_SIZE": f"{torch.cuda.device_count()}",
+ "WORLD_SIZE": f"{num_gpus}",
}
- if dist.is_gloo_available():
+ if dist.is_gloo_available() and num_gpus > 0:
DISTRIBUTED_TESTS_CONFIG["gloo"] = {
# TODO: retire testing gloo with CUDA
- "WORLD_SIZE": f"{torch.cuda.device_count()}",
+ "WORLD_SIZE": f"{num_gpus}",
}
# Test with UCC backend is deprecated.
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
The decorators are implemented to run when the test function is called, which is after
the test `setup` method has spawned subprocesses; those subprocesses may use NCCL to sync
and fail when there are not enough GPUs available.
So replace the custom code with calls to the `unittest` skip decorators.
See https://github.com/pytorch/pytorch/pull/109491

Author: Alexander Grund (TU Dresden)
diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py
index d34b1ffdb0a..8f9628f209b 100644
--- a/torch/testing/_internal/common_distributed.py
+++ b/torch/testing/_internal/common_distributed.py
@@ -155,17 +155,7 @@ def skip_if_odd_worldsize(func):


def require_n_gpus_for_nccl_backend(n, backend):
- def decorator(func):
- @wraps(func)
- def wrapper(*args, **kwargs):
- if backend == "nccl" and torch.cuda.device_count() < n:
- sys.exit(TEST_SKIPS[f"multi-gpu-{n}"].exit_code)
- else:
- return func(*args, **kwargs)
-
- return wrapper
-
- return decorator
+ return unittest.skipUnless(at_least_x_gpu(n), TEST_SKIPS[f"multi-gpu-{n}"].message) if backend == "nccl" else unittest.skipIf(False, None)


def import_transformers_or_skip():
@@ -197,20 +187,10 @@ def at_least_x_gpu(x):


def skip_if_lt_x_gpu(x):
- def decorator(func):
- @wraps(func)
- def wrapper(*args, **kwargs):
- if torch.cuda.is_available() and torch.cuda.device_count() >= x:
- return func(*args, **kwargs)
- if TEST_HPU and torch.hpu.device_count() >= x:
- return func(*args, **kwargs)
- if TEST_XPU and torch.xpu.device_count() >= x:
- return func(*args, **kwargs)
- sys.exit(TEST_SKIPS[f"multi-gpu-{x}"].exit_code)
-
- return wrapper
-
- return decorator
+ return unittest.skipUnless(torch.cuda.device_count() >= x or (
+ TEST_HPU and torch.hpu.device_count() >= x) or (
+ TEST_XPU and torch.xpu.device_count() >= x),
+ TEST_SKIPS[f"multi-gpu-{x}"].message)


# This decorator helps avoiding initializing cuda while testing other backends
diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py
index a4d6d53b975..0da1d9baddf 100644
--- a/torch/testing/_internal/distributed/distributed_test.py
+++ b/torch/testing/_internal/distributed/distributed_test.py
@@ -66,7 +66,6 @@ from torch.testing._internal.common_distributed import (
skip_if_small_worldsize,
skip_if_odd_worldsize,
skip_if_lt_x_gpu,
- nccl_skip_if_lt_x_gpu,
skip_if_no_gpu,
require_n_gpus_for_nccl_backend,
requires_nccl_version,
@@ -5299,7 +5298,7 @@ class DistributedTest:
BACKEND != "mpi" and BACKEND != "nccl" and BACKEND != "gloo",
"get_future is only supported on mpi, nccl and gloo",
)
- @nccl_skip_if_lt_x_gpu(BACKEND, 2)
+ @require_n_gpus_for_nccl_backend(2, BACKEND)
def test_accumulate_gradients_no_sync(self):
"""
Runs _test_accumulate_gradients_no_sync using default inputs
@@ -5310,7 +5309,7 @@ class DistributedTest:
BACKEND != "mpi" and BACKEND != "nccl" and BACKEND != "gloo",
"get_future is only supported on mpi, nccl and gloo",
)
- @nccl_skip_if_lt_x_gpu(BACKEND, 2)
+ @require_n_gpus_for_nccl_backend(2, BACKEND)
def test_accumulate_gradients_no_sync_grad_is_view(self):
"""
Runs _test_accumulate_gradients_no_sync using default inputs
@@ -5321,7 +5320,7 @@ class DistributedTest:
BACKEND != "mpi" and BACKEND != "nccl" and BACKEND != "gloo",
"get_future is only supported on mpi, nccl and gloo",
)
- @nccl_skip_if_lt_x_gpu(BACKEND, 2)
+ @require_n_gpus_for_nccl_backend(2, BACKEND)
def test_accumulate_gradients_no_sync_allreduce_hook(self):
"""
Runs multiple iterations on _test_accumulate_gradients_no_sync
@@ -5349,7 +5348,7 @@ class DistributedTest:
BACKEND != "mpi" and BACKEND != "nccl" and BACKEND != "gloo",
"get_future is only supported on mpi, nccl and gloo",
)
- @nccl_skip_if_lt_x_gpu(BACKEND, 2)
+ @require_n_gpus_for_nccl_backend(2, BACKEND)
def test_accumulate_gradients_no_sync_allreduce_with_then_hook(self):
"""
Runs multiple iterations on _test_accumulate_gradients_no_sync using allreduce
@@ -5383,7 +5382,7 @@ class DistributedTest:
BACKEND != "mpi" and BACKEND != "nccl" and BACKEND != "gloo",
"get_future is only supported on mpi, nccl and gloo",
)
- @nccl_skip_if_lt_x_gpu(BACKEND, 2)
+ @require_n_gpus_for_nccl_backend(2, BACKEND)
def test_get_future(self):
def mult(fut):
return [t * 3 for t in fut.wait()]
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
TestSelectAlgorithmCPU.test_linear_with_embedding fails when the CPU does not support BF16:
> torch._inductor.exc.InductorError: LoweringException: RuntimeError: self and mat2 must have the same dtype, but got Float and BFloat16
See https://github.com/pytorch/pytorch/issues/147104

Convert the embedding layer to avoid it using "Float" and adapt the check for this change.

Author: Alexander Grund (TU Dresden)
--- a/test/inductor/test_cpu_select_algorithm.py
+++ b/test/inductor/test_cpu_select_algorithm.py
@@ -932,6 +932,7 @@ class TestSelectAlgorithm(BaseTestSelectAlgorithm):
def test_linear_with_embedding(
self, batch_size, in_features, out_features, bias, dtype
):
+ has_bf16 = torch.ops.mkldnn._is_mkldnn_bf16_supported()
class M(torch.nn.Module):
def __init__(self, bias):
super().__init__()
@@ -939,6 +940,9 @@ class TestSelectAlgorithm(BaseTestSelectAlgorithm):
dtype=dtype
)
self.emb = torch.nn.Embedding(64, out_features)
+ if not has_bf16:
+ self.emb = self.emb.to(dtype=dtype)
+

def forward(self, idx, x):
return self.emb(idx) + self.linear(x)
@@ -953,7 +957,7 @@ class TestSelectAlgorithm(BaseTestSelectAlgorithm):
with verify(dtype) as (atol, rtol):
self.common(mod, (idx, x), atol=atol, rtol=rtol)
self.assertEqual(counters["inductor"]["cpp_templated_kernel_counter"], 1)
- self.assertEqual(counters["inductor"]["cpp_epilogue_fusion_counter"], 1)
+ self.assertEqual(counters["inductor"]["cpp_epilogue_fusion_counter"], 1 if has_bf16 else 0)

@inductor_config.patch({"freezing": True})
@patches
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
This test fails when FlexiBLAS is used instead of MKL.
Adjust the expected count.
See https://github.com/pytorch/pytorch/pull/151548

Author: Alexander Grund (TU Dresden)
--- a/test/inductor/test_cpu_select_algorithm.py
+++ b/test/inductor/test_cpu_select_algorithm.py
@@ -1300,7 +1304,7 @@ class TestSelectAlgorithm(BaseTestSelectAlgorithm):
rtol=rtol,
)
self.assertEqual(counters["inductor"]["cpp_templated_kernel_counter"], 2)
- self.assertEqual(counters["inductor"]["cpp_epilogue_fusion_counter"], 2)
+ self.assertEqual(counters["inductor"]["cpp_epilogue_fusion_counter"], 2 if TEST_MKL else 1)

@inductor_config.patch({"freezing": True})
@patches

Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
The test often times out and seems to be considered flaky by PyTorch:
https://github.com/pytorch/pytorch/issues/78068

Author: Alexander Grund (TU Dresden)
diff --git a/test/distributed/_shard/sharded_tensor/test_sharded_tensor.py b/test/distributed/_shard/sharded_tensor/test_sharded_tensor.py
index 730b2c2c0ac..5f9b9545700 100644
--- a/test/distributed/_shard/sharded_tensor/test_sharded_tensor.py
+++ b/test/distributed/_shard/sharded_tensor/test_sharded_tensor.py
@@ -6,6 +6,7 @@ import itertools
import math
import pickle
import sys
+from unittest import skip

import torch
import torch.distributed as dist
@@ -2432,6 +2432,7 @@ class TestShardedTensorFromLocalShards(ShardedTensorTestBase):
@with_comms
@skip_if_lt_x_gpu(4)
@requires_nccl()
+ @skip("Times out often")
def test_init_from_local_shards(self):
local_shard_metadata = ShardMetadata(
shard_offsets=[(self.rank // 2) * 5, (self.rank % 2) * 5],
Loading