Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions easybuild/easyconfigs/c/CUDA/CUDA-11.5.2.eb
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
name = 'CUDA'
version = '11.5.2'
local_nv_version = '495.29.05'

homepage = 'https://developer.nvidia.com/cuda-toolkit'
description = """CUDA (formerly Compute Unified Device Architecture) is a parallel
computing platform and programming model created by NVIDIA and implemented by the
graphics processing units (GPUs) that they produce. CUDA gives developers access
to the virtual instruction set and memory of the parallel computational elements in CUDA GPUs."""

toolchain = SYSTEM

source_urls = ['https://developer.download.nvidia.com/compute/cuda/%(version)s/local_installers/']
sources = ['cuda_%%(version)s_%s_linux%%(cudaarch)s.run' % local_nv_version]
checksums = [
{
'cuda_%%(version)s_%s_linux.run' % local_nv_version:
'74959abf02bcba526f0a3aae322c7641b25da040ccd6236d07038f81997b73a6',
'cuda_%%(version)s_%s_linux_ppc64le.run' % local_nv_version:
'45c468f430436b3e95d5e485a6ba0ec1fa2b23dc6c551c1307b79996ecf0a7ed',
'cuda_%%(version)s_%s_linux_sbsa.run' % local_nv_version:
'31337c8bdc224fa1bd07bc4b6a745798392428118cc8ea0fa4446ee4ad47dd30',
}
]

moduleclass = 'system'
41 changes: 41 additions & 0 deletions easybuild/easyconfigs/c/cuDNN/cuDNN-8.4.1.50-CUDA-11.5.2.eb
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
name = 'cuDNN'
version = '8.4.1.50'
versionsuffix = '-CUDA-%(cudaver)s'

homepage = 'https://developer.nvidia.com/cudnn'
description = """The NVIDIA CUDA Deep Neural Network library (cuDNN) is
a GPU-accelerated library of primitives for deep neural networks."""

toolchain = SYSTEM

# note: cuDNN 8.4.1 is not specific to CUDA 11.6,
# see also https://docs.nvidia.com/deeplearning/cudnn/support-matrix/index.html#cudnn-cuda-hardware-versions
local_short_ver = '.'.join(version.split('.')[:3])
source_urls = [
'https://developer.download.nvidia.com/compute/redist/cudnn/v%s/local_installers/11.6/' % local_short_ver,
]
sources = ['%(namelower)s-linux-%(cudnnarch)s-%(version)s_cuda11.6-archive.tar.xz']
checksums = [
{
'%(namelower)s-linux-x86_64-%(version)s_cuda11.6-archive.tar.xz':
'ec96d2376d81fca42bdd3d4c3d705a99b29a065bab57f920561c763e29c67d01',
'%(namelower)s-linux-ppc64le-%(version)s_cuda11.6-archive.tar.xz':
'8b806cbfdc81352bf76716d1e53b42537665d110c6ffc068be910505c10e1b98',
'%(namelower)s-linux-sbsa-%(version)s_cuda11.6-archive.tar.xz':
'0b1b9fac5b78974e2fdaaa74843db18f636ce8f3d999d62ff2a615b9978fc360',
}
]

dependencies = [('CUDA', '11.5.2')]

sanity_check_paths = {
'files': [
'include/cudnn.h', 'lib64/libcudnn_adv_infer_static.a', 'lib64/libcudnn_adv_train_static.a',
'lib64/libcudnn_cnn_infer_static.a', 'lib64/libcudnn_cnn_train_static.a',
'lib64/libcudnn_ops_infer_static.a', 'lib64/libcudnn_ops_train_static.a',
'lib64/libcudnn.%s' % SHLIB_EXT
],
'dirs': ['include', 'lib64'],
}

moduleclass = 'numlib'
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
easyblock = "CMakeMake"

name = 'magma'
version = '2.6.2'
versionsuffix = '-CUDA-%(cudaver)s'

homepage = 'https://icl.cs.utk.edu/magma/'
description = """The MAGMA project aims to develop a dense linear algebra library similar to
LAPACK but for heterogeneous/hybrid architectures, starting with current Multicore+GPU systems."""

toolchain = {'name': 'foss', 'version': '2021b'}
toolchainopts = {'pic': True, 'openmp': True}

source_urls = ['https://icl.cs.utk.edu/projectsfiles/magma/downloads/']
sources = [SOURCE_TAR_GZ]
patches = ['magma-2.6.1_allow-all-sms.patch']
checksums = [
'75b554dab00903e2d10b972c913e50e7f88cbc62f3ae432b5a086c7e4eda0a71', # magma-2.6.2.tar.gz
'b89285bac007b68e88e3b5ddbb7f94dbc8a9d77590e58c352e477574d8dca738', # magma-2.6.1_allow-all-sms.patch
]

builddependencies = [
('CMake', '3.21.1'),
]

dependencies = [
('CUDA', '11.5.2', '', SYSTEM),
]

# default CUDA compute capabilities to use (override via --cuda-compute-capabilities)
cuda_compute_capabilities = ['3.5', '5.0', '6.0', '7.0', '7.5', '8.0', '8.6']
# make sure both static and shared libs are built
configopts = [
'-DBUILD_SHARED_LIBS=%s -DGPU_TARGET="%%(cuda_sm_space_sep)s" ' % local_shared for local_shared in ('ON', 'OFF')
]

sanity_check_paths = {
'files': ['lib/libmagma.%s' % SHLIB_EXT, 'lib/libmagma.a'],
'dirs': ['include'],
}

moduleclass = 'math'
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
name = 'NCCL'
version = '2.10.3'
versionsuffix = '-CUDA-%(cudaver)s'

homepage = 'https://developer.nvidia.com/nccl'
description = """The NVIDIA Collective Communications Library (NCCL) implements multi-GPU and multi-node collective
communication primitives that are performance optimized for NVIDIA GPUs."""

toolchain = {'name': 'GCCcore', 'version': '11.2.0'}

github_account = 'NVIDIA'
source_urls = [GITHUB_SOURCE]
sources = ['v%(version)s-1.tar.gz']
checksums = ['55de166eb7dcab9ecef2629cdb5fb0c5ebec4fae03589c469ebe5dcb5716b3c5']

builddependencies = [('binutils', '2.37')]

dependencies = [
('CUDA', '11.5.2', '', SYSTEM),
('UCX-CUDA', '1.11.2', versionsuffix),
]

moduleclass = 'lib'
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
name = 'PyTorch'
version = '1.12.1'
versionsuffix = '-CUDA-%(cudaver)s'

homepage = 'https://pytorch.org/'
description = """Tensors and Dynamic neural networks in Python with strong GPU acceleration.
PyTorch is a deep learning framework that puts Python first."""

toolchain = {'name': 'foss', 'version': '2021b'}

source_urls = [GITHUB_RELEASE]
sources = ['%(namelower)s-v%(version)s.tar.gz']

patches = [
'PyTorch-1.7.0_avoid-nan-in-test-torch.patch',
'PyTorch-1.7.0_disable-dev-shm-test.patch',
'PyTorch-1.10.0_fix-kineto-crash.patch',
'PyTorch-1.10.0_fix-test-dataloader-fixed-affinity.patch',
'PyTorch-1.10.0_fix-test-model_dump.patch',
'PyTorch-1.10.0_fix-vsx-vector-functions.patch',
'PyTorch-1.10.0_skip-nnapi-test-without-qnnpack.patch',
'PyTorch-1.11.0_fix-fsdp-fp16-test.patch',
'PyTorch-1.11.0_fix-test_utils.patch',
'PyTorch-1.11.0_increase_c10d_gloo_timeout.patch',
'PyTorch-1.11.0_increase-distributed-test-timeout.patch',
'PyTorch-1.11.0_install-vsx-vec-headers.patch',
'PyTorch-1.11.1_skip-test_init_from_local_shards.patch',
'PyTorch-1.12.1_fix-autograd-thread_shutdown-test.patch',
'PyTorch-1.12.1_fix-cuda-gcc-version-check.patch',
'PyTorch-1.12.1_fix-skip-decorators.patch',
'PyTorch-1.12.1_fix-test_cpp_extensions_jit.patch',
'PyTorch-1.12.1_fix-test_wishart_log_prob.patch',
'PyTorch-1.12.1_fix-TestCudaFuser.test_unary_ops.patch',
'PyTorch-1.12.1_fix-TestTorch.test_to.patch',
'PyTorch-1.12.1_fix-use-after-free-in-tensorpipe-agent.patch',
'PyTorch-1.12.1_fix-vsx-vector-funcs.patch',
'PyTorch-1.12.1_fix-vsx-loadu.patch',
'PyTorch-1.12.1_increase-test-adadelta-tolerance.patch',
'PyTorch-1.12.1_increase-tolerance-test_ops.patch',
'PyTorch-1.12.1_no-cuda-stubs-rpath.patch',
'PyTorch-1.12.1_python-3.10-annotation-fix.patch',
'PyTorch-1.12.1_python-3.10-compat.patch',
'PyTorch-1.12.1_remove-flaky-test-in-testnn.patch',
'PyTorch-1.12.1_skip-ao-sparsity-test-without-fbgemm.patch',
'PyTorch-1.12.1_skip-failing-grad-test.patch',
'PyTorch-1.12.1_skip-test_round_robin.patch',
]
checksums = [
'031c71073db73da732b5d01710220564ce6dd88d812ba053f0cc94296401eccb', # pytorch-v1.12.1.tar.gz
'b899aa94d9e60f11ee75a706563312ccefa9cf432756c470caa8e623991c8f18', # PyTorch-1.7.0_avoid-nan-in-test-torch.patch
'622cb1eaeadc06e13128a862d9946bcc1f1edd3d02b259c56a9aecc4d5406b8a', # PyTorch-1.7.0_disable-dev-shm-test.patch
# PyTorch-1.10.0_fix-kineto-crash.patch
'dc467333b28162149af8f675929d8c6bf219f23230bfc0d39af02ba4f6f882eb',
# PyTorch-1.10.0_fix-test-dataloader-fixed-affinity.patch
'313dca681f45ce3bc7c4557fdcdcbe0b77216d2c708fa30a2ec0e22c44876707',
# PyTorch-1.10.0_fix-test-model_dump.patch
'339148ae1a028cda6e750ac93fa38a599f66c7abe26586c9219f1a206ea14557',
# PyTorch-1.10.0_fix-vsx-vector-functions.patch
'7bef5f96cb83b2d655d2f76dd7468a171d446f0b3e06da2232ec7f886484d312',
# PyTorch-1.10.0_skip-nnapi-test-without-qnnpack.patch
'34ba476a7bcddec323bf9eca083cb4623d0f569d081aa3add3769c24f22849d2',
'bb1c4e6d6fd4b0cf57ff8b824c797331b533bb1ffc63f5db0bae3aee10c3dc13', # PyTorch-1.11.0_fix-fsdp-fp16-test.patch
'4f7e25c4e2eb7094f92607df74488c6a4a35849fabf05fcf6c3655fa3f44a861', # PyTorch-1.11.0_fix-test_utils.patch
# PyTorch-1.11.0_increase_c10d_gloo_timeout.patch
'20cd4a8663f74ab326fdb032b926bf5c7e94d9750c515ab9050927ba00cf1953',
# PyTorch-1.11.0_increase-distributed-test-timeout.patch
'087ad20163a1291773ae3457569b80523080eb3731e210946459b2333a919f3f',
'f2e6b9625733d9a471bb75e1ea20e28814cf1380b4f9089aa838ee35ddecf07d', # PyTorch-1.11.0_install-vsx-vec-headers.patch
# PyTorch-1.11.1_skip-test_init_from_local_shards.patch
'4aeb1b0bc863d4801b0095cbce69f8794066748f0df27c6aaaf729c5ecba04b7',
# PyTorch-1.12.1_fix-autograd-thread_shutdown-test.patch
'd97cd6b0570a167ecc3e631dc4ea884d95ace285cc38aa980566f4fec2c0d089',
# PyTorch-1.12.1_fix-cuda-gcc-version-check.patch
'a650f4576f06c749f244cada52ff9c02499fa8f182019129488db3845e0756ab',
'e3ca6e42b2fa592ea095939fb59ab875668a058479407db3f3684cc5c6f4146c', # PyTorch-1.12.1_fix-skip-decorators.patch
# PyTorch-1.12.1_fix-test_cpp_extensions_jit.patch
'1efc9850c431d702e9117d4766277d3f88c5c8b3870997c9974971bce7f2ab83',
# PyTorch-1.12.1_fix-test_wishart_log_prob.patch
'cf475ae6e6234b96c8d1bf917597c5176c94b3ccd940b72f2e1cd0c979580f45',
# PyTorch-1.12.1_fix-TestCudaFuser.test_unary_ops.patch
'8e6e844c6b0541e0c8115911ee1a9d548613254b36dfbdada202fd723fc26aa2',
'75f27987c3f25c501e719bd2b1c70a029ae0ee28514a97fe447516aee02b1535', # PyTorch-1.12.1_fix-TestTorch.test_to.patch
# PyTorch-1.12.1_fix-use-after-free-in-tensorpipe-agent.patch
'0bd7e88b92c4c6f0fecf01746009858ba19f2df68b10b88c41485328a531875d',
'caccbf60f62eac313896c1eaec78b08f5d0fdfcb907079087490bb13d1561aa2', # PyTorch-1.12.1_fix-vsx-vector-funcs.patch
'8bfe3c94ada1dd1f7974a1261a8b576fb7ae944050fa1c7830fca033831123b2', # PyTorch-1.12.1_fix-vsx-loadu.patch
# PyTorch-1.12.1_increase-test-adadelta-tolerance.patch
'944ed1af5ad4bbe20cbb042764a88dad1eef6cd33218617cf3d4cd90c6764695',
# PyTorch-1.12.1_increase-tolerance-test_ops.patch
'1c1fa520801e2ee5faf56a3d6dc96321e7c11664fd16bffd7c6ee437e68357fb',
'2905826ca713752b47c84e4ec8b177c90cbd91fca498ba2ba546f495c4cf70a6', # PyTorch-1.12.1_no-cuda-stubs-rpath.patch
# PyTorch-1.12.1_python-3.10-annotation-fix.patch
'11e168fd429d9e156fc79dd806b08125f3640651ad9998abd810446b2ed0c2d7',
'81402420a878b40f824778f0333fbec6504325a6a1b06a22749c4cac3eaccf67', # PyTorch-1.12.1_python-3.10-compat.patch
# PyTorch-1.12.1_remove-flaky-test-in-testnn.patch
'e81b678e354dd137c0d6d974605cdedbf672096fdbdf567c347bc2fbfc73471d',
# PyTorch-1.12.1_skip-ao-sparsity-test-without-fbgemm.patch
'edd464ec8c37b44c07a72008d732604f6837f2dd61c7810c391a86ba4945ca39',
'1c89e7e67287fe6b9a95480a4178d3653b94d0ab2fe68edf227606c8ae548fdc', # PyTorch-1.12.1_skip-failing-grad-test.patch
# PyTorch-1.12.1_skip-test_round_robin.patch
'63d4849b78605aa088fdff695637d9473ea60dee603a3ff7f788690d70c55349',
]

osdependencies = [OS_PKG_IBVERBS_DEV]

builddependencies = [
('CMake', '3.22.1'),
('hypothesis', '6.14.6'),
]

dependencies = [
('CUDA', '11.5.2', '', SYSTEM),
('Ninja', '1.10.2'), # Required for JIT compilation of C++ extensions
('Python', '3.9.6'),
('protobuf', '3.17.3'),
('protobuf-python', '3.17.3'),
('pybind11', '2.7.1'),
('SciPy-bundle', '2021.10'),
('typing-extensions', '3.10.0.2'),
('PyYAML', '5.4.1'),
('MPFR', '4.1.0'),
('GMP', '6.2.1'),
('numactl', '2.0.14'),
('FFmpeg', '4.3.2'),
('Pillow', '8.3.2'),
('cuDNN', '8.4.1.50', '-CUDA-%(cudaver)s', SYSTEM),
('magma', '2.6.2', '-CUDA-%(cudaver)s'),
('NCCL', '2.10.3', '-CUDA-%(cudaver)s'),
('expecttest', '0.1.3'),
]

# default CUDA compute capabilities to use (override via --cuda-compute-capabilities)
cuda_compute_capabilities = ['3.5', '3.7', '5.2', '6.0', '6.1', '7.0', '7.2', '7.5', '8.0', '8.6']

excluded_tests = {
'': [
# This test seems to take too long on NVIDIA Ampere at least.
'distributed/test_distributed_spawn',
# Broken on CUDA 11.5+(?): https://github.com/pytorch/pytorch/issues/75375
'distributions/test_constraints',
# Those 2 abort on some machines. Skip for now
'distributed/fsdp/test_fsdp_input',
'distributed/fsdp/test_fsdp_mixed_precision',
Comment thread
casparvl marked this conversation as resolved.
# Produces a single test failure on some systems
# See https://github.com/easybuilders/easybuild-easyconfigs/issues/17615
'distributed/fsdp/test_fsdp_core',
# failing on broadwell
Comment thread
casparvl marked this conversation as resolved.
# See https://github.com/easybuilders/easybuild-easyconfigs/issues/17615
'test_native_mha'
]
Comment thread
branfosj marked this conversation as resolved.
}

runtest = 'cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py --continue-through-error --verbose %(excluded_tests)s'

# The readelf sanity check command can be taken out once the TestRPATH test from
# https://github.com/pytorch/pytorch/pull/87593 is accepted, since it is then checked as part of the PyTorch test suite
local_libcaffe2 = "$EBROOTPYTORCH/lib/python%%(pyshortver)s/site-packages/torch/lib/libcaffe2_nvrtc.%s" % SHLIB_EXT
sanity_check_commands = [
"readelf -d %s | egrep 'RPATH|RUNPATH' | grep -v stubs" % local_libcaffe2,
]

tests = ['PyTorch-check-cpp-extension.py']

moduleclass = 'ai'
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
easyblock = 'EB_UCX_Plugins'

name = 'UCX-CUDA'
version = '1.11.2'
versionsuffix = '-CUDA-%(cudaver)s'

homepage = 'http://www.openucx.org/'
description = """Unified Communication X
An open-source production grade communication framework for data centric
and high-performance applications

This module adds the UCX CUDA support.
"""

toolchain = {'name': 'GCCcore', 'version': '11.2.0'}
toolchainopts = {'pic': True}

source_urls = ['https://github.com/openucx/ucx/releases/download/v%(version)s']
sources = [{'filename': 'ucx-%(version)s.tar.gz', 'alt_location': 'UCX'}]
patches = [
'%(name)s-1.11.0_link_against_existing_UCX_libs.patch',
]
checksums = [
{'ucx-1.11.2.tar.gz': 'deebf86a5344fc2bd9e55449f88c650c4514928592807c9bc6fe4190e516c6df'},
{'UCX-CUDA-1.11.0_link_against_existing_UCX_libs.patch':
'457187fa020e526609ba91e7750c9941d57bd57d60d6eed317b40ad8824aca93'},
]

builddependencies = [
('binutils', '2.37'),
('Autotools', '20210726'),
('pkg-config', '0.29.2'),
]

dependencies = [
('zlib', '1.2.11'),
('UCX', version),
('CUDA', '11.5.2', '', SYSTEM),
('GDRCopy', '2.3'),
]

moduleclass = 'lib'