easybuilders · casparvl · Mar 29, 2023 · Feb 7, 2023 · Feb 10, 2023 · Mar 28, 2023
diff --git a/easybuild/easyconfigs/c/CUDA/CUDA-11.5.2.eb b/easybuild/easyconfigs/c/CUDA/CUDA-11.5.2.eb
@@ -0,0 +1,26 @@
+name = 'CUDA'  # easyconfig for CUDA 11.5.2, taken from NVIDIA's .run installers with the SYSTEM toolchain
+version = '11.5.2'
+local_nv_version = '495.29.05'  # interpolated into the installer filenames in 'sources' and 'checksums' below
+
+homepage = 'https://developer.nvidia.com/cuda-toolkit'
+description = """CUDA (formerly Compute Unified Device Architecture) is a parallel
+ computing platform and programming model created by NVIDIA and implemented by the
+ graphics processing units (GPUs) that they produce. CUDA gives developers access
+ to the virtual instruction set and memory of the parallel computational elements in CUDA GPUs."""
+
+toolchain = SYSTEM
+
+source_urls = ['https://developer.download.nvidia.com/compute/cuda/%(version)s/local_installers/']
+sources = ['cuda_%%(version)s_%s_linux%%(cudaarch)s.run' % local_nv_version]  # '%%' keeps %(...)s templates literal
+checksums = [
+    {  # maps each installer filename (no suffix, '_ppc64le', '_sbsa' after 'linux') to its checksum
+        'cuda_%%(version)s_%s_linux.run' % local_nv_version:
+            '74959abf02bcba526f0a3aae322c7641b25da040ccd6236d07038f81997b73a6',
+        'cuda_%%(version)s_%s_linux_ppc64le.run' % local_nv_version:
+            '45c468f430436b3e95d5e485a6ba0ec1fa2b23dc6c551c1307b79996ecf0a7ed',
+        'cuda_%%(version)s_%s_linux_sbsa.run' % local_nv_version:
+            '31337c8bdc224fa1bd07bc4b6a745798392428118cc8ea0fa4446ee4ad47dd30',
+    }
+]
+
+moduleclass = 'system'
diff --git a/easybuild/easyconfigs/c/cuDNN/cuDNN-8.4.1.50-CUDA-11.5.2.eb b/easybuild/easyconfigs/c/cuDNN/cuDNN-8.4.1.50-CUDA-11.5.2.eb
@@ -0,0 +1,41 @@
+name = 'cuDNN'  # easyconfig for cuDNN 8.4.1.50, installed on top of CUDA 11.5.2 with the SYSTEM toolchain
+version = '8.4.1.50'
+versionsuffix = '-CUDA-%(cudaver)s'
+
+homepage = 'https://developer.nvidia.com/cudnn'
+description = """The NVIDIA CUDA Deep Neural Network library (cuDNN) is
+a GPU-accelerated library of primitives for deep neural networks."""
+
+toolchain = SYSTEM
+
+# note: cuDNN 8.4.1 is not specific to CUDA 11.6, although the download URL and archive names below say 11.6;
+# see also https://docs.nvidia.com/deeplearning/cudnn/support-matrix/index.html#cudnn-cuda-hardware-versions
+local_short_ver = '.'.join(version.split('.')[:3])  # first three version components, i.e. '8.4.1'
+source_urls = [
+    'https://developer.download.nvidia.com/compute/redist/cudnn/v%s/local_installers/11.6/' % local_short_ver,
+]
+sources = ['%(namelower)s-linux-%(cudnnarch)s-%(version)s_cuda11.6-archive.tar.xz']
+checksums = [
+    {  # one checksum per archive name; the names differ only in the x86_64 / ppc64le / sbsa component
+        '%(namelower)s-linux-x86_64-%(version)s_cuda11.6-archive.tar.xz':
+            'ec96d2376d81fca42bdd3d4c3d705a99b29a065bab57f920561c763e29c67d01',
+        '%(namelower)s-linux-ppc64le-%(version)s_cuda11.6-archive.tar.xz':
+            '8b806cbfdc81352bf76716d1e53b42537665d110c6ffc068be910505c10e1b98',
+        '%(namelower)s-linux-sbsa-%(version)s_cuda11.6-archive.tar.xz':
+            '0b1b9fac5b78974e2fdaaa74843db18f636ce8f3d999d62ff2a615b9978fc360',
+    }
+]
+
+dependencies = [('CUDA', '11.5.2')]
+
+sanity_check_paths = {
+    'files': [
+        'include/cudnn.h', 'lib64/libcudnn_adv_infer_static.a', 'lib64/libcudnn_adv_train_static.a',
+        'lib64/libcudnn_cnn_infer_static.a', 'lib64/libcudnn_cnn_train_static.a',
+        'lib64/libcudnn_ops_infer_static.a', 'lib64/libcudnn_ops_train_static.a',
+        'lib64/libcudnn.%s' % SHLIB_EXT  # shared library; file extension comes from SHLIB_EXT
+    ],
+    'dirs': ['include', 'lib64'],
+}
+
+moduleclass = 'numlib'
diff --git a/easybuild/easyconfigs/m/magma/magma-2.6.2-foss-2021b-CUDA-11.5.2.eb b/easybuild/easyconfigs/m/magma/magma-2.6.2-foss-2021b-CUDA-11.5.2.eb
@@ -0,0 +1,42 @@
+easyblock = "CMakeMake"  # easyconfig for magma 2.6.2 with foss/2021b and CUDA 11.5.2
+
+name = 'magma'
+version = '2.6.2'
+versionsuffix = '-CUDA-%(cudaver)s'
+
+homepage = 'https://icl.cs.utk.edu/magma/'
+description = """The MAGMA project aims to develop a dense linear algebra library similar to
+ LAPACK but for heterogeneous/hybrid architectures, starting with current Multicore+GPU systems."""
+
+toolchain = {'name': 'foss', 'version': '2021b'}
+toolchainopts = {'pic': True, 'openmp': True}
+
+source_urls = ['https://icl.cs.utk.edu/projectsfiles/magma/downloads/']
+sources = [SOURCE_TAR_GZ]
+patches = ['magma-2.6.1_allow-all-sms.patch']  # patch file name carries 2.6.1; listed here for version 2.6.2
+checksums = [
+    '75b554dab00903e2d10b972c913e50e7f88cbc62f3ae432b5a086c7e4eda0a71',  # magma-2.6.2.tar.gz
+    'b89285bac007b68e88e3b5ddbb7f94dbc8a9d77590e58c352e477574d8dca738',  # magma-2.6.1_allow-all-sms.patch
+]
+
+builddependencies = [
+    ('CMake', '3.21.1'),
+]
+
+dependencies = [
+    ('CUDA', '11.5.2', '', SYSTEM),
+]
+
+# default CUDA compute capabilities to use (override via --cuda-compute-capabilities)
+cuda_compute_capabilities = ['3.5', '5.0', '6.0', '7.0', '7.5', '8.0', '8.6']
+# make sure both static and shared libs are built: one configopts entry each for BUILD_SHARED_LIBS=ON and =OFF
+configopts = [
+    '-DBUILD_SHARED_LIBS=%s -DGPU_TARGET="%%(cuda_sm_space_sep)s" ' % local_shared for local_shared in ('ON', 'OFF')
+]
+
+sanity_check_paths = {
+    'files': ['lib/libmagma.%s' % SHLIB_EXT, 'lib/libmagma.a'],  # shared and static library must both exist
+    'dirs': ['include'],
+}
+
+moduleclass = 'math'
diff --git a/easybuild/easyconfigs/n/NCCL/NCCL-2.10.3-GCCcore-11.2.0-CUDA-11.5.2.eb b/easybuild/easyconfigs/n/NCCL/NCCL-2.10.3-GCCcore-11.2.0-CUDA-11.5.2.eb
@@ -0,0 +1,23 @@
+name = 'NCCL'  # easyconfig for NCCL 2.10.3 with GCCcore/11.2.0 and CUDA 11.5.2
+version = '2.10.3'
+versionsuffix = '-CUDA-%(cudaver)s'
+
+homepage = 'https://developer.nvidia.com/nccl'
+description = """The NVIDIA Collective Communications Library (NCCL) implements multi-GPU and multi-node collective
+communication primitives that are performance optimized for NVIDIA GPUs."""
+
+toolchain = {'name': 'GCCcore', 'version': '11.2.0'}
+
+github_account = 'NVIDIA'  # NOTE(review): presumably consumed by GITHUB_SOURCE below - confirm
+source_urls = [GITHUB_SOURCE]
+sources = ['v%(version)s-1.tar.gz']  # note the extra '-1' after the version in the tarball name
+checksums = ['55de166eb7dcab9ecef2629cdb5fb0c5ebec4fae03589c469ebe5dcb5716b3c5']
+
+builddependencies = [('binutils', '2.37')]
+
+dependencies = [
+    ('CUDA', '11.5.2', '', SYSTEM),
+    ('UCX-CUDA', '1.11.2', versionsuffix),  # reuses this easyconfig's own '-CUDA-%(cudaver)s' suffix
+]
+
+moduleclass = 'lib'
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.12.1-foss-2021b-CUDA-11.5.2.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.12.1-foss-2021b-CUDA-11.5.2.eb
@@ -0,0 +1,164 @@
+name = 'PyTorch'  # easyconfig for PyTorch 1.12.1 with foss/2021b and CUDA 11.5.2
+version = '1.12.1'
+versionsuffix = '-CUDA-%(cudaver)s'
+
+homepage = 'https://pytorch.org/'
+description = """Tensors and Dynamic neural networks in Python with strong GPU acceleration.
+PyTorch is a deep learning framework that puts Python first."""
+
+toolchain = {'name': 'foss', 'version': '2021b'}
+
+source_urls = [GITHUB_RELEASE]
+sources = ['%(namelower)s-v%(version)s.tar.gz']
+
+patches = [  # the entries in 'checksums' below follow this exact order, after the source tarball
+    'PyTorch-1.7.0_avoid-nan-in-test-torch.patch',
+    'PyTorch-1.7.0_disable-dev-shm-test.patch',
+    'PyTorch-1.10.0_fix-kineto-crash.patch',
+    'PyTorch-1.10.0_fix-test-dataloader-fixed-affinity.patch',
+    'PyTorch-1.10.0_fix-test-model_dump.patch',
+    'PyTorch-1.10.0_fix-vsx-vector-functions.patch',
+    'PyTorch-1.10.0_skip-nnapi-test-without-qnnpack.patch',
+    'PyTorch-1.11.0_fix-fsdp-fp16-test.patch',
+    'PyTorch-1.11.0_fix-test_utils.patch',
+    'PyTorch-1.11.0_increase_c10d_gloo_timeout.patch',
+    'PyTorch-1.11.0_increase-distributed-test-timeout.patch',
+    'PyTorch-1.11.0_install-vsx-vec-headers.patch',
+    'PyTorch-1.11.1_skip-test_init_from_local_shards.patch',
+    'PyTorch-1.12.1_fix-autograd-thread_shutdown-test.patch',
+    'PyTorch-1.12.1_fix-cuda-gcc-version-check.patch',
+    'PyTorch-1.12.1_fix-skip-decorators.patch',
+    'PyTorch-1.12.1_fix-test_cpp_extensions_jit.patch',
+    'PyTorch-1.12.1_fix-test_wishart_log_prob.patch',
+    'PyTorch-1.12.1_fix-TestCudaFuser.test_unary_ops.patch',
+    'PyTorch-1.12.1_fix-TestTorch.test_to.patch',
+    'PyTorch-1.12.1_fix-use-after-free-in-tensorpipe-agent.patch',
+    'PyTorch-1.12.1_fix-vsx-vector-funcs.patch',
+    'PyTorch-1.12.1_fix-vsx-loadu.patch',
+    'PyTorch-1.12.1_increase-test-adadelta-tolerance.patch',
+    'PyTorch-1.12.1_increase-tolerance-test_ops.patch',
+    'PyTorch-1.12.1_no-cuda-stubs-rpath.patch',
+    'PyTorch-1.12.1_python-3.10-annotation-fix.patch',
+    'PyTorch-1.12.1_python-3.10-compat.patch',
+    'PyTorch-1.12.1_remove-flaky-test-in-testnn.patch',
+    'PyTorch-1.12.1_skip-ao-sparsity-test-without-fbgemm.patch',
+    'PyTorch-1.12.1_skip-failing-grad-test.patch',
+    'PyTorch-1.12.1_skip-test_round_robin.patch',
+]
+checksums = [  # order: source tarball first, then one entry per patch in 'patches' order
+    '031c71073db73da732b5d01710220564ce6dd88d812ba053f0cc94296401eccb',  # pytorch-v1.12.1.tar.gz
+    'b899aa94d9e60f11ee75a706563312ccefa9cf432756c470caa8e623991c8f18',  # PyTorch-1.7.0_avoid-nan-in-test-torch.patch
+    '622cb1eaeadc06e13128a862d9946bcc1f1edd3d02b259c56a9aecc4d5406b8a',  # PyTorch-1.7.0_disable-dev-shm-test.patch
+    # PyTorch-1.10.0_fix-kineto-crash.patch
+    'dc467333b28162149af8f675929d8c6bf219f23230bfc0d39af02ba4f6f882eb',
+    # PyTorch-1.10.0_fix-test-dataloader-fixed-affinity.patch
+    '313dca681f45ce3bc7c4557fdcdcbe0b77216d2c708fa30a2ec0e22c44876707',
+    # PyTorch-1.10.0_fix-test-model_dump.patch
+    '339148ae1a028cda6e750ac93fa38a599f66c7abe26586c9219f1a206ea14557',
+    # PyTorch-1.10.0_fix-vsx-vector-functions.patch
+    '7bef5f96cb83b2d655d2f76dd7468a171d446f0b3e06da2232ec7f886484d312',
+    # PyTorch-1.10.0_skip-nnapi-test-without-qnnpack.patch
+    '34ba476a7bcddec323bf9eca083cb4623d0f569d081aa3add3769c24f22849d2',
+    'bb1c4e6d6fd4b0cf57ff8b824c797331b533bb1ffc63f5db0bae3aee10c3dc13',  # PyTorch-1.11.0_fix-fsdp-fp16-test.patch
+    '4f7e25c4e2eb7094f92607df74488c6a4a35849fabf05fcf6c3655fa3f44a861',  # PyTorch-1.11.0_fix-test_utils.patch
+    # PyTorch-1.11.0_increase_c10d_gloo_timeout.patch
+    '20cd4a8663f74ab326fdb032b926bf5c7e94d9750c515ab9050927ba00cf1953',
+    # PyTorch-1.11.0_increase-distributed-test-timeout.patch
+    '087ad20163a1291773ae3457569b80523080eb3731e210946459b2333a919f3f',
+    'f2e6b9625733d9a471bb75e1ea20e28814cf1380b4f9089aa838ee35ddecf07d',  # PyTorch-1.11.0_install-vsx-vec-headers.patch
+    # PyTorch-1.11.1_skip-test_init_from_local_shards.patch
+    '4aeb1b0bc863d4801b0095cbce69f8794066748f0df27c6aaaf729c5ecba04b7',
+    # PyTorch-1.12.1_fix-autograd-thread_shutdown-test.patch
+    'd97cd6b0570a167ecc3e631dc4ea884d95ace285cc38aa980566f4fec2c0d089',
+    # PyTorch-1.12.1_fix-cuda-gcc-version-check.patch
+    'a650f4576f06c749f244cada52ff9c02499fa8f182019129488db3845e0756ab',
+    'e3ca6e42b2fa592ea095939fb59ab875668a058479407db3f3684cc5c6f4146c',  # PyTorch-1.12.1_fix-skip-decorators.patch
+    # PyTorch-1.12.1_fix-test_cpp_extensions_jit.patch
+    '1efc9850c431d702e9117d4766277d3f88c5c8b3870997c9974971bce7f2ab83',
+    # PyTorch-1.12.1_fix-test_wishart_log_prob.patch
+    'cf475ae6e6234b96c8d1bf917597c5176c94b3ccd940b72f2e1cd0c979580f45',
+    # PyTorch-1.12.1_fix-TestCudaFuser.test_unary_ops.patch
+    '8e6e844c6b0541e0c8115911ee1a9d548613254b36dfbdada202fd723fc26aa2',
+    '75f27987c3f25c501e719bd2b1c70a029ae0ee28514a97fe447516aee02b1535',  # PyTorch-1.12.1_fix-TestTorch.test_to.patch
+    # PyTorch-1.12.1_fix-use-after-free-in-tensorpipe-agent.patch
+    '0bd7e88b92c4c6f0fecf01746009858ba19f2df68b10b88c41485328a531875d',
+    'caccbf60f62eac313896c1eaec78b08f5d0fdfcb907079087490bb13d1561aa2',  # PyTorch-1.12.1_fix-vsx-vector-funcs.patch
+    '8bfe3c94ada1dd1f7974a1261a8b576fb7ae944050fa1c7830fca033831123b2',  # PyTorch-1.12.1_fix-vsx-loadu.patch
+    # PyTorch-1.12.1_increase-test-adadelta-tolerance.patch
+    '944ed1af5ad4bbe20cbb042764a88dad1eef6cd33218617cf3d4cd90c6764695',
+    # PyTorch-1.12.1_increase-tolerance-test_ops.patch
+    '1c1fa520801e2ee5faf56a3d6dc96321e7c11664fd16bffd7c6ee437e68357fb',
+    '2905826ca713752b47c84e4ec8b177c90cbd91fca498ba2ba546f495c4cf70a6',  # PyTorch-1.12.1_no-cuda-stubs-rpath.patch
+    # PyTorch-1.12.1_python-3.10-annotation-fix.patch
+    '11e168fd429d9e156fc79dd806b08125f3640651ad9998abd810446b2ed0c2d7',
+    '81402420a878b40f824778f0333fbec6504325a6a1b06a22749c4cac3eaccf67',  # PyTorch-1.12.1_python-3.10-compat.patch
+    # PyTorch-1.12.1_remove-flaky-test-in-testnn.patch
+    'e81b678e354dd137c0d6d974605cdedbf672096fdbdf567c347bc2fbfc73471d',
+    # PyTorch-1.12.1_skip-ao-sparsity-test-without-fbgemm.patch
+    'edd464ec8c37b44c07a72008d732604f6837f2dd61c7810c391a86ba4945ca39',
+    '1c89e7e67287fe6b9a95480a4178d3653b94d0ab2fe68edf227606c8ae548fdc',  # PyTorch-1.12.1_skip-failing-grad-test.patch
+    # PyTorch-1.12.1_skip-test_round_robin.patch
+    '63d4849b78605aa088fdff695637d9473ea60dee603a3ff7f788690d70c55349',
+]
+
+osdependencies = [OS_PKG_IBVERBS_DEV]
+
+builddependencies = [
+    ('CMake', '3.22.1'),
+    ('hypothesis', '6.14.6'),
+]
+
+dependencies = [
+    ('CUDA', '11.5.2', '', SYSTEM),
+    ('Ninja', '1.10.2'),  # Required for JIT compilation of C++ extensions
+    ('Python', '3.9.6'),
+    ('protobuf', '3.17.3'),
+    ('protobuf-python', '3.17.3'),
+    ('pybind11', '2.7.1'),
+    ('SciPy-bundle', '2021.10'),
+    ('typing-extensions', '3.10.0.2'),
+    ('PyYAML', '5.4.1'),
+    ('MPFR', '4.1.0'),
+    ('GMP', '6.2.1'),
+    ('numactl', '2.0.14'),
+    ('FFmpeg', '4.3.2'),
+    ('Pillow', '8.3.2'),
+    ('cuDNN', '8.4.1.50', '-CUDA-%(cudaver)s', SYSTEM),  # explicitly resolved with the SYSTEM toolchain
+    ('magma', '2.6.2', '-CUDA-%(cudaver)s'),
+    ('NCCL', '2.10.3', '-CUDA-%(cudaver)s'),
+    ('expecttest', '0.1.3'),
+]
+
+# default CUDA compute capabilities to use (override via --cuda-compute-capabilities)
+cuda_compute_capabilities = ['3.5', '3.7', '5.2', '6.0', '6.1', '7.0', '7.2', '7.5', '8.0', '8.6']
+
+excluded_tests = {
+    '': [  # NOTE(review): the empty key presumably means "all architectures" - confirm against the PyTorch easyblock
+        # This test seems to take too long on NVIDIA Ampere at least.
+        'distributed/test_distributed_spawn',
+        # Broken on CUDA 11.5+(?): https://github.com/pytorch/pytorch/issues/75375
+        'distributions/test_constraints',
+        # These 2 abort on some machines. Skip for now
+        'distributed/fsdp/test_fsdp_input',
+        'distributed/fsdp/test_fsdp_mixed_precision',
+        # Produces a single test failure on some systems
+        # See https://github.com/easybuilders/easybuild-easyconfigs/issues/17615
+        'distributed/fsdp/test_fsdp_core',
+        # Fails on Broadwell
+        # See https://github.com/easybuilders/easybuild-easyconfigs/issues/17615
+        'test_native_mha'
+    ]
+}
+
+runtest = 'cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py --continue-through-error  --verbose %(excluded_tests)s'
+
+# The readelf sanity check command can be taken out once the TestRPATH test from
+# https://github.com/pytorch/pytorch/pull/87593 is accepted, since it is then checked as part of the PyTorch test suite
+local_libcaffe2 = "$EBROOTPYTORCH/lib/python%%(pyshortver)s/site-packages/torch/lib/libcaffe2_nvrtc.%s" % SHLIB_EXT
+sanity_check_commands = [
+    "readelf -d %s | egrep 'RPATH|RUNPATH' | grep -v stubs" % local_libcaffe2,  # drops lines mentioning stubs
+]
+
+tests = ['PyTorch-check-cpp-extension.py']
+
+moduleclass = 'ai'
diff --git a/easybuild/easyconfigs/u/UCX-CUDA/UCX-CUDA-1.11.2-GCCcore-11.2.0-CUDA-11.5.2.eb b/easybuild/easyconfigs/u/UCX-CUDA/UCX-CUDA-1.11.2-GCCcore-11.2.0-CUDA-11.5.2.eb
@@ -0,0 +1,42 @@
+easyblock = 'EB_UCX_Plugins'
+
+name = 'UCX-CUDA'
+version = '1.11.2'
+versionsuffix = '-CUDA-%(cudaver)s'
+
+homepage = 'http://www.openucx.org/'
+description = """Unified Communication X
+An open-source production grade communication framework for data centric
+and high-performance applications
+
+This module adds the UCX CUDA support.
+"""
+
+toolchain = {'name': 'GCCcore', 'version': '11.2.0'}
+toolchainopts = {'pic': True}
+
+source_urls = ['https://github.com/openucx/ucx/releases/download/v%(version)s']
+sources = [{'filename': 'ucx-%(version)s.tar.gz', 'alt_location': 'UCX'}]
+patches = [
+    '%(name)s-1.11.0_link_against_existing_UCX_libs.patch',
+]
+checksums = [
+    {'ucx-1.11.2.tar.gz': 'deebf86a5344fc2bd9e55449f88c650c4514928592807c9bc6fe4190e516c6df'},
+    {'UCX-CUDA-1.11.0_link_against_existing_UCX_libs.patch':
+     '457187fa020e526609ba91e7750c9941d57bd57d60d6eed317b40ad8824aca93'},
+]
+
+builddependencies = [
+    ('binutils', '2.37'),
+    ('Autotools', '20210726'),
+    ('pkg-config', '0.29.2'),
+]
+
+dependencies = [
+    ('zlib', '1.2.11'),
+    ('UCX', version),
+    ('CUDA',  '11.5.2', '', SYSTEM),
+    ('GDRCopy', '2.3'),
+]
+
+moduleclass = 'lib'