diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.12.0-foss-2022a-CUDA-11.7.0.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.12.0-foss-2022a-CUDA-11.7.0.eb new file mode 100644 index 000000000000..cf6a5d924458 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.12.0-foss-2022a-CUDA-11.7.0.eb @@ -0,0 +1,127 @@ +name = 'PyTorch' +version = '1.12.0' +versionsuffix = '-CUDA-%(cudaver)s' + +homepage = 'https://pytorch.org/' +description = """Tensors and Dynamic neural networks in Python with strong GPU acceleration. +PyTorch is a deep learning framework that puts Python first.""" + +toolchain = {'name': 'foss', 'version': '2022a'} + +sources = [{ + 'filename': '%(name)s-%(version)s.tar.gz', + 'git_config': { + 'url': 'https://github.com/pytorch', + 'repo_name': 'pytorch', + 'tag': 'v%(version)s', + 'recursive': True, + }, +}] +patches = [ + '%(name)s-1.7.0_avoid-nan-in-test-torch.patch', + '%(name)s-1.7.0_disable-dev-shm-test.patch', + '%(name)s-1.8.1_dont-use-gpu-ccc-in-test.patch', + '%(name)s-1.9.0_limit-world-size-for-zero-redundancy-opt-test.patch', + '%(name)s-1.10.0_fix-test-dataloader-fixed-affinity.patch', + '%(name)s-1.10.0_skip_cmake_rpath.patch', + '%(name)s-1.11.0_increase-distributed-test-timeout.patch', + '%(name)s-1.11.0_increase_c10d_gloo_timeout.patch', + '%(name)s-1.11.0_disable_failing_jit_cuda_fuser_tests.patch', +] +checksums = [ + None, # PyTorch-1.12.0.tar.gz + 'b899aa94d9e60f11ee75a706563312ccefa9cf432756c470caa8e623991c8f18', # PyTorch-1.7.0_avoid-nan-in-test-torch.patch + '622cb1eaeadc06e13128a862d9946bcc1f1edd3d02b259c56a9aecc4d5406b8a', # PyTorch-1.7.0_disable-dev-shm-test.patch + '89ac7a8e9e7df2e64cf8404fe3a279f5e9b759fee41c9de3aaff9c22f385c2c6', # PyTorch-1.8.1_dont-use-gpu-ccc-in-test.patch + # PyTorch-1.9.0_limit-world-size-for-zero-redundancy-opt-test.patch + 'ff573660913ce055e24cfd194ce747ba5685091c631cfd443eae2a99d56b57ea', + # PyTorch-1.10.0_fix-test-dataloader-fixed-affinity.patch + '313dca681f45ce3bc7c4557fdcdcbe0b77216d2c708fa30a2ec0e22c44876707', + 'ac05943bb205623f91ef140aa00869efc5fe844184bd666bebf5405808610448', # PyTorch-1.10.0_skip_cmake_rpath.patch + # PyTorch-1.11.0_increase-distributed-test-timeout.patch + '087ad20163a1291773ae3457569b80523080eb3731e210946459b2333a919f3f', + # PyTorch-1.11.0_increase_c10d_gloo_timeout.patch + '20cd4a8663f74ab326fdb032b926bf5c7e94d9750c515ab9050927ba00cf1953', + # PyTorch-1.11.0_disable_failing_jit_cuda_fuser_tests.patch + 'e7bfe120a8b3fe2b40dac6839852a5fbab3cb3429fbe44a0fc3a1800adaaee51', +] + +builddependencies = [ + ('CMake', '3.23.1'), + ('hypothesis', '6.46.7'), +] +dependencies = [ + ('CUDA', '11.7.0', '', True), + ('Ninja', '1.10.2'), # Required for JIT compilation of C++ extensions + ('Python', '3.10.4'), + ('protobuf', '3.19.4'), + ('protobuf-python', '3.19.4'), + ('pybind11', '2.9.2'), + ('SciPy-bundle', '2022.05'), + ('PyYAML', '6.0'), + ('MPFR', '4.1.0'), + ('GMP', '6.2.1'), + ('numactl', '2.0.14'), + ('FFmpeg', '4.4.2'), + ('Pillow', '9.1.1'), + ('cuDNN', '8.4.1.50', '-CUDA-%(cudaver)s', True), + ('magma', '2.6.2', '-CUDA-%(cudaver)s'), + ('NCCL', '2.12.12', '-CUDA-%(cudaver)s'), + ('expecttest', '0.1.3'), +] + +osdependencies = [('libibverbs-dev', 'libibverbs-devel', 'rdma-core-devel')] + +# default CUDA compute capabilities to use (override via --cuda-compute-capabilities) +cuda_compute_capabilities = [ + '3.5', + '3.7', + '5.2', + '6.0', + '6.1', + '7.0', + '7.2', + '7.5', + '8.0', + '8.6', +] + +custom_opts = ['USE_CUPTI_SO=1'] + +excluded_tests = { + '': [ + # Bad tests: https://github.com/pytorch/pytorch/issues/60260 + 'distributed/elastic/utils/distributed_test', + 'distributed/elastic/multiprocessing/api_test', + # These tests fail on A10s at the very least, they time out forever no matter how long the timeout is. + # Possibly related to NCCL 2.8.3: https://docs.nvidia.com/deeplearning/nccl/release-notes/rel_2-8-3.html + # 'distributed/test_distributed_fork', + 'distributed/test_distributed_spawn', + # Fails on A10s: https://github.com/pytorch/pytorch/issues/63079 + 'test_optim', + # Test from this suite timeout often. The process group backend is deprecated anyway + # 'distributed/rpc/test_process_group_agent', + # This test fails constently when run as part of the test suite, but succeeds when run interactively + 'test_model_dump', + # These tests appear flaky, possibly related to number of GPUs that are used + 'distributed/fsdp/test_fsdp_memory', + 'distributed/fsdp/test_fsdp_overlap', + ] +} + +runtest = "cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py --continue-through-error --verbose %(excluded_tests)s" + +# several tests are known to be flaky, and fail in some contexts (like having multiple GPUs available), +# so we allow up to 400 (out of ~90k) tests to fail before treating the installation to be faulty +max_failed_tests = 400 + +# The readelf sanity check command can be taken out once the TestRPATH test from +# https://github.com/pytorch/pytorch/pull/68912 is accepted, since it is then checked as part of the PyTorch test suite +local_libcaffe2 = "$EBROOTPYTORCH/lib/python%%(pyshortver)s/site-packages/torch/lib/libcaffe2_nvrtc.%s" % SHLIB_EXT +sanity_check_commands = [ + "readelf -d %s | egrep 'RPATH|RUNPATH' | grep -v stubs" % local_libcaffe2, +] + +tests = ['%(name)s-check-cpp-extension.py'] + +moduleclass = 'ai'