From 6c142a7f67bfe4c7a39b304958d478cda4f753a7 Mon Sep 17 00:00:00 2001 From: Ake Sandgren Date: Fri, 3 Dec 2021 13:49:16 +0100 Subject: [PATCH 1/3] Make CUDA-aware-OpenMPI detection aware of UCX-CUDA --- .../PyTorch-1.10.0-foss-2021a-CUDA-11.3.1.eb | 26 ++++++------ .../PyTorch-1.10.0_detect_ucx_cuda.patch | 42 +++++++++++++++++++ 2 files changed, 54 insertions(+), 14 deletions(-) create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0_detect_ucx_cuda.patch diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0-foss-2021a-CUDA-11.3.1.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0-foss-2021a-CUDA-11.3.1.eb index cecec914c702..1ad536ddeaf6 100644 --- a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0-foss-2021a-CUDA-11.3.1.eb +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0-foss-2021a-CUDA-11.3.1.eb @@ -33,9 +33,10 @@ patches = [ 'PyTorch-1.10.0_skip_failing_ops_tests.patch', 'PyTorch-1.10.0_skip_nan_tests_openblas.patch', 'PyTorch-1.10.0_skip_cmake_rpath.patch', + 'PyTorch-1.10.0_detect_ucx_cuda.patch', ] checksums = [ - None, # can't add proper SHA256 checksum, because source tarball is created locally after recursive 'git clone' + 'd15a2702256e6739cb77c8e58e060c0eecb7340c654e40cbf280c87791ab5dd0', # PyTorch-1.10.0.tar.gz 'b899aa94d9e60f11ee75a706563312ccefa9cf432756c470caa8e623991c8f18', # PyTorch-1.7.0_avoid-nan-in-test-torch.patch '622cb1eaeadc06e13128a862d9946bcc1f1edd3d02b259c56a9aecc4d5406b8a', # PyTorch-1.7.0_disable-dev-shm-test.patch # PyTorch-1.7.1_correctly-pass-jit_opt_level.patch @@ -51,18 +52,14 @@ checksums = [ '426c9ead1a74b656748d4c8bf8afd4303d8b9f2394ad22b21a845d07c8ca1d12', # PyTorch-1.10.0_fix-faulty-asserts-and-skip-test.patch '67152215e4530a9b1d7349fb20864445fd815288f04ab9e96e45c73b2d87827a', - # PyTorch-1.10.0_fix-test-cond-cpu.patch - '51f83f5d5ef69656ef35b73f17e0671e70113798421be11ea4c7b56ffcc4da03', - # PyTorch-1.10.0_fix-vnni-detection.patch - '1f3664c0febfa2a3fc4c0cd3bae185f289716ac0b6c3d7e8fa1cee19ba62b7cc', + '51f83f5d5ef69656ef35b73f17e0671e70113798421be11ea4c7b56ffcc4da03', # PyTorch-1.10.0_fix-test-cond-cpu.patch + '1f3664c0febfa2a3fc4c0cd3bae185f289716ac0b6c3d7e8fa1cee19ba62b7cc', # PyTorch-1.10.0_fix-vnni-detection.patch # PyTorch-1.10.0_increase_zero_optimizer_test_tolerance.patch 'e65afb01786f7f030ccb5faada1eb474bb0c418bcadcf1baaa71a4fa2f3f4240', - # PyTorch-1.10.0_skip_failing_ops_tests.patch - '399af94ffcef4a6db5226552c46f11e9b0f0f371b2d7924b9e5764d2281581ab', - # PyTorch-1.10.0_skip_nan_tests_openblas.patch - '7d3f83e3056d9e47a460790313238f28708beb596cafaa7ae55e374d368bbedf', - # PyTorch-1.10.0_skip_cmake_rpath.patch - 'ac05943bb205623f91ef140aa00869efc5fe844184bd666bebf5405808610448', + '399af94ffcef4a6db5226552c46f11e9b0f0f371b2d7924b9e5764d2281581ab', # PyTorch-1.10.0_skip_failing_ops_tests.patch + '7d3f83e3056d9e47a460790313238f28708beb596cafaa7ae55e374d368bbedf', # PyTorch-1.10.0_skip_nan_tests_openblas.patch + 'ac05943bb205623f91ef140aa00869efc5fe844184bd666bebf5405808610448', # PyTorch-1.10.0_skip_cmake_rpath.patch + 'aa9d2dc7b090ec40011ad37b884f91fade20d49af7c4a090c1d8a270806f0ae1', # PyTorch-1.10.0_detect_ucx_cuda.patch ] osdependencies = [OS_PKG_IBVERBS_DEV] @@ -74,6 +71,7 @@ builddependencies = [ dependencies = [ ('CUDA', '11.3.1', '', True), + ('UCX-CUDA', '1.10.0', versionsuffix), ('Ninja', '1.10.2'), # Required for JIT compilation of C++ extensions ('Python', '3.9.5'), ('protobuf', '3.17.3'), @@ -87,9 +85,9 @@ dependencies = [ ('numactl', '2.0.14'), ('FFmpeg', '4.3.2'), ('Pillow', '8.2.0'), - ('cuDNN', '8.2.1.32', '-CUDA-%(cudaver)s', True), - ('magma', '2.6.1', '-CUDA-%(cudaver)s'), - ('NCCL', '2.10.3', '-CUDA-%(cudaver)s'), + ('cuDNN', '8.2.1.32', versionsuffix, True), + ('magma', '2.6.1', versionsuffix), + ('NCCL', '2.10.3', versionsuffix), ('expecttest', '0.1.3'), ] diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0_detect_ucx_cuda.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0_detect_ucx_cuda.patch new file mode 100644 index 000000000000..dface972c4f1 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0_detect_ucx_cuda.patch @@ -0,0 +1,42 @@ +Make PyTorch CUDA-aware-OpenMPI detection aware of UCX. +The old detection relies on the old pre-UCX smcuda btl from ompi_info. +Use ucx_info -d to find CUDA enabled UCX if there is no smcuda. +ucx_info -d only shows cuda_cpy when run on a node with a GPU available. + +Åke Sandgren, 2021-12-03 +diff -ru pytorch.orig/cmake/Dependencies.cmake pytorch/cmake/Dependencies.cmake +--- pytorch.orig/cmake/Dependencies.cmake 2021-11-29 09:13:59.000000000 +0100 ++++ pytorch/cmake/Dependencies.cmake 2021-12-03 13:37:47.646865417 +0100 +@@ -1072,16 +1072,29 @@ + find_program(OMPI_INFO + NAMES ompi_info + HINTS ${MPI_CXX_LIBRARIES}/../bin) ++ set(CUDA_MPI_FOUND False) + if(OMPI_INFO) + execute_process(COMMAND ${OMPI_INFO} + OUTPUT_VARIABLE _output) + if(_output MATCHES "smcuda") +- message(STATUS "Found OpenMPI with CUDA support built.") ++ set(CUDA_MPI_FOUND True) + else() +- message(WARNING "OpenMPI found, but it is not built with CUDA support.") +- set(CAFFE2_FORCE_FALLBACK_CUDA_MPI 1) ++ find_program(UCX_INFO NAMES ucx_info) ++ if(UCX_INFO) ++ execute_process(COMMAND ${UCX_INFO} -d ++ OUTPUT_VARIABLE _output) ++ if(_output MATCHES "cuda_cpy") ++ set(CUDA_MPI_FOUND True) ++ endif() ++ endif() + endif() + endif() ++ if (CUDA_MPI_FOUND) ++ message(STATUS "Found OpenMPI with CUDA support built.") ++ else() ++ message(WARNING "OpenMPI found, but it is not built with CUDA support.") ++ set(CAFFE2_FORCE_FALLBACK_CUDA_MPI 1) ++ endif() + else() + message(WARNING "Not compiling with MPI. Suppress this warning with -DUSE_MPI=OFF") + caffe2_update_option(USE_MPI OFF) From 953d5a89529d2052fc39ebead1d121962e12c088 Mon Sep 17 00:00:00 2001 From: Ake Sandgren Date: Fri, 3 Dec 2021 15:05:48 +0100 Subject: [PATCH 2/3] Fix checksum for PyTorch-1.10.0.tar.gz, it is generated so no checksum. --- .../p/PyTorch/PyTorch-1.10.0-foss-2021a-CUDA-11.3.1.eb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0-foss-2021a-CUDA-11.3.1.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0-foss-2021a-CUDA-11.3.1.eb index 1ad536ddeaf6..c0ceb34e0eb0 100644 --- a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0-foss-2021a-CUDA-11.3.1.eb +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0-foss-2021a-CUDA-11.3.1.eb @@ -36,7 +36,7 @@ patches = [ 'PyTorch-1.10.0_detect_ucx_cuda.patch', ] checksums = [ - 'd15a2702256e6739cb77c8e58e060c0eecb7340c654e40cbf280c87791ab5dd0', # PyTorch-1.10.0.tar.gz + None, # can't add proper SHA256 checksum, because source tarball is created locally after recursive 'git clone' 'b899aa94d9e60f11ee75a706563312ccefa9cf432756c470caa8e623991c8f18', # PyTorch-1.7.0_avoid-nan-in-test-torch.patch '622cb1eaeadc06e13128a862d9946bcc1f1edd3d02b259c56a9aecc4d5406b8a', # PyTorch-1.7.0_disable-dev-shm-test.patch # PyTorch-1.7.1_correctly-pass-jit_opt_level.patch From 2adab88a711a7f42f984ab73ff92c8b1c8653e2e Mon Sep 17 00:00:00 2001 From: Ake Sandgren Date: Mon, 6 Dec 2021 07:40:56 +0100 Subject: [PATCH 3/3] Fix space before inline comment. --- .../p/PyTorch/PyTorch-1.10.0-foss-2021a-CUDA-11.3.1.eb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0-foss-2021a-CUDA-11.3.1.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0-foss-2021a-CUDA-11.3.1.eb index c0ceb34e0eb0..76aacc69a165 100644 --- a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0-foss-2021a-CUDA-11.3.1.eb +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0-foss-2021a-CUDA-11.3.1.eb @@ -36,7 +36,7 @@ patches = [ 'PyTorch-1.10.0_detect_ucx_cuda.patch', ] checksums = [ - None, # can't add proper SHA256 checksum, because source tarball is created locally after recursive 'git clone' + None, # can't add proper SHA256 checksum, because source tarball is created locally after recursive 'git clone' 'b899aa94d9e60f11ee75a706563312ccefa9cf432756c470caa8e623991c8f18', # PyTorch-1.7.0_avoid-nan-in-test-torch.patch '622cb1eaeadc06e13128a862d9946bcc1f1edd3d02b259c56a9aecc4d5406b8a', # PyTorch-1.7.0_disable-dev-shm-test.patch # PyTorch-1.7.1_correctly-pass-jit_opt_level.patch