From 6c142a7f67bfe4c7a39b304958d478cda4f753a7 Mon Sep 17 00:00:00 2001
From: Ake Sandgren <ake.sandgren@hpc2n.umu.se>
Date: Fri, 3 Dec 2021 13:49:16 +0100
Subject: [PATCH 1/3] Make CUDA-aware-OpenMPI detection aware of UCX-CUDA

---
 .../PyTorch-1.10.0-foss-2021a-CUDA-11.3.1.eb  | 26 ++++++------
 .../PyTorch-1.10.0_detect_ucx_cuda.patch      | 42 +++++++++++++++++++
 2 files changed, 54 insertions(+), 14 deletions(-)
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0_detect_ucx_cuda.patch

diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0-foss-2021a-CUDA-11.3.1.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0-foss-2021a-CUDA-11.3.1.eb
index cecec914c702..1ad536ddeaf6 100644
--- a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0-foss-2021a-CUDA-11.3.1.eb
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0-foss-2021a-CUDA-11.3.1.eb
@@ -33,9 +33,10 @@ patches = [
     'PyTorch-1.10.0_skip_failing_ops_tests.patch',
     'PyTorch-1.10.0_skip_nan_tests_openblas.patch',
     'PyTorch-1.10.0_skip_cmake_rpath.patch',
+    'PyTorch-1.10.0_detect_ucx_cuda.patch',
 ]
 checksums = [
-    None,  # can't add proper SHA256 checksum, because source tarball is created locally after recursive 'git clone'
+    'd15a2702256e6739cb77c8e58e060c0eecb7340c654e40cbf280c87791ab5dd0',  # PyTorch-1.10.0.tar.gz
     'b899aa94d9e60f11ee75a706563312ccefa9cf432756c470caa8e623991c8f18',  # PyTorch-1.7.0_avoid-nan-in-test-torch.patch
     '622cb1eaeadc06e13128a862d9946bcc1f1edd3d02b259c56a9aecc4d5406b8a',  # PyTorch-1.7.0_disable-dev-shm-test.patch
     # PyTorch-1.7.1_correctly-pass-jit_opt_level.patch
@@ -51,18 +52,14 @@ checksums = [
     '426c9ead1a74b656748d4c8bf8afd4303d8b9f2394ad22b21a845d07c8ca1d12',
     # PyTorch-1.10.0_fix-faulty-asserts-and-skip-test.patch
     '67152215e4530a9b1d7349fb20864445fd815288f04ab9e96e45c73b2d87827a',
-    # PyTorch-1.10.0_fix-test-cond-cpu.patch
-    '51f83f5d5ef69656ef35b73f17e0671e70113798421be11ea4c7b56ffcc4da03',
-    # PyTorch-1.10.0_fix-vnni-detection.patch
-    '1f3664c0febfa2a3fc4c0cd3bae185f289716ac0b6c3d7e8fa1cee19ba62b7cc',
+    '51f83f5d5ef69656ef35b73f17e0671e70113798421be11ea4c7b56ffcc4da03',  # PyTorch-1.10.0_fix-test-cond-cpu.patch
+    '1f3664c0febfa2a3fc4c0cd3bae185f289716ac0b6c3d7e8fa1cee19ba62b7cc',  # PyTorch-1.10.0_fix-vnni-detection.patch
     # PyTorch-1.10.0_increase_zero_optimizer_test_tolerance.patch
     'e65afb01786f7f030ccb5faada1eb474bb0c418bcadcf1baaa71a4fa2f3f4240',
-    # PyTorch-1.10.0_skip_failing_ops_tests.patch
-    '399af94ffcef4a6db5226552c46f11e9b0f0f371b2d7924b9e5764d2281581ab',
-    # PyTorch-1.10.0_skip_nan_tests_openblas.patch
-    '7d3f83e3056d9e47a460790313238f28708beb596cafaa7ae55e374d368bbedf',
-    # PyTorch-1.10.0_skip_cmake_rpath.patch
-    'ac05943bb205623f91ef140aa00869efc5fe844184bd666bebf5405808610448',
+    '399af94ffcef4a6db5226552c46f11e9b0f0f371b2d7924b9e5764d2281581ab',  # PyTorch-1.10.0_skip_failing_ops_tests.patch
+    '7d3f83e3056d9e47a460790313238f28708beb596cafaa7ae55e374d368bbedf',  # PyTorch-1.10.0_skip_nan_tests_openblas.patch
+    'ac05943bb205623f91ef140aa00869efc5fe844184bd666bebf5405808610448',  # PyTorch-1.10.0_skip_cmake_rpath.patch
+    'aa9d2dc7b090ec40011ad37b884f91fade20d49af7c4a090c1d8a270806f0ae1',  # PyTorch-1.10.0_detect_ucx_cuda.patch
 ]
 
 osdependencies = [OS_PKG_IBVERBS_DEV]
@@ -74,6 +71,7 @@ builddependencies = [
 
 dependencies = [
     ('CUDA', '11.3.1', '', True),
+    ('UCX-CUDA', '1.10.0', versionsuffix),
     ('Ninja', '1.10.2'),  # Required for JIT compilation of C++ extensions
     ('Python', '3.9.5'),
     ('protobuf', '3.17.3'),
@@ -87,9 +85,9 @@ dependencies = [
     ('numactl', '2.0.14'),
     ('FFmpeg', '4.3.2'),
     ('Pillow', '8.2.0'),
-    ('cuDNN', '8.2.1.32', '-CUDA-%(cudaver)s', True),
-    ('magma', '2.6.1', '-CUDA-%(cudaver)s'),
-    ('NCCL', '2.10.3', '-CUDA-%(cudaver)s'),
+    ('cuDNN', '8.2.1.32', versionsuffix, True),
+    ('magma', '2.6.1', versionsuffix),
+    ('NCCL', '2.10.3', versionsuffix),
     ('expecttest', '0.1.3'),
 ]
 
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0_detect_ucx_cuda.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0_detect_ucx_cuda.patch
new file mode 100644
index 000000000000..dface972c4f1
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0_detect_ucx_cuda.patch
@@ -0,0 +1,42 @@
+Make PyTorch CUDA-aware-OpenMPI detection aware of UCX.
+The old detection relies on the old pre-UCX smcuda btl from ompi_info.
+Use ucx_info -d to find CUDA enabled UCX if there is no smcuda.
+ucx_info -d only shows cuda_cpy when run on a node with a GPU available.
+
+Åke Sandgren, 2021-12-03
+diff -ru pytorch.orig/cmake/Dependencies.cmake pytorch/cmake/Dependencies.cmake
+--- pytorch.orig/cmake/Dependencies.cmake	2021-11-29 09:13:59.000000000 +0100
++++ pytorch/cmake/Dependencies.cmake	2021-12-03 13:37:47.646865417 +0100
+@@ -1072,16 +1072,29 @@
+     find_program(OMPI_INFO
+       NAMES ompi_info
+       HINTS ${MPI_CXX_LIBRARIES}/../bin)
++    set(CUDA_MPI_FOUND False)
+     if(OMPI_INFO)
+       execute_process(COMMAND ${OMPI_INFO}
+                       OUTPUT_VARIABLE _output)
+       if(_output MATCHES "smcuda")
+-        message(STATUS "Found OpenMPI with CUDA support built.")
++        set(CUDA_MPI_FOUND True)
+       else()
+-        message(WARNING "OpenMPI found, but it is not built with CUDA support.")
+-        set(CAFFE2_FORCE_FALLBACK_CUDA_MPI 1)
++        find_program(UCX_INFO NAMES ucx_info)
++        if(UCX_INFO)
++          execute_process(COMMAND ${UCX_INFO} -d
++                          OUTPUT_VARIABLE _output)
++          if(_output MATCHES "cuda_cpy")
++            set(CUDA_MPI_FOUND True)
++          endif()
++        endif()
+       endif()
+     endif()
++    if (CUDA_MPI_FOUND)
++      message(STATUS "Found OpenMPI with CUDA support built.")
++    else()
++      message(WARNING "OpenMPI found, but it is not built with CUDA support.")
++      set(CAFFE2_FORCE_FALLBACK_CUDA_MPI 1)
++    endif()
+   else()
+     message(WARNING "Not compiling with MPI. Suppress this warning with -DUSE_MPI=OFF")
+     caffe2_update_option(USE_MPI OFF)

From 953d5a89529d2052fc39ebead1d121962e12c088 Mon Sep 17 00:00:00 2001
From: Ake Sandgren <ake.sandgren@hpc2n.umu.se>
Date: Fri, 3 Dec 2021 15:05:48 +0100
Subject: [PATCH 2/3] Drop checksum for PyTorch-1.10.0.tar.gz: the tarball is
 generated locally, so it has no stable checksum.

---
 .../p/PyTorch/PyTorch-1.10.0-foss-2021a-CUDA-11.3.1.eb          | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0-foss-2021a-CUDA-11.3.1.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0-foss-2021a-CUDA-11.3.1.eb
index 1ad536ddeaf6..c0ceb34e0eb0 100644
--- a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0-foss-2021a-CUDA-11.3.1.eb
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0-foss-2021a-CUDA-11.3.1.eb
@@ -36,7 +36,7 @@ patches = [
     'PyTorch-1.10.0_detect_ucx_cuda.patch',
 ]
 checksums = [
-    'd15a2702256e6739cb77c8e58e060c0eecb7340c654e40cbf280c87791ab5dd0',  # PyTorch-1.10.0.tar.gz
+    None, # can't add proper SHA256 checksum, because source tarball is created locally after recursive 'git clone'
     'b899aa94d9e60f11ee75a706563312ccefa9cf432756c470caa8e623991c8f18',  # PyTorch-1.7.0_avoid-nan-in-test-torch.patch
     '622cb1eaeadc06e13128a862d9946bcc1f1edd3d02b259c56a9aecc4d5406b8a',  # PyTorch-1.7.0_disable-dev-shm-test.patch
     # PyTorch-1.7.1_correctly-pass-jit_opt_level.patch

From 2adab88a711a7f42f984ab73ff92c8b1c8653e2e Mon Sep 17 00:00:00 2001
From: Ake Sandgren <ake.sandgren@hpc2n.umu.se>
Date: Mon, 6 Dec 2021 07:40:56 +0100
Subject: [PATCH 3/3] Fix space before inline comment.

---
 .../p/PyTorch/PyTorch-1.10.0-foss-2021a-CUDA-11.3.1.eb          | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0-foss-2021a-CUDA-11.3.1.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0-foss-2021a-CUDA-11.3.1.eb
index c0ceb34e0eb0..76aacc69a165 100644
--- a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0-foss-2021a-CUDA-11.3.1.eb
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0-foss-2021a-CUDA-11.3.1.eb
@@ -36,7 +36,7 @@ patches = [
     'PyTorch-1.10.0_detect_ucx_cuda.patch',
 ]
 checksums = [
-    None, # can't add proper SHA256 checksum, because source tarball is created locally after recursive 'git clone'
+    None,  # can't add proper SHA256 checksum, because source tarball is created locally after recursive 'git clone'
     'b899aa94d9e60f11ee75a706563312ccefa9cf432756c470caa8e623991c8f18',  # PyTorch-1.7.0_avoid-nan-in-test-torch.patch
     '622cb1eaeadc06e13128a862d9946bcc1f1edd3d02b259c56a9aecc4d5406b8a',  # PyTorch-1.7.0_disable-dev-shm-test.patch
     # PyTorch-1.7.1_correctly-pass-jit_opt_level.patch