Skip to content
154 changes: 154 additions & 0 deletions easybuild/easyconfigs/p/PyTorch/PyTorch-1.13.1-foss-2022b.eb
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
# EasyBuild easyconfig for PyTorch 1.13.1 built with the foss/2022b toolchain.
# The `name` selects EasyBuild's custom PyTorch easyblock, which drives the
# build, the test run (`runtest`/`excluded_tests`/`max_failed_tests`) and the
# post-install `tests` scripts.
name = 'PyTorch'
version = '1.13.1'

homepage = 'https://pytorch.org/'
description = """Tensors and Dynamic neural networks in Python with strong GPU acceleration.
PyTorch is a deep learning framework that puts Python first."""

toolchain = {'name': 'foss', 'version': '2022b'}

source_urls = [GITHUB_RELEASE]
sources = ['%(namelower)s-v%(version)s.tar.gz']
# Patches: fixes carried over from earlier PyTorch easyconfigs (1.7–1.12)
# plus 1.13.1-specific ones, mainly GCC 12 compatibility, a protobuf linking
# fix, numpy 1.24 deprecation fixes, and skips/tweaks for flaky or
# environment-dependent tests.
patches = [
'PyTorch-1.7.0_disable-dev-shm-test.patch',
'PyTorch-1.10.0_fix-kineto-crash.patch',
'PyTorch-1.11.0_fix-fp16-quantization-without-fbgemm.patch',
'PyTorch-1.11.1_skip-test_init_from_local_shards.patch',
'PyTorch-1.12.0_fix-EmbeddingBag-without-fbgemm.patch',
'PyTorch-1.12.1_add-hypothesis-suppression.patch',
'PyTorch-1.12.1_fix-skip-decorators.patch',
'PyTorch-1.12.1_fix-test_cpp_extensions_jit.patch',
'PyTorch-1.12.1_fix-test_wishart_log_prob.patch',
'PyTorch-1.12.1_fix-TestTorch.test_to.patch',
'PyTorch-1.12.1_fix-use-after-free-in-tensorpipe-agent.patch',
'PyTorch-1.12.1_fix-vsx-loadu.patch',
'PyTorch-1.12.1_fix-vsx-vector-funcs.patch',
'PyTorch-1.12.1_skip-test_round_robin.patch',
'PyTorch-1.13.1_disable-test-sharding.patch',
'PyTorch-1.13.1_fix-flaky-jit-test.patch',
'PyTorch-1.13.1_fix-fsdp-fp16-test.patch',
'PyTorch-1.13.1_fix-gcc-12-missing-includes.patch',
'PyTorch-1.13.1_fix-gcc-12-warning-in-fbgemm.patch',
'PyTorch-1.13.1_fix-numpy-deprecations.patch',
'PyTorch-1.13.1_fix-protobuf-dependency.patch',
'PyTorch-1.13.1_fix-pytest-args.patch',
'PyTorch-1.13.1_fix-python-3.11-compat.patch',
'PyTorch-1.13.1_fix-test-ops-conf.patch',
'PyTorch-1.13.1_fix-warning-in-test-cpp-api.patch',
'PyTorch-1.13.1_increase-tolerance-test_ops.patch',
'PyTorch-1.13.1_install-vsx-vec-headers.patch',
'PyTorch-1.13.1_no-cuda-stubs-rpath.patch',
'PyTorch-1.13.1_remove-flaky-test-in-testnn.patch',
'PyTorch-1.13.1_skip-failing-grad-test.patch',
'PyTorch-1.13.1_skip-failing-singular-grad-test.patch',
'PyTorch-1.13.1_skip-tests-without-fbgemm.patch',
]
# SHA256 checksums, one entry per file: the source tarball first, then every
# patch in the same order as the `patches` list above.
checksums = [
{'pytorch-v1.13.1.tar.gz': 'dbc229ee9750b02b514937d017744443a269ea0241ed3f32b9af0703589d25d4'},
{'PyTorch-1.7.0_disable-dev-shm-test.patch': '622cb1eaeadc06e13128a862d9946bcc1f1edd3d02b259c56a9aecc4d5406b8a'},
{'PyTorch-1.10.0_fix-kineto-crash.patch': 'dc467333b28162149af8f675929d8c6bf219f23230bfc0d39af02ba4f6f882eb'},
{'PyTorch-1.11.0_fix-fp16-quantization-without-fbgemm.patch':
'cc526130b6446bbbf5f0f7372d3aeee3e7d4c4d6e471524dff028b430b152934'},
{'PyTorch-1.11.1_skip-test_init_from_local_shards.patch':
'4aeb1b0bc863d4801b0095cbce69f8794066748f0df27c6aaaf729c5ecba04b7'},
{'PyTorch-1.12.0_fix-EmbeddingBag-without-fbgemm.patch':
'090598592283e3fc46ee08a68b6a6afe07be41b26514afba51834408bf1c98ed'},
{'PyTorch-1.12.1_add-hypothesis-suppression.patch':
'e71ffb94ebe69f580fa70e0de84017058325fdff944866d6bd03463626edc32c'},
{'PyTorch-1.12.1_fix-skip-decorators.patch': 'e3ca6e42b2fa592ea095939fb59ab875668a058479407db3f3684cc5c6f4146c'},
{'PyTorch-1.12.1_fix-test_cpp_extensions_jit.patch':
'1efc9850c431d702e9117d4766277d3f88c5c8b3870997c9974971bce7f2ab83'},
{'PyTorch-1.12.1_fix-test_wishart_log_prob.patch':
'cf475ae6e6234b96c8d1bf917597c5176c94b3ccd940b72f2e1cd0c979580f45'},
{'PyTorch-1.12.1_fix-TestTorch.test_to.patch': '75f27987c3f25c501e719bd2b1c70a029ae0ee28514a97fe447516aee02b1535'},
{'PyTorch-1.12.1_fix-use-after-free-in-tensorpipe-agent.patch':
'0bd7e88b92c4c6f0fecf01746009858ba19f2df68b10b88c41485328a531875d'},
{'PyTorch-1.12.1_fix-vsx-loadu.patch': '8bfe3c94ada1dd1f7974a1261a8b576fb7ae944050fa1c7830fca033831123b2'},
{'PyTorch-1.12.1_fix-vsx-vector-funcs.patch': 'caccbf60f62eac313896c1eaec78b08f5d0fdfcb907079087490bb13d1561aa2'},
{'PyTorch-1.12.1_skip-test_round_robin.patch': '63d4849b78605aa088fdff695637d9473ea60dee603a3ff7f788690d70c55349'},
{'PyTorch-1.13.1_disable-test-sharding.patch': 'df2074adeba47998ce2993d99ca64eb6f1c79ab7057f553b436efdec264d3572'},
{'PyTorch-1.13.1_fix-flaky-jit-test.patch': '71efdeb29b5e5b4982c9f5cb2182733654a34d52f85bb5487bc4d7d99b86101b'},
{'PyTorch-1.13.1_fix-fsdp-fp16-test.patch': '8ae68e60d6e1f92f50322b7f0381c7e65251fba32d7606e3a238a36a2f55b5cf'},
{'PyTorch-1.13.1_fix-gcc-12-missing-includes.patch':
'18df8c61ecaa9fb659346c1e172828bca6b069f0145bb8f6a36b0a23b7bef0a6'},
{'PyTorch-1.13.1_fix-gcc-12-warning-in-fbgemm.patch':
'5c7be91a6096083a0b1315efe0001537499c600f1f569953c6a2c7f4cc1d0910'},
{'PyTorch-1.13.1_fix-numpy-deprecations.patch':
'f461b570efe0434ddd806bf2fa7020eb213e3ed89d0eb4403e076f4276ba2a46'},
{'PyTorch-1.13.1_fix-protobuf-dependency.patch':
'8bd755a0cab7233a243bc65ca57c9630dfccdc9bf8c9792f0de4e07a644fcb00'},
{'PyTorch-1.13.1_fix-pytest-args.patch': 'd3e3c841cf8d73683750f29326f2be56ee0bb5df7ff522baf7d7c3f301a91ec2'},
{'PyTorch-1.13.1_fix-python-3.11-compat.patch':
'fa4eb0e27e00a90bb217b77c0023089c4659c03f37d781ab4a681bdcb4f0432f'},
{'PyTorch-1.13.1_fix-test-ops-conf.patch': 'df652eec7753864ebebbfeca546929a53e3fb8f24259d5c9b964266a8551198c'},
{'PyTorch-1.13.1_fix-warning-in-test-cpp-api.patch':
'bdde0f2105215c95a54de64ec4b1a4520528510663174fef6d5b900eb1db3937'},
{'PyTorch-1.13.1_increase-tolerance-test_ops.patch':
'c909fdfc2b12df457e1eb5514265ffec3eab653994949416f3f048668421e223'},
{'PyTorch-1.13.1_install-vsx-vec-headers.patch':
'7b678f54bb947afd4767f5877ac424b4b94ce5db609ea20f5a869ccf4027035f'},
{'PyTorch-1.13.1_no-cuda-stubs-rpath.patch': '4c636059850fc9d1ecb27ce275f8aad5d5b6fdc19e35aff0c25b86cb3201352a'},
{'PyTorch-1.13.1_remove-flaky-test-in-testnn.patch':
'be83ff61fe2dedab6d49c232936d5622df81ab49154264490021c6c828e53315'},
{'PyTorch-1.13.1_skip-failing-grad-test.patch': '6681200f9509893cb9231b5c93ac9bc5e6d9d9ae4febefca52e7cbc843ba8f51'},
{'PyTorch-1.13.1_skip-failing-singular-grad-test.patch':
'72688a57b2bb617665ad1a1d5e362c5111ae912c10936bb38a089c0204729f48'},
{'PyTorch-1.13.1_skip-tests-without-fbgemm.patch':
'481e595f673baf8ae58b41697a6792b83048b0264aa79b422f48cd8c22948bb7'},
]

# OS-level package requirement; one of the included patches touches the gloo
# ibverbs transport, which needs these development headers at build time.
osdependencies = [OS_PKG_IBVERBS_DEV]

builddependencies = [
('CMake', '3.24.3'),
('hypothesis', '6.68.2'),
# For tests
('pytest-rerunfailures', '12.0'),
('pytest-shard', '0.1.2'),
]

# Runtime dependencies; versions match the foss/2022b toolchain generation
# (Python 3.10.8, SciPy-bundle 2023.02, protobuf 23.0 / protobuf-python 4.23.0).
dependencies = [
('Ninja', '1.11.1'), # Required for JIT compilation of C++ extensions
('Python', '3.10.8'),
('protobuf', '23.0'),
('protobuf-python', '4.23.0'),
('pybind11', '2.10.3'),
('SciPy-bundle', '2023.02'),
('PyYAML', '6.0'),
('MPFR', '4.2.0'),
('GMP', '6.2.1'),
('numactl', '2.0.16'),
('FFmpeg', '5.1.2'),
('Pillow', '9.4.0'),
('expecttest', '0.1.3'),
]

# Extra option passed to the CMake configure step: build with the C++17 standard.
custom_opts = ['CMAKE_CXX_STANDARD=17']

# Test modules excluded from the test run; the '' key applies to all versions.
excluded_tests = {
'': [
# This test seems to take too long on NVIDIA Ampere at least.
'distributed/test_distributed_spawn',
# Broken on CUDA 11.6/11.7: https://github.com/pytorch/pytorch/issues/75375
'distributions/test_constraints',
# no xdoctest
'doctests',
# failing on broadwell
# See https://github.com/easybuilders/easybuild-easyconfigs/issues/17712
'test_native_mha',
# intermittent failures on various systems
# See https://github.com/easybuilders/easybuild-easyconfigs/issues/17712
'distributed/rpc/test_tensorpipe_agent',
]
}

# Test command template; %(python)s and %(excluded_tests)s are filled in by
# the PyTorch easyblock from the settings above.
runtest = 'cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py --continue-through-error --verbose %(excluded_tests)s'

# Especially test_quantization has a few corner cases that are triggered by the random input values,
# those cannot be easily avoided, see https://github.com/pytorch/pytorch/issues/107030
# So allow a low number of tests to fail as the tests "usually" succeed
max_failed_tests = 2

# Standalone sanity-check script run after installation (verifies that a
# C++ extension can be compiled against the installed PyTorch).
tests = ['PyTorch-check-cpp-extension.py']

moduleclass = 'ai'
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
Our error checking doesn't work well with the parallel/sharded pytorch test.
As the overall gain is low, disable it and always run the full test suite in a single process.
Additionally remove the fail-on-first-error flag (-x) to collect all errors.

Author: Alexander Grund (TU Dresden)

diff --git a/test/run_test.py b/test/run_test.py
index f7c80f3f0a6..9e930d774ed 100755
--- a/test/run_test.py
+++ b/test/run_test.py
@@ -718,18 +718,18 @@ def print_log_file(test: str, file_path: str, failed: bool) -> None:


def run_test_ops(test_module, test_directory, options):
- if 'slow-gradcheck' in os.getenv("BUILD_ENVIRONMENT", ""):
+ if True:
# there are a lot of tests that take up a lot of space in slowgrad check, so don't bother parallelizing
# it's also on periodic so we don't care about TTS as much
return run_test(test_module, test_directory, copy.deepcopy(options),
- extra_unittest_args=["--use-pytest", '-vv', '-x', '--reruns=2', '-rfEX'],
+ extra_unittest_args=["--use-pytest", '-vv', '--reruns=2', '-rfEX'],
)
return_codes = []
os.environ["NUM_PARALLEL_PROCS"] = str(NUM_PROCS)
pool = get_context("spawn").Pool(NUM_PROCS)
for i in range(NUM_PROCS):
return_code = pool.apply_async(run_test, args=(test_module, test_directory, copy.deepcopy(options)),
- kwds={"extra_unittest_args": ["--use-pytest", '-vv', '-x', '--reruns=2', '-rfEX',
+ kwds={"extra_unittest_args": ["--use-pytest", '-vv', '--reruns=2', '-rfEX',
f'--shard-id={i}', f'--num-shards={NUM_PROCS}',
"-k=not _linalg_cholesky_"],
})
@@ -742,7 +742,7 @@ def run_test_ops(test_module, test_directory, options):
if return_code.get() != 0:
return return_code.get()
return_code = run_test(test_module, test_directory, copy.deepcopy(options),
- extra_unittest_args=["--use-pytest", '-vv', '-x', '--reruns=2', '-rfEX',
+ extra_unittest_args=["--use-pytest", '-vv', '--reruns=2', '-rfEX',
"-k=_linalg_cholesky_"],
)
return return_code
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
Add missing headers that are no longer transitively included in GCC 12

Author: Alexander Grund (TU Dresden)

Submodule third_party/gloo contains modified content
diff --git a/third_party/gloo/gloo/transport/ibverbs/pair.h b/third_party/gloo/gloo/transport/ibverbs/pair.h
index 1ccc050..54dbc7e 100644
--- a/third_party/gloo/gloo/transport/ibverbs/pair.h
+++ b/third_party/gloo/gloo/transport/ibverbs/pair.h
@@ -8,6 +8,7 @@

#pragma once

+#include <array>
#include <atomic>
#include <condition_variable>
#include <exception>
diff --git a/third_party/gloo/gloo/transport/tcp/device.cc b/third_party/gloo/gloo/transport/tcp/device.cc
index 05cf0a4..4408d60 100644
--- a/third_party/gloo/gloo/transport/tcp/device.cc
+++ b/third_party/gloo/gloo/transport/tcp/device.cc
@@ -8,6 +8,7 @@

#include "gloo/transport/tcp/device.h"

+#include <array>
#include <ifaddrs.h>
#include <netdb.h>
#include <netinet/in.h>
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
GCC 12 has a regression (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593)
leading to warnings/errors during build: https://github.com/pytorch/FBGEMM/issues/1666
Suppress the affected warning in FBGEMM.

Author: Alexander Grund (TU Dresden)

Submodule third_party/fbgemm contains modified content
diff --git a/third_party/fbgemm/CMakeLists.txt b/third_party/fbgemm/CMakeLists.txt
index 58dcb9ae..c888f0f8 100644
--- a/third_party/fbgemm/CMakeLists.txt
+++ b/third_party/fbgemm/CMakeLists.txt
@@ -147,10 +147,10 @@ else(MSVC)
string(APPEND CMAKE_CXX_FLAGS " -Werror")
string(APPEND CMAKE_CXX_FLAGS " -Wno-deprecated-declarations")
target_compile_options(fbgemm_avx2 PRIVATE
- "-m64" "-mavx2" "-mf16c" "-mfma")
+ "-m64" "-mavx2" "-mf16c" "-mfma" "-Wno-uninitialized")
target_compile_options(fbgemm_avx512 PRIVATE
"-m64" "-mavx2" "-mfma" "-mavx512f" "-mavx512bw" "-mavx512dq"
- "-mavx512vl")
+ "-mavx512vl" "-Wno-uninitialized")
set_source_files_properties(
src/FbgemmFP16UKernelsAvx2.cc
src/FbgemmFP16UKernelsAvx512.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
Numpy 1.24 removed some types deprecated earlier leading to failures in e.g. test_torch:
> ERROR: test_parsing_intlist (__main__.TestTorch)
> ...
> AttributeError: module 'numpy' has no attribute 'float'.

Backported from https://github.com/pytorch/pytorch/pull/93997
Author: Alexander Grund (TU Dresden)

diff --git a/test/quantization/core/test_quantized_op.py b/test/quantization/core/test_quantized_op.py
index 79297e073f0..827a781df10 100644
--- a/test/quantization/core/test_quantized_op.py
+++ b/test/quantization/core/test_quantized_op.py
@@ -3010,7 +3010,7 @@ class TestDynamicQuantizedOps(TestCase):
# W_scale = 1.0
# W_zp = 0
W_scales = np.ones(output_channels)
- W_zps = np.zeros(output_channels).astype(np.int)
+ W_zps = np.zeros(output_channels).astype(int)
W_value_min = -128
W_value_max = 127
W_q0 = np.round(
@@ -3581,9 +3581,9 @@ class TestQuantizedLinear(TestCase):
# xnnpack forces W_zp to 0 when using symmetric quantization
# ONEDNN only supports symmetric quantization of weight
if dtype == torch.qint8 or qengine_is_onednn():
- W_zps = np.zeros(output_channels).astype(np.int)
+ W_zps = np.zeros(output_channels).astype(int)
else:
- W_zps = np.round(np.random.rand(output_channels) * 100 - 50).astype(np.int)
+ W_zps = np.round(np.random.rand(output_channels) * 100 - 50).astype(int)
# when using symmetric quantization
# special restriction for xnnpack fully connected op weight
# [-127, 127] instead of [-128, 127]
diff --git a/test/test_reductions.py b/test/test_reductions.py
index 0e36906f25f..20fcb6ef9ae 100644
--- a/test/test_reductions.py
+++ b/test/test_reductions.py
@@ -1323,7 +1323,7 @@ class TestReductions(TestCase):
vals = [[True, True], [True, False], [False, False], []]
for val in vals:
result = torch.prod(torch.tensor(val, device=device), dtype=torch.bool).item()
- expect = np.prod(np.array(val), dtype=np.bool)
+ expect = np.prod(np.array(val), dtype=bool)
self.assertEqual(result, expect)

result = torch.prod(torch.tensor(val, device=device)).item()
diff --git a/test/test_tensor_creation_ops.py b/test/test_tensor_creation_ops.py
index aab26452496..a71761ce670 100644
--- a/test/test_tensor_creation_ops.py
+++ b/test/test_tensor_creation_ops.py
@@ -1440,14 +1440,14 @@ class TestTensorCreation(TestCase):
def test_ctor_with_numpy_array(self, device):
correct_dtypes = [
np.double,
- np.float,
+ float,
np.float16,
np.int64,
np.int32,
np.int16,
np.int8,
np.uint8,
- np.bool,
+ bool,
]

incorrect_byteorder = '>' if sys.byteorder == 'little' else '<'
diff --git a/test/test_tensorboard.py b/test/test_tensorboard.py
index e836b0f1ba8..0857873a5fa 100644
--- a/test/test_tensorboard.py
+++ b/test/test_tensorboard.py
@@ -796,7 +796,7 @@ class TestTensorBoardNumpy(BaseTestCase):
model = ModelHelper(name="mnist")
# how come those inputs don't break the forward pass =.=a
workspace.FeedBlob("data", np.random.randn(1, 3, 64, 64).astype(np.float32))
- workspace.FeedBlob("label", np.random.randn(1, 1000).astype(np.int))
+ workspace.FeedBlob("label", np.random.randn(1, 1000).astype(int))

with core.NameScope("conv1"):
conv1 = brew.conv(model, "data", 'conv1', dim_in=1, dim_out=20, kernel=5)
@@ -831,7 +831,7 @@ class TestTensorBoardNumpy(BaseTestCase):
def test_caffe2_simple_cnnmodel(self):
model = cnn.CNNModelHelper("NCHW", name="overfeat")
workspace.FeedBlob("data", np.random.randn(1, 3, 64, 64).astype(np.float32))
- workspace.FeedBlob("label", np.random.randn(1, 1000).astype(np.int))
+ workspace.FeedBlob("label", np.random.randn(1, 1000).astype(int))
with core.NameScope("conv1"):
conv1 = model.Conv("data", "conv1", 3, 96, 11, stride=4)
relu1 = model.Relu(conv1, conv1)
diff --git a/test/test_torch.py b/test/test_torch.py
index 8de5b822d00..3121e256b21 100644
--- a/test/test_torch.py
+++ b/test/test_torch.py
@@ -6182,7 +6182,7 @@ class TestTorch(TestCase):
# fail parse with float variables
self.assertRaises(TypeError, lambda: torch.ones((torch.tensor(3.), torch.tensor(4))))
# fail parse with numpy floats
- self.assertRaises(TypeError, lambda: torch.ones((np.float(3.), torch.tensor(4))))
+ self.assertRaises(TypeError, lambda: torch.ones((3., torch.tensor(4))))
self.assertRaises(TypeError, lambda: torch.ones((np.array(3.), torch.tensor(4))))

# fail parse with > 1 element variables
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
Add proper dependency in libprotobuf target to get the correct set of flags.
Otherwise the build will fail with e.g.:
undefined reference to `google::protobuf::internal::ThreadSafeArena::thread_cache_'
This is caused by missing the `PROTOBUF_USE_DLLS` define required for libprotobuf as a shared library.
See https://github.com/pytorch/pytorch/issues/106297

Author: Alexander Grund (TU Dresden)

diff --git a/caffe2/proto/CMakeLists.txt b/caffe2/proto/CMakeLists.txt
index ba6b696dde4..5033e228119 100644
--- a/caffe2/proto/CMakeLists.txt
+++ b/caffe2/proto/CMakeLists.txt
@@ -7,6 +7,7 @@ endif()
caffe2_protobuf_generate_cpp_py(Caffe2_PROTO_SRCS Caffe2_PROTO_HEADERS Caffe2_PROTO_PY ${Caffe2_PROTOBUF_FILES})

add_library(Caffe2_PROTO OBJECT ${Caffe2_PROTO_HEADERS} ${Caffe2_PROTO_SRCS})
+target_link_libraries(Caffe2_PROTO PRIVATE protobuf::libprotobuf)

if(MSVC)
if(BUILD_SHARED_LIBS)
diff --git a/cmake/ProtoBuf.cmake b/cmake/ProtoBuf.cmake
index 8d7633c4ab0..fb0e9a0c5f7 100644
--- a/cmake/ProtoBuf.cmake
+++ b/cmake/ProtoBuf.cmake
@@ -122,10 +122,6 @@ if((NOT TARGET protobuf::libprotobuf) AND (NOT TARGET protobuf::libprotobuf-lite
# "Please set the proper paths so that I can find protobuf correctly.")
endif()

-get_target_property(__tmp protobuf::libprotobuf INTERFACE_INCLUDE_DIRECTORIES)
-message(STATUS "Caffe2 protobuf include directory: " ${__tmp})
-include_directories(BEFORE SYSTEM ${__tmp})
-
# If Protobuf_VERSION is known (true in most cases, false if we are building
# local protobuf), then we will add a protobuf version check in
# Caffe2Config.cmake.in.
Loading