diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0_fix-XNNPACK-tests.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0_fix-XNNPACK-tests.patch new file mode 100644 index 000000000000..c9a82a56164b --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0_fix-XNNPACK-tests.patch @@ -0,0 +1,236 @@ +When XNNPACK is disabled tests run into assertion failures. +Fix by using (most of) https://github.com/pytorch/pytorch/pull/72642 + +Author: Digant Desai + +Backported by Alexander Grund (TU Dresden) + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 0c11507838..5b74b7f63c 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -275,7 +275,14 @@ option(USE_LITE_INTERPRETER_PROFILER "Enable " ON) + option(USE_VULKAN_FP16_INFERENCE "Vulkan - Use fp16 inference" OFF) + option(USE_VULKAN_RELAXED_PRECISION "Vulkan - Use relaxed precision math in the kernels (mediump)" OFF) + option(USE_VULKAN_SHADERC_RUNTIME "Vulkan - Use runtime shader compilation as opposed to build-time (needs libshaderc)" OFF) +-option(USE_XNNPACK "Use XNNPACK" ON) ++# option USE_XNNPACK: try to enable xnnpack by default. ++set(XNNPACK_MIN_CMAKE_VER 3.12) ++cmake_dependent_option( ++ USE_XNNPACK "Use XNNPACK. Requires cmake >= ${XNNPACK_MIN_CMAKE_VER}." ON ++ "CMAKE_VERSION VERSION_GREATER_EQUAL ${XNNPACK_MIN_CMAKE_VER}" OFF) ++if(NOT USE_XNNPACK AND CMAKE_VERSION VERSION_LESS ${XNNPACK_MIN_CMAKE_VER}) ++ message(WARNING "USE_XNNPACK is set to OFF. 
XNNPACK requires CMake version ${XNNPACK_MIN_CMAKE_VER} or greater.") ++endif() + option(USE_ZMQ "Use ZMQ" OFF) + option(USE_ZSTD "Use ZSTD" OFF) + # Ensure that an MKLDNN build is the default for x86 CPUs +diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake +index 6d021536c1..7c1ac64539 100644 +--- a/cmake/Summary.cmake ++++ b/cmake/Summary.cmake +@@ -165,6 +165,7 @@ function(caffe2_print_configuration_summary) + message(STATUS " USE_PROF : ${USE_PROF}") + message(STATUS " USE_QNNPACK : ${USE_QNNPACK}") + message(STATUS " USE_PYTORCH_QNNPACK : ${USE_PYTORCH_QNNPACK}") ++ message(STATUS " USE_XNNPACK : ${USE_XNNPACK}") + message(STATUS " USE_REDIS : ${USE_REDIS}") + message(STATUS " USE_ROCKSDB : ${USE_ROCKSDB}") + message(STATUS " USE_ZMQ : ${USE_ZMQ}") +diff --git a/test/jit/test_optimize_for_mobile_preserve_debug_info.py b/test/jit/test_optimize_for_mobile_preserve_debug_info.py +index c08f3b5838..1b93f54c15 100644 +--- a/test/jit/test_optimize_for_mobile_preserve_debug_info.py ++++ b/test/jit/test_optimize_for_mobile_preserve_debug_info.py +@@ -1,8 +1,8 @@ + import torch + import torch._C +-import torch.backends.xnnpack + import torch.nn.functional as F + from torch.testing._internal.jit_utils import JitTestCase ++from torch.testing._internal.common_utils import skipIfNoXNNPACK + + class TestOptimizeForMobilePreserveDebugInfo(JitTestCase): + def check_replacement( +@@ -34,6 +34,7 @@ class TestOptimizeForMobilePreserveDebugInfo(JitTestCase): + original_source_ranges[replacements[node.kind()]], + ) + ++ @skipIfNoXNNPACK + def test_replace_conv1d_with_conv2d(self): + class TestConv1d(torch.nn.Module): + def __init__(self, weight, bias): +@@ -61,6 +62,7 @@ class TestOptimizeForMobilePreserveDebugInfo(JitTestCase): + jit_pass=torch._C._jit_pass_transform_conv1d_to_conv2d, + ) + ++ @skipIfNoXNNPACK + def test_insert_pre_packed_linear_before_inline_and_conv_2d_op(self): + class TestPrepackedLinearBeforeInlineAndConv2dOp(torch.nn.Module): + def __init__( +@@ -137,6 
+139,7 @@ class TestOptimizeForMobilePreserveDebugInfo(JitTestCase): + jit_pass=torch._C._jit_pass_insert_prepacked_ops, + ) + ++ @skipIfNoXNNPACK + def test_insert_pre_packed_linear_op(self): + self.check_replacement( + model=torch.jit.trace(torch.nn.Linear(5, 4), torch.rand(3, 2, 5)), +@@ -228,6 +231,7 @@ class TestOptimizeForMobilePreserveDebugInfo(JitTestCase): + jit_pass=torch._C._jit_pass_fuse_clamp_w_prepacked_linear_conv, + ) + ++ @skipIfNoXNNPACK + def test_fuse_activation_with_pack_ops_linear_conv2d_1(self): + self.run_test_fuse_activation_with_pack_ops_linear_conv2d( + linear_activation=F.hardtanh, +@@ -236,6 +240,7 @@ class TestOptimizeForMobilePreserveDebugInfo(JitTestCase): + conv2d_activation_kind="aten::hardtanh_" + ) + ++ @skipIfNoXNNPACK + def test_fuse_activation_with_pack_ops_linear_conv2d_2(self): + self.run_test_fuse_activation_with_pack_ops_linear_conv2d( + linear_activation=F.hardtanh_, +@@ -244,6 +249,7 @@ class TestOptimizeForMobilePreserveDebugInfo(JitTestCase): + conv2d_activation_kind="aten::hardtanh" + ) + ++ @skipIfNoXNNPACK + def test_fuse_activation_with_pack_ops_linear_conv2d_3(self): + self.run_test_fuse_activation_with_pack_ops_linear_conv2d( + linear_activation=F.relu, +@@ -252,6 +258,7 @@ class TestOptimizeForMobilePreserveDebugInfo(JitTestCase): + conv2d_activation_kind="aten::relu_" + ) + ++ @skipIfNoXNNPACK + def test_fuse_activation_with_pack_ops_linear_conv2d_4(self): + self.run_test_fuse_activation_with_pack_ops_linear_conv2d( + linear_activation=F.relu_, +diff --git a/test/test_mobile_optimizer.py b/test/test_mobile_optimizer.py +index 19f07e2454..21ae5d3ee9 100644 +--- a/test/test_mobile_optimizer.py ++++ b/test/test_mobile_optimizer.py +@@ -1,9 +1,8 @@ + import unittest + import torch + import torch.nn as nn +-import torch.backends.xnnpack + import torch.utils.bundled_inputs +-from torch.testing._internal.common_utils import TestCase, run_tests ++from torch.testing._internal.common_utils import TestCase, run_tests, 
skipIfNoXNNPACK + from torch.testing._internal.jit_utils import get_forward, get_forward_graph + from torch.utils.mobile_optimizer import (LintCode, + generate_mobile_module_lints, +@@ -22,9 +21,7 @@ FileCheck = torch._C.FileCheck + + class TestOptimizer(TestCase): + +- @unittest.skipUnless(torch.backends.xnnpack.enabled, +- " XNNPACK must be enabled for these tests." +- " Please build with USE_XNNPACK=1.") ++ @skipIfNoXNNPACK + def test_optimize_for_mobile(self): + batch_size = 2 + input_channels_per_group = 6 +@@ -263,9 +260,7 @@ class TestOptimizer(TestCase): + rtol=1e-2, + atol=1e-3) + +- @unittest.skipUnless(torch.backends.xnnpack.enabled, +- " XNNPACK must be enabled for these tests." +- " Please build with USE_XNNPACK=1.") ++ @skipIfNoXNNPACK + def test_quantized_conv_no_asan_failures(self): + # There were ASAN failures when fold_conv_bn was run on + # already quantized conv modules. Verifying that this does +@@ -359,6 +354,7 @@ class TestOptimizer(TestCase): + bi_module_lint_list = generate_mobile_module_lints(bi_module) + self.assertEqual(len(bi_module_lint_list), 0) + ++ @skipIfNoXNNPACK + def test_preserve_bundled_inputs_methods(self): + class MyBundledInputModule(torch.nn.Module): + def __init__(self): +@@ -413,9 +409,7 @@ class TestOptimizer(TestCase): + incomplete_bi_module_optim = optimize_for_mobile(incomplete_bi_module, preserved_methods=['get_all_bundled_inputs']) + self.assertTrue(hasattr(incomplete_bi_module_optim, 'get_all_bundled_inputs')) + +- @unittest.skipUnless(torch.backends.xnnpack.enabled, +- " XNNPACK must be enabled for these tests." 
+- " Please build with USE_XNNPACK=1.") ++ @skipIfNoXNNPACK + def test_hoist_conv_packed_params(self): + + if 'qnnpack' not in torch.backends.quantized.supported_engines: +@@ -509,6 +503,7 @@ class TestOptimizer(TestCase): + m_optim_res = m_optim(data) + torch.testing.assert_close(m_res, m_optim_res, rtol=1e-2, atol=1e-3) + ++ @skipIfNoXNNPACK + @unittest.skipUnless(HAS_TORCHVISION, "Needs torchvision") + def test_mobilenet_optimize_for_mobile(self): + m = torchvision.models.mobilenet_v3_small() +diff --git a/test/test_model_dump.py b/test/test_model_dump.py +index 417bb2a91a..e4a9ffa1bd 100644 +--- a/test/test_model_dump.py ++++ b/test/test_model_dump.py +@@ -8,9 +8,10 @@ import urllib + import unittest + + import torch ++import torch.backends.xnnpack + import torch.utils.model_dump + import torch.utils.mobile_optimizer +-from torch.testing._internal.common_utils import TestCase, run_tests, IS_WINDOWS ++from torch.testing._internal.common_utils import TestCase, run_tests, IS_WINDOWS, skipIfNoXNNPACK + from torch.testing._internal.common_quantized import supported_qengines + + +@@ -170,6 +171,7 @@ class TestModelDump(TestCase): + qmodel = self.get_quant_model() + self.do_dump_model(torch.jit.script(qmodel)) + ++ @skipIfNoXNNPACK + @unittest.skipUnless("qnnpack" in supported_qengines, "QNNPACK not available") + def test_optimized_quantized_model(self): + qmodel = self.get_quant_model() +diff --git a/test/test_xnnpack_integration.py b/test/test_xnnpack_integration.py +index a0f8328ec6..a737ed4b8b 100644 +--- a/test/test_xnnpack_integration.py ++++ b/test/test_xnnpack_integration.py +@@ -51,7 +51,6 @@ class TestXNNPACKOps(TestCase): + output_linearprepacked = torch.ops.prepacked.linear_clamp_run(input_data, packed_weight_bias) + torch.testing.assert_close(ref_result, output_linearprepacked, rtol=1e-2, atol=1e-3) + +- + @given(batch_size=st.integers(0, 3), + input_channels_per_group=st.integers(1, 32), + height=st.integers(5, 64), +diff --git 
a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py +index b4ef79620d..18d8925d6a 100644 +--- a/torch/testing/_internal/common_utils.py ++++ b/torch/testing/_internal/common_utils.py +@@ -56,6 +56,7 @@ from torch._six import string_classes + from torch import Tensor + import torch.backends.cudnn + import torch.backends.mkl ++import torch.backends.xnnpack + from enum import Enum + + torch.backends.disable_global_flags() +@@ -908,6 +909,14 @@ def _test_function(fn, device): + return fn(self, device) + return run_test_function + ++def skipIfNoXNNPACK(fn): ++ @wraps(fn) ++ def wrapper(*args, **kwargs): ++ if not torch.backends.xnnpack.enabled: ++ raise unittest.SkipTest('XNNPACK must be enabled for these tests. Please build with USE_XNNPACK=1.') ++ else: ++ fn(*args, **kwargs) ++ return wrapper + + def skipIfNoLapack(fn): + @wraps(fn) diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0_fix-kineto-crash.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0_fix-kineto-crash.patch new file mode 100644 index 000000000000..7858aa1f8cf8 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0_fix-kineto-crash.patch @@ -0,0 +1,24 @@ +Fix a crash during application shutdown visible in test_profiler on some machines. 
+See https://github.com/pytorch/kineto/pull/642 + +Author: Alexander Grund (TU Dresden) + +diff -aur a/third_party/kineto/libkineto/src/EventProfilerController.cpp b/third_party/kineto/libkineto/src/EventProfilerController.cpp +--- a/third_party/kineto/libkineto/src/EventProfilerController.cpp 2022-08-05 13:10:46.175716618 +0200 ++++ b/third_party/kineto/libkineto/src/EventProfilerController.cpp 2022-08-05 13:16:00.654118490 +0200 +@@ -233,9 +233,14 @@ EventProfilerController::~EventProfilerController() { + + // Must be called under lock + void EventProfilerController::start(CUcontext ctx, ConfigLoader& configLoader) { +- profilerMap()[ctx] = unique_ptr( ++ // Avoid static initialization order fiasco: ++ // We need the profilerMap and with it all controllers to be destroyed ++ // before everything the controller accesses gets destroyed. ++ // Hence access the profilerMap after initialization of the controller. ++ auto controller = unique_ptr( + new EventProfilerController( + ctx, configLoader, detail::HeartbeatMonitor::instance())); ++ profilerMap()[ctx] = std::move(controller); + } + + // Must be called under lock diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0_fix-test-model_dump.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0_fix-test-model_dump.patch new file mode 100644 index 000000000000..1891c93e93ff --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0_fix-test-model_dump.patch @@ -0,0 +1,36 @@ +From https://github.com/pytorch/pytorch/pull/84744 + +Author: Alexander Grund (TU Dresden) + +From 4d99f72dcf71bffa7ff4275750cb86d872ac653f Mon Sep 17 00:00:00 2001 +From: Alexander Grund +Date: Fri, 9 Sep 2022 11:10:05 +0200 +Subject: [PATCH] Fix failing test_model_dump due to empty file + +The `torch.jit.save` call on a file object may not actually write the +data to disk due to buffering. 
The call to `model_dump.main` on that +file will when fail with an error like + +> zipfile.BadZipFile: File is not a zip file + +Inspecting the file confirms that it is either empty (usually) or +incomplete (possible). + +Fix this by flushing the file after saving the model. +--- + test/test_model_dump.py | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/test/test_model_dump.py b/test/test_model_dump.py +index a8add0e2cd923..3c682b6ce6805 100644 +--- a/test/test_model_dump.py ++++ b/test/test_model_dump.py +@@ -131,6 +131,8 @@ def test_main(self): + + with tempfile.NamedTemporaryFile() as tf: + torch.jit.save(torch.jit.script(SimpleModel()), tf) ++ # Actually write contents to disk so we can read it below ++ tf.flush() + + stdout = io.StringIO() + torch.utils.model_dump.main( diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0_fix-vsx-vector-functions.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0_fix-vsx-vector-functions.patch new file mode 100644 index 000000000000..7d35ac8d1344 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0_fix-vsx-vector-functions.patch @@ -0,0 +1,39 @@ +This fixes the remaining bug introduced by the VSX optimized code in https://github.com/pytorch/pytorch/pull/41541 + +See https://github.com/pytorch/pytorch/pull/59382 + +Author: Alexander Grund (TU Dresden) + +diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h +index 2427276bce..2b46e0a662 100644 +--- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h ++++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h +@@ -558,27 +558,7 @@ class Vectorized { + } + + Vectorized C10_ALWAYS_INLINE pow(const Vectorized& exp) const { +- auto x = *this; +- auto sign_bit = (*this) & sign_mask; +- // |b| +- auto exp_abs = exp.abs(); +- auto exp_trunc = exp.trunc(); +- Vectorized odd_mask; +- odd_mask._vecb0 = (vec_signed(exp._vec0) & vi_1) != vi_0; +- odd_mask._vecb1 = (vec_signed(exp._vec1) & 
vi_1) != vi_0; +- // using ln fuction +- auto temp = (abs().log() * exp).exp(); +- +- // is odd or even check from Sleef +- auto is_int = (exp == exp_trunc) | (exp_abs >= vcheck); +- auto is_odd = odd_mask & is_int & (exp_abs < vcheck); +- // if even then then pow result should be absolute +- auto temp_sign = temp | sign_bit; // copy_sign +- auto out = blendv(temp, temp_sign, is_odd); +- // x<0 and y != N, then NAN +- auto out1 = blendv(out, v_nan, ((exp.floor() != exp) & (x < zero))); +- // y = 0 then 1 +- return blendv(out1, one, (exp_abs == zero)); ++ return {Sleef_powf4_u10vsx(_vec0, exp._vec0), Sleef_powf4_u10vsx(_vec1, exp._vec1)}; + } + + Vectorized fmod(const Vectorized& b) const { diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0_skip-nnapi-test-without-qnnpack.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0_skip-nnapi-test-without-qnnpack.patch new file mode 100644 index 000000000000..ce5acf4a8844 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0_skip-nnapi-test-without-qnnpack.patch @@ -0,0 +1,30 @@ +When QNNPACK is not available the test will fail trying to use it. +Skip them conditionally. 
+See https://github.com/pytorch/pytorch/pull/82882 + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/test_nnapi.py b/test/test_nnapi.py +index f8db7e1a3d..1d4fd52e28 100644 +--- a/test/test_nnapi.py ++++ b/test/test_nnapi.py +@@ -2,8 +2,10 @@ + import os + import ctypes + import torch ++import unittest + from typing import Tuple + from torch.backends._nnapi.prepare import convert_model_to_nnapi ++from torch.testing._internal.common_quantized import supported_qengines + from torch.testing._internal.common_utils import TestCase, run_tests + + +@@ -18,6 +20,8 @@ def nhwc(t): + return t + + ++@unittest.skipUnless('qnnpack' in supported_qengines, ++ "This Pytorch Build has not been built with or does not support QNNPACK") + class TestNNAPI(TestCase): + + def setUp(self): diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.0-foss-2021a-CUDA-11.3.1.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.0-foss-2021a-CUDA-11.3.1.eb index 5aba12b39575..8385d91b7f0a 100644 --- a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.0-foss-2021a-CUDA-11.3.1.eb +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.0-foss-2021a-CUDA-11.3.1.eb @@ -8,51 +8,75 @@ PyTorch is a deep learning framework that puts Python first.""" toolchain = {'name': 'foss', 'version': '2021a'} -sources = [{ - 'filename': '%(name)s-%(version)s.tar.gz', - 'git_config': { - 'url': 'https://github.com/pytorch', - 'repo_name': 'pytorch', - 'tag': 'v%(version)s', - 'recursive': True, - }, -}] +source_urls = ['https://github.com/%(github_account)s/%(namelower)s/releases/download/v%(version)s'] +sources = ['%(namelower)s-v%(version)s.tar.gz'] + patches = [ 'PyTorch-1.7.0_avoid-nan-in-test-torch.patch', 'PyTorch-1.7.0_disable-dev-shm-test.patch', 'PyTorch-1.8.1_dont-use-gpu-ccc-in-test.patch', - 'PyTorch-1.9.0_limit-world-size-for-zero-redundancy-opt-test.patch', + 'PyTorch-1.10.0_fix-kineto-crash.patch', 'PyTorch-1.10.0_fix-test-dataloader-fixed-affinity.patch', + 'PyTorch-1.10.0_fix-test-model_dump.patch', + 
'PyTorch-1.10.0_fix-vsx-vector-functions.patch', + 'PyTorch-1.10.0_fix-XNNPACK-tests.patch', 'PyTorch-1.10.0_skip_cmake_rpath.patch', - 'PyTorch-1.11.0_increase-distributed-test-timeout.patch', - 'PyTorch-1.11.0_skip_failing_ops_tests.patch', - 'PyTorch-1.11.0_fix_skip_jit_cuda_fuser.patch', + 'PyTorch-1.10.0_skip-nnapi-test-without-qnnpack.patch', + 'PyTorch-1.11.0_disable_failing_jit_cuda_fuser_tests.patch', + 'PyTorch-1.11.0_fix-attention_cpp-compilation.patch', + 'PyTorch-1.11.0_fix-fsdp-fp16-test.patch', 'PyTorch-1.11.0_fix_sharded_imports.patch', - 'PyTorch-1.11.0_increase_test_tolerances_TF32.patch', + 'PyTorch-1.11.0_fix_skip_jit_cuda_fuser.patch', + 'PyTorch-1.11.0_fix-test_utils.patch', 'PyTorch-1.11.0_increase_c10d_gloo_timeout.patch', - 'PyTorch-1.11.0_disable_failing_jit_cuda_fuser_tests.patch', + 'PyTorch-1.11.0_increase-distributed-test-timeout.patch', + 'PyTorch-1.11.0_increase_test_tolerances_TF32.patch', + 'PyTorch-1.11.0_increase-tolerance-test_ops.patch', + 'PyTorch-1.11.0_install-vsx-vec-headers.patch', + 'PyTorch-1.11.0_skip_failing_ops_tests.patch', + 'PyTorch-1.11.1_skip-test_init_from_local_shards.patch', + 'PyTorch-1.11.1_skip-test_sibling_fusion.patch', ] checksums = [ - None, # can't add proper SHA256 checksum, because source tarball is created locally after recursive 'git clone' + 'dc0c2b8d13c112a2b9ea8757a475b0ce2ca97cd19c50a8b70b8c286676616f1d', # pytorch-v1.11.0.tar.gz 'b899aa94d9e60f11ee75a706563312ccefa9cf432756c470caa8e623991c8f18', # PyTorch-1.7.0_avoid-nan-in-test-torch.patch '622cb1eaeadc06e13128a862d9946bcc1f1edd3d02b259c56a9aecc4d5406b8a', # PyTorch-1.7.0_disable-dev-shm-test.patch '89ac7a8e9e7df2e64cf8404fe3a279f5e9b759fee41c9de3aaff9c22f385c2c6', # PyTorch-1.8.1_dont-use-gpu-ccc-in-test.patch - # PyTorch-1.9.0_limit-world-size-for-zero-redundancy-opt-test.patch - 'ff573660913ce055e24cfd194ce747ba5685091c631cfd443eae2a99d56b57ea', + # PyTorch-1.10.0_fix-kineto-crash.patch + 
'dc467333b28162149af8f675929d8c6bf219f23230bfc0d39af02ba4f6f882eb', # PyTorch-1.10.0_fix-test-dataloader-fixed-affinity.patch '313dca681f45ce3bc7c4557fdcdcbe0b77216d2c708fa30a2ec0e22c44876707', + # PyTorch-1.10.0_fix-test-model_dump.patch + '339148ae1a028cda6e750ac93fa38a599f66c7abe26586c9219f1a206ea14557', + # PyTorch-1.10.0_fix-vsx-vector-functions.patch + '7bef5f96cb83b2d655d2f76dd7468a171d446f0b3e06da2232ec7f886484d312', + # PyTorch-1.10.0_fix-XNNPACK-tests.patch + 'd3e749a2a42efce463e3b8a1aebb21f0edf2256682c4417297d9a44a6210e5f8', 'ac05943bb205623f91ef140aa00869efc5fe844184bd666bebf5405808610448', # PyTorch-1.10.0_skip_cmake_rpath.patch + # PyTorch-1.10.0_skip-nnapi-test-without-qnnpack.patch + '34ba476a7bcddec323bf9eca083cb4623d0f569d081aa3add3769c24f22849d2', + # PyTorch-1.11.0_disable_failing_jit_cuda_fuser_tests.patch + 'e7bfe120a8b3fe2b40dac6839852a5fbab3cb3429fbe44a0fc3a1800adaaee51', + # PyTorch-1.11.0_fix-attention_cpp-compilation.patch + '84214fcc7e30cf70659a7c3bd70bf11e73d58fd4f7fff2c233e3225619b0e42c', + 'bb1c4e6d6fd4b0cf57ff8b824c797331b533bb1ffc63f5db0bae3aee10c3dc13', # PyTorch-1.11.0_fix-fsdp-fp16-test.patch + '9a04f4285b800dad8a00c3014af0a9713d40d5dd35d10931c7c0da4e89c558e9', # PyTorch-1.11.0_fix_sharded_imports.patch + '21fc678febcdfbb9dabd72235be23cd392044e9a954f6580d15b530e1f69dcc1', # PyTorch-1.11.0_fix_skip_jit_cuda_fuser.patch + '4f7e25c4e2eb7094f92607df74488c6a4a35849fabf05fcf6c3655fa3f44a861', # PyTorch-1.11.0_fix-test_utils.patch + # PyTorch-1.11.0_increase_c10d_gloo_timeout.patch + '20cd4a8663f74ab326fdb032b926bf5c7e94d9750c515ab9050927ba00cf1953', # PyTorch-1.11.0_increase-distributed-test-timeout.patch '087ad20163a1291773ae3457569b80523080eb3731e210946459b2333a919f3f', - '8eaca92d64fcadb0552d28e9c7ea5c4bc669d2fe33004e45a3519ce8d0d136a2', # PyTorch-1.11.0_skip_failing_ops_tests.patch - '21fc678febcdfbb9dabd72235be23cd392044e9a954f6580d15b530e1f69dcc1', # PyTorch-1.11.0_fix_skip_jit_cuda_fuser.patch - 
'9a04f4285b800dad8a00c3014af0a9713d40d5dd35d10931c7c0da4e89c558e9', # PyTorch-1.11.0_fix_sharded_imports.patch # PyTorch-1.11.0_increase_test_tolerances_TF32.patch '26e179a4f6f57e49209092612ae5f5cd8c03fd2ca84566ba0244eabefc3736ba', - # PyTorch-1.11.0_increase_c10d_gloo_timeout.patch - '20cd4a8663f74ab326fdb032b926bf5c7e94d9750c515ab9050927ba00cf1953', - # PyTorch-1.11.0_disable_failing_jit_cuda_fuser_tests.patch - 'e7bfe120a8b3fe2b40dac6839852a5fbab3cb3429fbe44a0fc3a1800adaaee51', + # PyTorch-1.11.0_increase-tolerance-test_ops.patch + 'ceec745c68a405bba79efb4dc61c662ca84eb950cd0163c7104330f4bf614cf5', + 'f2e6b9625733d9a471bb75e1ea20e28814cf1380b4f9089aa838ee35ddecf07d', # PyTorch-1.11.0_install-vsx-vec-headers.patch + '8eaca92d64fcadb0552d28e9c7ea5c4bc669d2fe33004e45a3519ce8d0d136a2', # PyTorch-1.11.0_skip_failing_ops_tests.patch + # PyTorch-1.11.1_skip-test_init_from_local_shards.patch + '4aeb1b0bc863d4801b0095cbce69f8794066748f0df27c6aaaf729c5ecba04b7', + # PyTorch-1.11.1_skip-test_sibling_fusion.patch + '3d6f6395d98e8e4ad76b0b63c625fddf082cf7f066eb97d4d82401f96dab2555', ] osdependencies = [OS_PKG_IBVERBS_DEV] @@ -93,16 +117,8 @@ excluded_tests = { # Bad tests: https://github.com/pytorch/pytorch/issues/60260 'distributed/elastic/utils/distributed_test', 'distributed/elastic/multiprocessing/api_test', - # These tests fail on A10s at the very least, they time out forever no matter how long the timeout is. - # Possibly related to NCCL 2.8.3: https://docs.nvidia.com/deeplearning/nccl/release-notes/rel_2-8-3.html - # 'distributed/test_distributed_fork', - 'distributed/test_distributed_spawn', # Fails on A10s: https://github.com/pytorch/pytorch/issues/63079 'test_optim', - # Test from this suite timeout often. 
The process group backend is deprecated anyway - # 'distributed/rpc/test_process_group_agent', - # This test fails constently when run as part of the test suite, but succeeds when run interactively - 'test_model_dump', # These tests appear flaky, possibly related to number of GPUs that are used 'distributed/fsdp/test_fsdp_memory', 'distributed/fsdp/test_fsdp_overlap', @@ -112,10 +128,10 @@ excluded_tests = { runtest = 'cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py --continue-through-error --verbose %(excluded_tests)s' # several tests are known to be flaky, and fail in some contexts (like having multiple GPUs available), -# so we allow up to 10 (out of ~90k) tests to fail before treating the installation to be faulty -max_failed_tests = 10 +# so we allow some (out of ~90k) tests to fail before treating the installation as faulty +max_failed_tests = 1 -# The readelf sanity check command can be taken out once the TestRPATH test from +# The readelf sanity check command can be taken out once the TestRPATH test from # https://github.com/pytorch/pytorch/pull/68912 is accepted, since it is then checked as part of the PyTorch test suite local_libcaffe2 = "$EBROOTPYTORCH/lib/python%%(pyshortver)s/site-packages/torch/lib/libcaffe2_nvrtc.%s" % SHLIB_EXT sanity_check_commands = [ diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.0_fix-attention_cpp-compilation.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.0_fix-attention_cpp-compilation.patch new file mode 100644 index 000000000000..751e954f75fb --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.0_fix-attention_cpp-compilation.patch @@ -0,0 +1,61 @@ +Fix failure to compile attention.cpp on some platforms, e.g. POWER. +I.e. 
those 3 errors: + +../aten/src/ATen/native/attention.cpp:145:46: error: call to deleted function 'double& at::vec::CPU_CAPABILITY::Vectorized::operator[](int)' + 145 | hmax = std::max(max_input[i], hmax); +[snip] +../aten/src/ATen/cpu/vec/vec256/vsx/vec256_double_vsx.h:190:11: note: declared here + 190 | double& operator[](int idx) = delete; + +../aten/src/ATen/native/attention.cpp:152:66: error: call to deleted function 'double& at::vec::CPU_CAPABILITY::Vectorized::operator[](int)' + 152 | hsum += std::exp(static_cast(v[i]) - hmax); + +../aten/src/ATen/native/attention.cpp:168:100: error: call to deleted function 'double& at::vec::CPU_CAPABILITY::Vectorized::operator[](int)' + 168 | input_data[t + i] = static_cast(std::exp(static_cast(v[i]) - hmax) * inv_denominator); + +Author: Alexander Grund (TU Dresden) + +diff --git a/aten/src/ATen/native/attention.cpp b/aten/src/ATen/native/attention.cpp +index 188de5e2cd..7f3e8e5af2 100644 +--- a/aten/src/ATen/native/attention.cpp ++++ b/aten/src/ATen/native/attention.cpp +@@ -140,22 +140,18 @@ void masked_softmax_dropout( + max_input = vec::maximum(max_input, v); + } + ++ __at_align__ scalar_t arr_max_input[V]; ++ max_input.store(arr_max_input); + auto hmax = std::numeric_limits::lowest(); + for (auto i = 0; i < V; ++i) { +- hmax = std::max(max_input[i], hmax); ++ hmax = std::max(arr_max_input[i], hmax); + } + accscalar_t hsum = 0; +- for (auto t = 0; t < T; t += V) { +- auto v = Vec::loadu(&input_data[t]); +- // TODO: vectorize in accscalar_t? +- for (auto i = 0; i < V; ++i) { +- hsum += std::exp(static_cast(v[i]) - hmax); +- } ++ for (auto t = 0; t < T; ++t) { ++ hsum += std::exp(static_cast(input_data[t]) - hmax); + } + auto inv_denominator = 1.0 / hsum; +- for (auto t = 0; t < T; t += V) { +- Vec v = Vec::loadu(&input_data[t]); +- ++ for (auto t = 0; t < T; ++t) { + // TODO: vectorize in accscalar_t? 
+ // TODO this faster solution does not work on Android build + /* +@@ -164,9 +160,7 @@ void masked_softmax_dropout( + } + v.store(&input_data[t]); + */ +- for (auto i = 0; i < V; ++i) { +- input_data[t + i] = static_cast(std::exp(static_cast(v[i]) - hmax) * inv_denominator); +- } ++ input_data[t] = static_cast(std::exp(static_cast(input_data[t]) - hmax) * inv_denominator); + } + } + }); diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.0_fix-fsdp-fp16-test.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.0_fix-fsdp-fp16-test.patch new file mode 100644 index 000000000000..dc9159e5cf92 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.0_fix-fsdp-fp16-test.patch @@ -0,0 +1,35 @@ +The test fails on a node with more than 5 V100 GPUs or more than 4 A100 GPUs. +Hence limit the world_size to 4 +See https://github.com/pytorch/pytorch/issues/78975 + +Author: Alexander Grund (TU Dresden) + +diff --git a/torch/testing/_internal/common_fsdp.py b/torch/testing/_internal/common_fsdp.py +index ff0fbb69e3..6bf3e71fde 100644 +--- a/torch/testing/_internal/common_fsdp.py ++++ b/torch/testing/_internal/common_fsdp.py +@@ -324,7 +324,7 @@ class FSDPTest(MultiProcessTestCase): + + @property + def world_size(self): +- return torch.cuda.device_count() if torch.cuda.is_available() else 4 ++ return min(4, torch.cuda.device_count()) if torch.cuda.is_available() else 4 + + @property + def init_method(self): +diff --git a/test/distributed/fsdp/test_fsdp_pure_fp16.py b/test/distributed/fsdp/test_fsdp_pure_fp16.py +index eea03bea3d..d3a4bb8257 100644 +--- a/test/distributed/fsdp/test_fsdp_pure_fp16.py ++++ b/test/distributed/fsdp/test_fsdp_pure_fp16.py +@@ -32,6 +32,11 @@ if TEST_WITH_DEV_DBG_ASAN: + + class TestPureFP16(FSDPTest): + ++ @property ++ def world_size(self): ++ # Test fails due to inaccuracies when using more than 4 GPUs ++ return min(4, super().world_size) ++ + @skip_if_lt_x_gpu(2) + @parametrize( + "cpu_offload", diff --git 
a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.0_fix-test_utils.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.0_fix-test_utils.patch new file mode 100644 index 000000000000..a42166c927c1 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.0_fix-test_utils.patch @@ -0,0 +1,20 @@ +The function tested is dependent on the current locale and may fail on non-English systems. +See https://github.com/pytorch/pytorch/pull/85891 + +Author: Alexander Grund (TU Dresden) + +diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py +index 9c6635f111..d99aae1ca2 100644 +--- a/torch/utils/cpp_extension.py ++++ b/torch/utils/cpp_extension.py +@@ -292,7 +292,9 @@ def check_compiler_ok_for_platform(compiler: str) -> bool: + if any(name in compiler_path for name in _accepted_compilers_for_platform()): + return True + # If compiler wrapper is used try to infer the actual compiler by invoking it with -v flag +- version_string = subprocess.check_output([compiler, '-v'], stderr=subprocess.STDOUT).decode(*SUBPROCESS_DECODE_ARGS) ++ env = os.environ.copy() ++ env['LC_ALL'] = 'C' # Don't localize output ++ version_string = subprocess.check_output([compiler, '-v'], stderr=subprocess.STDOUT, env=env).decode(*SUBPROCESS_DECODE_ARGS) + if IS_LINUX: + # Check for 'gcc' or 'g++' for sccache warpper + pattern = re.compile("^COLLECT_GCC=(.*)$", re.MULTILINE) diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.0_increase-tolerance-test_ops.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.0_increase-tolerance-test_ops.patch new file mode 100644 index 000000000000..69d58f7a3349 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.0_increase-tolerance-test_ops.patch @@ -0,0 +1,22 @@ +`test_out` may fail due to slightly different values caused by different order of matrizes in SGEMM: + +> Mismatched elements: 1 / 50 (2.0%) +> Greatest absolute difference: 1.430511474609375e-05 at index (4, 5) (up to 1e-05 allowed) +> Greatest relative difference: 
4.65393206065873e-06 at index (4, 5) (up to 1.3e-06 allowed) + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/test_ops.py b/test/test_ops.py +index 291361d53d..2e0597376d 100644 +--- a/test/test_ops.py ++++ b/test/test_ops.py +@@ -418,6 +418,9 @@ class TestCommon(TestCase): + self.skipTest("Skipped! Op has not supported dtypes on this device.") + dtype = torch.float32 if torch.float32 in supported_dtypes else list(supported_dtypes)[0] + ++ if dtype is torch.float32: ++ self.precision, self.rel_tol = (1.5e-05, 1e-05) ++ + # NOTE: only tests on first sample + samples = op.sample_inputs(device, dtype) + sample = first_sample(self, samples) diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.0_install-vsx-vec-headers.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.0_install-vsx-vec-headers.patch new file mode 100644 index 000000000000..628a59ec0a2e --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.0_install-vsx-vec-headers.patch @@ -0,0 +1,38 @@ +Add missing headers to the installation which fixes e.g. 
test_cpp_extensions_aot_ninja +See https://github.com/pytorch/pytorch/pull/85547 + +Author: Alexander Grund (TU Dresden) + +diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt +index 5710eca27c..0cd15c2ad2 100644 +--- a/aten/src/ATen/CMakeLists.txt ++++ b/aten/src/ATen/CMakeLists.txt +@@ -56,7 +56,7 @@ if(NOT BUILD_CAFFE2 AND NOT BUILD_LITE_INTERPRETER) + EXCLUDE(ATen_CORE_TEST_SRCS "${ATen_CORE_TEST_SRCS}" ${ATen_CORE_EXCLUDED_TEST_SRCS}) + endif() + +-file(GLOB base_h "*.h" "detail/*.h" "cpu/*.h" "cpu/vec/vec512/*.h" "cpu/vec/vec256/*.h" "cpu/vec/*.h" "quantized/*.h") ++file(GLOB base_h "*.h" "detail/*.h" "cpu/*.h" "cpu/vec/vec512/*.h" "cpu/vec/vec256/*.h" "cpu/vec/vec256/vsx/*.h" "cpu/vec/*.h" "quantized/*.h") + file(GLOB base_cpp "*.cpp" "detail/*.cpp" "cpu/*.cpp") + file(GLOB cuda_h "cuda/*.h" "cuda/detail/*.h" "cuda/*.cuh" "cuda/detail/*.cuh") + file(GLOB cuda_cpp "cuda/*.cpp" "cuda/detail/*.cpp") +diff --git a/setup.py b/setup.py +index f1ebab7391..92de0367ac 100644 +--- a/setup.py ++++ b/setup.py +@@ -942,6 +942,7 @@ if __name__ == '__main__': + 'include/ATen/*.h', + 'include/ATen/cpu/*.h', + 'include/ATen/cpu/vec/vec256/*.h', ++ 'include/ATen/cpu/vec/vec256/vsx/*.h', + 'include/ATen/cpu/vec/vec512/*.h', + 'include/ATen/cpu/vec/*.h', + 'include/ATen/core/*.h', +@@ -1043,6 +1044,7 @@ if __name__ == '__main__': + 'include/THH/*.cuh', + 'include/THH/*.h*', + 'include/THH/generic/*.h', ++ 'include/sleef.h', + 'share/cmake/ATen/*.cmake', + 'share/cmake/Caffe2/*.cmake', + 'share/cmake/Caffe2/public/*.cmake', diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.1_skip-test_init_from_local_shards.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.1_skip-test_init_from_local_shards.patch new file mode 100644 index 000000000000..c63d7193b7ec --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.1_skip-test_init_from_local_shards.patch @@ -0,0 +1,25 @@ +The test often times out and seems to be considered flaky by PyTorch: 
+https://github.com/pytorch/pytorch/issues/78068 + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/distributed/_shard/sharded_tensor/test_sharded_tensor.py b/test/distributed/_shard/sharded_tensor/test_sharded_tensor.py +index cbad9458ae..72a36e0e9a 100644 +--- a/test/distributed/_shard/sharded_tensor/test_sharded_tensor.py ++++ b/test/distributed/_shard/sharded_tensor/test_sharded_tensor.py +@@ -6,6 +6,7 @@ import io + import itertools + import pickle + import sys ++from unittest import skip + import torch + import torch.distributed as dist + from torch.distributed import rpc +@@ -1817,6 +1818,7 @@ class TestShardedTensorFromLocalShards(ShardedTensorTestBase): + @with_comms + @skip_if_lt_x_gpu(4) + @requires_nccl() ++ @skip("Times out often") + def test_init_from_local_shards(self): + local_shard_metadata = ShardMetadata( + shard_offsets=[(self.rank // 2) * 5, (self.rank % 2) * 5], diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.1_skip-test_sibling_fusion.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.1_skip-test_sibling_fusion.patch new file mode 100644 index 000000000000..6845706f4848 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.1_skip-test_sibling_fusion.patch @@ -0,0 +1,17 @@ +The test fails on pre-volta devices as fusing the at::sum/at::add +is skipped if the device is not of compute capability >= 7 + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/test_jit_cuda_fuser.py b/test/test_jit_cuda_fuser.py +index c03ff0b311..feb6e281f9 100644 +--- a/test/test_jit_cuda_fuser.py ++++ b/test/test_jit_cuda_fuser.py +@@ -3053,6 +3054,7 @@ class TestCudaFuser(JitTestCase): + self.assertGraphContainsExactly(graph, op, 0) + + @unittest.skipIf(not RUN_CUDA, "requires CUDA") ++ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_sibling_fusion(self):