diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0_fix-XNNPACK-tests.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0_fix-XNNPACK-tests.patch
new file mode 100644
index 000000000000..c9a82a56164b
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0_fix-XNNPACK-tests.patch
@@ -0,0 +1,236 @@
+When XNNPACK is disabled tests run into assertion failures.
+Fix by using (most of) https://github.com/pytorch/pytorch/pull/72642
+
+Author: Digant Desai <digantdesai@fb.com>
+
+Backported by Alexander Grund (TU Dresden)
+
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index 0c11507838..5b74b7f63c 100644
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -275,7 +275,14 @@ option(USE_LITE_INTERPRETER_PROFILER "Enable " ON)
+ option(USE_VULKAN_FP16_INFERENCE "Vulkan - Use fp16 inference" OFF)
+ option(USE_VULKAN_RELAXED_PRECISION "Vulkan - Use relaxed precision math in the kernels (mediump)" OFF)
+ option(USE_VULKAN_SHADERC_RUNTIME "Vulkan - Use runtime shader compilation as opposed to build-time (needs libshaderc)" OFF)
+-option(USE_XNNPACK "Use XNNPACK" ON)
++# option USE_XNNPACK: try to enable xnnpack by default.
++set(XNNPACK_MIN_CMAKE_VER 3.12)
++cmake_dependent_option(
++    USE_XNNPACK "Use XNNPACK. Requires cmake >= ${XNNPACK_MIN_CMAKE_VER}." ON
++    "CMAKE_VERSION VERSION_GREATER_EQUAL ${XNNPACK_MIN_CMAKE_VER}" OFF)
++if(NOT USE_XNNPACK AND CMAKE_VERSION VERSION_LESS ${XNNPACK_MIN_CMAKE_VER})
++  message(WARNING "USE_XNNPACK is set to OFF. XNNPACK requires CMake version ${XNNPACK_MIN_CMAKE_VER} or greater.")
++endif()
+ option(USE_ZMQ "Use ZMQ" OFF)
+ option(USE_ZSTD "Use ZSTD" OFF)
+ # Ensure that an MKLDNN build is the default for x86 CPUs
+diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake
+index 6d021536c1..7c1ac64539 100644
+--- a/cmake/Summary.cmake
++++ b/cmake/Summary.cmake
+@@ -165,6 +165,7 @@ function(caffe2_print_configuration_summary)
+   message(STATUS "  USE_PROF              : ${USE_PROF}")
+   message(STATUS "  USE_QNNPACK           : ${USE_QNNPACK}")
+   message(STATUS "  USE_PYTORCH_QNNPACK   : ${USE_PYTORCH_QNNPACK}")
++  message(STATUS "  USE_XNNPACK           : ${USE_XNNPACK}")
+   message(STATUS "  USE_REDIS             : ${USE_REDIS}")
+   message(STATUS "  USE_ROCKSDB           : ${USE_ROCKSDB}")
+   message(STATUS "  USE_ZMQ               : ${USE_ZMQ}")
+diff --git a/test/jit/test_optimize_for_mobile_preserve_debug_info.py b/test/jit/test_optimize_for_mobile_preserve_debug_info.py
+index c08f3b5838..1b93f54c15 100644
+--- a/test/jit/test_optimize_for_mobile_preserve_debug_info.py
++++ b/test/jit/test_optimize_for_mobile_preserve_debug_info.py
+@@ -1,8 +1,8 @@
+ import torch
+ import torch._C
+-import torch.backends.xnnpack
+ import torch.nn.functional as F
+ from torch.testing._internal.jit_utils import JitTestCase
++from torch.testing._internal.common_utils import skipIfNoXNNPACK
+ 
+ class TestOptimizeForMobilePreserveDebugInfo(JitTestCase):
+     def check_replacement(
+@@ -34,6 +34,7 @@ class TestOptimizeForMobilePreserveDebugInfo(JitTestCase):
+                     original_source_ranges[replacements[node.kind()]],
+                 )
+ 
++    @skipIfNoXNNPACK
+     def test_replace_conv1d_with_conv2d(self):
+         class TestConv1d(torch.nn.Module):
+             def __init__(self, weight, bias):
+@@ -61,6 +62,7 @@ class TestOptimizeForMobilePreserveDebugInfo(JitTestCase):
+             jit_pass=torch._C._jit_pass_transform_conv1d_to_conv2d,
+         )
+ 
++    @skipIfNoXNNPACK
+     def test_insert_pre_packed_linear_before_inline_and_conv_2d_op(self):
+         class TestPrepackedLinearBeforeInlineAndConv2dOp(torch.nn.Module):
+             def __init__(
+@@ -137,6 +139,7 @@ class TestOptimizeForMobilePreserveDebugInfo(JitTestCase):
+             jit_pass=torch._C._jit_pass_insert_prepacked_ops,
+         )
+ 
++    @skipIfNoXNNPACK
+     def test_insert_pre_packed_linear_op(self):
+         self.check_replacement(
+             model=torch.jit.trace(torch.nn.Linear(5, 4), torch.rand(3, 2, 5)),
+@@ -228,6 +231,7 @@ class TestOptimizeForMobilePreserveDebugInfo(JitTestCase):
+             jit_pass=torch._C._jit_pass_fuse_clamp_w_prepacked_linear_conv,
+         )
+ 
++    @skipIfNoXNNPACK
+     def test_fuse_activation_with_pack_ops_linear_conv2d_1(self):
+         self.run_test_fuse_activation_with_pack_ops_linear_conv2d(
+             linear_activation=F.hardtanh,
+@@ -236,6 +240,7 @@ class TestOptimizeForMobilePreserveDebugInfo(JitTestCase):
+             conv2d_activation_kind="aten::hardtanh_"
+         )
+ 
++    @skipIfNoXNNPACK
+     def test_fuse_activation_with_pack_ops_linear_conv2d_2(self):
+         self.run_test_fuse_activation_with_pack_ops_linear_conv2d(
+             linear_activation=F.hardtanh_,
+@@ -244,6 +249,7 @@ class TestOptimizeForMobilePreserveDebugInfo(JitTestCase):
+             conv2d_activation_kind="aten::hardtanh"
+         )
+ 
++    @skipIfNoXNNPACK
+     def test_fuse_activation_with_pack_ops_linear_conv2d_3(self):
+         self.run_test_fuse_activation_with_pack_ops_linear_conv2d(
+             linear_activation=F.relu,
+@@ -252,6 +258,7 @@ class TestOptimizeForMobilePreserveDebugInfo(JitTestCase):
+             conv2d_activation_kind="aten::relu_"
+         )
+ 
++    @skipIfNoXNNPACK
+     def test_fuse_activation_with_pack_ops_linear_conv2d_4(self):
+         self.run_test_fuse_activation_with_pack_ops_linear_conv2d(
+             linear_activation=F.relu_,
+diff --git a/test/test_mobile_optimizer.py b/test/test_mobile_optimizer.py
+index 19f07e2454..21ae5d3ee9 100644
+--- a/test/test_mobile_optimizer.py
++++ b/test/test_mobile_optimizer.py
+@@ -1,9 +1,8 @@
+ import unittest
+ import torch
+ import torch.nn as nn
+-import torch.backends.xnnpack
+ import torch.utils.bundled_inputs
+-from torch.testing._internal.common_utils import TestCase, run_tests
++from torch.testing._internal.common_utils import TestCase, run_tests, skipIfNoXNNPACK
+ from torch.testing._internal.jit_utils import get_forward, get_forward_graph
+ from torch.utils.mobile_optimizer import (LintCode,
+                                           generate_mobile_module_lints,
+@@ -22,9 +21,7 @@ FileCheck = torch._C.FileCheck
+ 
+ class TestOptimizer(TestCase):
+ 
+-    @unittest.skipUnless(torch.backends.xnnpack.enabled,
+-                         " XNNPACK must be enabled for these tests."
+-                         " Please build with USE_XNNPACK=1.")
++    @skipIfNoXNNPACK
+     def test_optimize_for_mobile(self):
+         batch_size = 2
+         input_channels_per_group = 6
+@@ -263,9 +260,7 @@ class TestOptimizer(TestCase):
+             rtol=1e-2,
+             atol=1e-3)
+ 
+-    @unittest.skipUnless(torch.backends.xnnpack.enabled,
+-                         " XNNPACK must be enabled for these tests."
+-                         " Please build with USE_XNNPACK=1.")
++    @skipIfNoXNNPACK
+     def test_quantized_conv_no_asan_failures(self):
+         # There were ASAN failures when fold_conv_bn was run on
+         # already quantized conv modules. Verifying that this does
+@@ -359,6 +354,7 @@ class TestOptimizer(TestCase):
+         bi_module_lint_list = generate_mobile_module_lints(bi_module)
+         self.assertEqual(len(bi_module_lint_list), 0)
+ 
++    @skipIfNoXNNPACK
+     def test_preserve_bundled_inputs_methods(self):
+         class MyBundledInputModule(torch.nn.Module):
+             def __init__(self):
+@@ -413,9 +409,7 @@ class TestOptimizer(TestCase):
+         incomplete_bi_module_optim = optimize_for_mobile(incomplete_bi_module, preserved_methods=['get_all_bundled_inputs'])
+         self.assertTrue(hasattr(incomplete_bi_module_optim, 'get_all_bundled_inputs'))
+ 
+-    @unittest.skipUnless(torch.backends.xnnpack.enabled,
+-                         " XNNPACK must be enabled for these tests."
+-                         " Please build with USE_XNNPACK=1.")
++    @skipIfNoXNNPACK
+     def test_hoist_conv_packed_params(self):
+ 
+         if 'qnnpack' not in torch.backends.quantized.supported_engines:
+@@ -509,6 +503,7 @@ class TestOptimizer(TestCase):
+             m_optim_res = m_optim(data)
+             torch.testing.assert_close(m_res, m_optim_res, rtol=1e-2, atol=1e-3)
+ 
++    @skipIfNoXNNPACK
+     @unittest.skipUnless(HAS_TORCHVISION, "Needs torchvision")
+     def test_mobilenet_optimize_for_mobile(self):
+         m = torchvision.models.mobilenet_v3_small()
+diff --git a/test/test_model_dump.py b/test/test_model_dump.py
+index 417bb2a91a..e4a9ffa1bd 100644
+--- a/test/test_model_dump.py
++++ b/test/test_model_dump.py
+@@ -8,9 +8,10 @@ import urllib
+ import unittest
+ 
+ import torch
++import torch.backends.xnnpack
+ import torch.utils.model_dump
+ import torch.utils.mobile_optimizer
+-from torch.testing._internal.common_utils import TestCase, run_tests, IS_WINDOWS
++from torch.testing._internal.common_utils import TestCase, run_tests, IS_WINDOWS, skipIfNoXNNPACK
+ from torch.testing._internal.common_quantized import supported_qengines
+ 
+ 
+@@ -170,6 +171,7 @@ class TestModelDump(TestCase):
+         qmodel = self.get_quant_model()
+         self.do_dump_model(torch.jit.script(qmodel))
+ 
++    @skipIfNoXNNPACK
+     @unittest.skipUnless("qnnpack" in supported_qengines, "QNNPACK not available")
+     def test_optimized_quantized_model(self):
+         qmodel = self.get_quant_model()
+diff --git a/test/test_xnnpack_integration.py b/test/test_xnnpack_integration.py
+index a0f8328ec6..a737ed4b8b 100644
+--- a/test/test_xnnpack_integration.py
++++ b/test/test_xnnpack_integration.py
+@@ -51,7 +51,6 @@ class TestXNNPACKOps(TestCase):
+         output_linearprepacked = torch.ops.prepacked.linear_clamp_run(input_data, packed_weight_bias)
+         torch.testing.assert_close(ref_result, output_linearprepacked, rtol=1e-2, atol=1e-3)
+ 
+-
+     @given(batch_size=st.integers(0, 3),
+            input_channels_per_group=st.integers(1, 32),
+            height=st.integers(5, 64),
+diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
+index b4ef79620d..18d8925d6a 100644
+--- a/torch/testing/_internal/common_utils.py
++++ b/torch/testing/_internal/common_utils.py
+@@ -56,6 +56,7 @@ from torch._six import string_classes
+ from torch import Tensor
+ import torch.backends.cudnn
+ import torch.backends.mkl
++import torch.backends.xnnpack
+ from enum import Enum
+ 
+ torch.backends.disable_global_flags()
+@@ -908,6 +909,14 @@ def _test_function(fn, device):
+         return fn(self, device)
+     return run_test_function
+ 
++def skipIfNoXNNPACK(fn):
++    @wraps(fn)
++    def wrapper(*args, **kwargs):
++        if not torch.backends.xnnpack.enabled:
++            raise unittest.SkipTest('XNNPACK must be enabled for these tests. Please build with USE_XNNPACK=1.')
++        else:
++            fn(*args, **kwargs)
++    return wrapper
+ 
+ def skipIfNoLapack(fn):
+     @wraps(fn)
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0_fix-kineto-crash.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0_fix-kineto-crash.patch
new file mode 100644
index 000000000000..7858aa1f8cf8
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0_fix-kineto-crash.patch
@@ -0,0 +1,24 @@
+Fix a crash during application shutdown visible in test_profiler on some machines.
+See https://github.com/pytorch/kineto/pull/642
+
+Author: Alexander Grund (TU Dresden)
+
+diff -aur a/third_party/kineto/libkineto/src/EventProfilerController.cpp b/third_party/kineto/libkineto/src/EventProfilerController.cpp
+--- a/third_party/kineto/libkineto/src/EventProfilerController.cpp	2022-08-05 13:10:46.175716618 +0200
++++ b/third_party/kineto/libkineto/src/EventProfilerController.cpp	2022-08-05 13:16:00.654118490 +0200
+@@ -233,9 +233,14 @@ EventProfilerController::~EventProfilerController() {
+ 
+ // Must be called under lock
+ void EventProfilerController::start(CUcontext ctx, ConfigLoader& configLoader) {
+-  profilerMap()[ctx] = unique_ptr<EventProfilerController>(
++  // Avoid static initialization order fiasco:
++  // We need the profilerMap and with it all controllers to be destroyed
++  // before everything the controller accesses gets destroyed.
++  // Hence access the profilerMap after initialization of the controller.
++  auto controller = unique_ptr<EventProfilerController>(
+       new EventProfilerController(
+           ctx, configLoader, detail::HeartbeatMonitor::instance()));
++  profilerMap()[ctx] = std::move(controller);
+ }
+ 
+ // Must be called under lock
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0_fix-test-model_dump.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0_fix-test-model_dump.patch
new file mode 100644
index 000000000000..1891c93e93ff
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0_fix-test-model_dump.patch
@@ -0,0 +1,36 @@
+From https://github.com/pytorch/pytorch/pull/84744
+
+Author: Alexander Grund (TU Dresden)
+
+From 4d99f72dcf71bffa7ff4275750cb86d872ac653f Mon Sep 17 00:00:00 2001
+From: Alexander Grund <alexander.grund@tu-dresden.de>
+Date: Fri, 9 Sep 2022 11:10:05 +0200
+Subject: [PATCH] Fix failing test_model_dump due to empty file
+
+The `torch.jit.save` call on a file object may not actually write the
+data to disk due to buffering. The call to `model_dump.main` on that
+file will when fail with an error like
+
+> zipfile.BadZipFile: File is not a zip file
+
+Inspecting the file confirms that it is either empty (usually) or
+incomplete (possible).
+
+Fix this by flushing the file after saving the model.
+---
+ test/test_model_dump.py | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/test/test_model_dump.py b/test/test_model_dump.py
+index a8add0e2cd923..3c682b6ce6805 100644
+--- a/test/test_model_dump.py
++++ b/test/test_model_dump.py
+@@ -131,6 +131,8 @@ def test_main(self):
+ 
+         with tempfile.NamedTemporaryFile() as tf:
+             torch.jit.save(torch.jit.script(SimpleModel()), tf)
++            # Actually write contents to disk so we can read it below
++            tf.flush()
+ 
+             stdout = io.StringIO()
+             torch.utils.model_dump.main(
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0_fix-vsx-vector-functions.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0_fix-vsx-vector-functions.patch
new file mode 100644
index 000000000000..7d35ac8d1344
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0_fix-vsx-vector-functions.patch
@@ -0,0 +1,39 @@
+This fixes the remaining bug introduced by the VSX optimized code in https://github.com/pytorch/pytorch/pull/41541
+
+See https://github.com/pytorch/pytorch/pull/59382
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h
+index 2427276bce..2b46e0a662 100644
+--- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h
++++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h
+@@ -558,27 +558,7 @@ class Vectorized<float> {
+   }
+ 
+   Vectorized<float> C10_ALWAYS_INLINE pow(const Vectorized<float>& exp) const {
+-    auto x = *this;
+-    auto sign_bit = (*this) & sign_mask;
+-    // |b|
+-    auto exp_abs = exp.abs();
+-    auto exp_trunc = exp.trunc();
+-    Vectorized<float> odd_mask;
+-    odd_mask._vecb0 = (vec_signed(exp._vec0) & vi_1) != vi_0;
+-    odd_mask._vecb1 = (vec_signed(exp._vec1) & vi_1) != vi_0;
+-    // using ln fuction
+-    auto temp = (abs().log() * exp).exp();
+-
+-    // is odd or even check from Sleef
+-    auto is_int = (exp == exp_trunc) | (exp_abs >= vcheck);
+-    auto is_odd = odd_mask & is_int & (exp_abs < vcheck);
+-    // if even then then pow result should be absolute
+-    auto temp_sign = temp | sign_bit; // copy_sign
+-    auto out = blendv(temp, temp_sign, is_odd);
+-    // x<0 and y != N, then NAN
+-    auto out1 = blendv(out, v_nan, ((exp.floor() != exp) & (x < zero)));
+-    // y = 0 then 1
+-    return blendv(out1, one, (exp_abs == zero));
++    return {Sleef_powf4_u10vsx(_vec0, exp._vec0), Sleef_powf4_u10vsx(_vec1, exp._vec1)};
+   }
+ 
+   Vectorized<float> fmod(const Vectorized<float>& b) const {
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0_skip-nnapi-test-without-qnnpack.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0_skip-nnapi-test-without-qnnpack.patch
new file mode 100644
index 000000000000..ce5acf4a8844
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.10.0_skip-nnapi-test-without-qnnpack.patch
@@ -0,0 +1,30 @@
+When QNNPACK is not available the test will fail trying to use it.
+Skip them conditionally.
+See https://github.com/pytorch/pytorch/pull/82882
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/test/test_nnapi.py b/test/test_nnapi.py
+index f8db7e1a3d..1d4fd52e28 100644
+--- a/test/test_nnapi.py
++++ b/test/test_nnapi.py
+@@ -2,8 +2,10 @@
+ import os
+ import ctypes
+ import torch
++import unittest
+ from typing import Tuple
+ from torch.backends._nnapi.prepare import convert_model_to_nnapi
++from torch.testing._internal.common_quantized import supported_qengines
+ from torch.testing._internal.common_utils import TestCase, run_tests
+ 
+ 
+@@ -18,6 +20,8 @@ def nhwc(t):
+     return t
+ 
+ 
++@unittest.skipUnless('qnnpack' in supported_qengines,
++                     "This Pytorch Build has not been built with or does not support QNNPACK")
+ class TestNNAPI(TestCase):
+ 
+     def setUp(self):
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.0-foss-2021a-CUDA-11.3.1.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.0-foss-2021a-CUDA-11.3.1.eb
index 5aba12b39575..8385d91b7f0a 100644
--- a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.0-foss-2021a-CUDA-11.3.1.eb
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.0-foss-2021a-CUDA-11.3.1.eb
@@ -8,51 +8,75 @@ PyTorch is a deep learning framework that puts Python first."""
 
 toolchain = {'name': 'foss', 'version': '2021a'}
 
-sources = [{
-    'filename': '%(name)s-%(version)s.tar.gz',
-    'git_config': {
-        'url': 'https://github.com/pytorch',
-        'repo_name': 'pytorch',
-        'tag': 'v%(version)s',
-        'recursive': True,
-    },
-}]
+source_urls = ['https://github.com/%(github_account)s/%(namelower)s/releases/download/v%(version)s']
+sources = ['%(namelower)s-v%(version)s.tar.gz']
+
 patches = [
     'PyTorch-1.7.0_avoid-nan-in-test-torch.patch',
     'PyTorch-1.7.0_disable-dev-shm-test.patch',
     'PyTorch-1.8.1_dont-use-gpu-ccc-in-test.patch',
-    'PyTorch-1.9.0_limit-world-size-for-zero-redundancy-opt-test.patch',
+    'PyTorch-1.10.0_fix-kineto-crash.patch',
     'PyTorch-1.10.0_fix-test-dataloader-fixed-affinity.patch',
+    'PyTorch-1.10.0_fix-test-model_dump.patch',
+    'PyTorch-1.10.0_fix-vsx-vector-functions.patch',
+    'PyTorch-1.10.0_fix-XNNPACK-tests.patch',
     'PyTorch-1.10.0_skip_cmake_rpath.patch',
-    'PyTorch-1.11.0_increase-distributed-test-timeout.patch',
-    'PyTorch-1.11.0_skip_failing_ops_tests.patch',
-    'PyTorch-1.11.0_fix_skip_jit_cuda_fuser.patch',
+    'PyTorch-1.10.0_skip-nnapi-test-without-qnnpack.patch',
+    'PyTorch-1.11.0_disable_failing_jit_cuda_fuser_tests.patch',
+    'PyTorch-1.11.0_fix-attention_cpp-compilation.patch',
+    'PyTorch-1.11.0_fix-fsdp-fp16-test.patch',
     'PyTorch-1.11.0_fix_sharded_imports.patch',
-    'PyTorch-1.11.0_increase_test_tolerances_TF32.patch',
+    'PyTorch-1.11.0_fix_skip_jit_cuda_fuser.patch',
+    'PyTorch-1.11.0_fix-test_utils.patch',
     'PyTorch-1.11.0_increase_c10d_gloo_timeout.patch',
-    'PyTorch-1.11.0_disable_failing_jit_cuda_fuser_tests.patch',
+    'PyTorch-1.11.0_increase-distributed-test-timeout.patch',
+    'PyTorch-1.11.0_increase_test_tolerances_TF32.patch',
+    'PyTorch-1.11.0_increase-tolerance-test_ops.patch',
+    'PyTorch-1.11.0_install-vsx-vec-headers.patch',
+    'PyTorch-1.11.0_skip_failing_ops_tests.patch',
+    'PyTorch-1.11.1_skip-test_init_from_local_shards.patch',
+    'PyTorch-1.11.1_skip-test_sibling_fusion.patch',
 ]
 checksums = [
-    None,  # can't add proper SHA256 checksum, because source tarball is created locally after recursive 'git clone'
+    'dc0c2b8d13c112a2b9ea8757a475b0ce2ca97cd19c50a8b70b8c286676616f1d',  # pytorch-v1.11.0.tar.gz
     'b899aa94d9e60f11ee75a706563312ccefa9cf432756c470caa8e623991c8f18',  # PyTorch-1.7.0_avoid-nan-in-test-torch.patch
     '622cb1eaeadc06e13128a862d9946bcc1f1edd3d02b259c56a9aecc4d5406b8a',  # PyTorch-1.7.0_disable-dev-shm-test.patch
     '89ac7a8e9e7df2e64cf8404fe3a279f5e9b759fee41c9de3aaff9c22f385c2c6',  # PyTorch-1.8.1_dont-use-gpu-ccc-in-test.patch
-    # PyTorch-1.9.0_limit-world-size-for-zero-redundancy-opt-test.patch
-    'ff573660913ce055e24cfd194ce747ba5685091c631cfd443eae2a99d56b57ea',
+    # PyTorch-1.10.0_fix-kineto-crash.patch
+    'dc467333b28162149af8f675929d8c6bf219f23230bfc0d39af02ba4f6f882eb',
     # PyTorch-1.10.0_fix-test-dataloader-fixed-affinity.patch
     '313dca681f45ce3bc7c4557fdcdcbe0b77216d2c708fa30a2ec0e22c44876707',
+    # PyTorch-1.10.0_fix-test-model_dump.patch
+    '339148ae1a028cda6e750ac93fa38a599f66c7abe26586c9219f1a206ea14557',
+    # PyTorch-1.10.0_fix-vsx-vector-functions.patch
+    '7bef5f96cb83b2d655d2f76dd7468a171d446f0b3e06da2232ec7f886484d312',
+    # PyTorch-1.10.0_fix-XNNPACK-tests.patch
+    'd3e749a2a42efce463e3b8a1aebb21f0edf2256682c4417297d9a44a6210e5f8',
     'ac05943bb205623f91ef140aa00869efc5fe844184bd666bebf5405808610448',  # PyTorch-1.10.0_skip_cmake_rpath.patch
+    # PyTorch-1.10.0_skip-nnapi-test-without-qnnpack.patch
+    '34ba476a7bcddec323bf9eca083cb4623d0f569d081aa3add3769c24f22849d2',
+    # PyTorch-1.11.0_disable_failing_jit_cuda_fuser_tests.patch
+    'e7bfe120a8b3fe2b40dac6839852a5fbab3cb3429fbe44a0fc3a1800adaaee51',
+    # PyTorch-1.11.0_fix-attention_cpp-compilation.patch
+    '84214fcc7e30cf70659a7c3bd70bf11e73d58fd4f7fff2c233e3225619b0e42c',
+    'bb1c4e6d6fd4b0cf57ff8b824c797331b533bb1ffc63f5db0bae3aee10c3dc13',  # PyTorch-1.11.0_fix-fsdp-fp16-test.patch
+    '9a04f4285b800dad8a00c3014af0a9713d40d5dd35d10931c7c0da4e89c558e9',  # PyTorch-1.11.0_fix_sharded_imports.patch
+    '21fc678febcdfbb9dabd72235be23cd392044e9a954f6580d15b530e1f69dcc1',  # PyTorch-1.11.0_fix_skip_jit_cuda_fuser.patch
+    '4f7e25c4e2eb7094f92607df74488c6a4a35849fabf05fcf6c3655fa3f44a861',  # PyTorch-1.11.0_fix-test_utils.patch
+    # PyTorch-1.11.0_increase_c10d_gloo_timeout.patch
+    '20cd4a8663f74ab326fdb032b926bf5c7e94d9750c515ab9050927ba00cf1953',
     # PyTorch-1.11.0_increase-distributed-test-timeout.patch
     '087ad20163a1291773ae3457569b80523080eb3731e210946459b2333a919f3f',
-    '8eaca92d64fcadb0552d28e9c7ea5c4bc669d2fe33004e45a3519ce8d0d136a2',  # PyTorch-1.11.0_skip_failing_ops_tests.patch
-    '21fc678febcdfbb9dabd72235be23cd392044e9a954f6580d15b530e1f69dcc1',  # PyTorch-1.11.0_fix_skip_jit_cuda_fuser.patch
-    '9a04f4285b800dad8a00c3014af0a9713d40d5dd35d10931c7c0da4e89c558e9',  # PyTorch-1.11.0_fix_sharded_imports.patch
     # PyTorch-1.11.0_increase_test_tolerances_TF32.patch
     '26e179a4f6f57e49209092612ae5f5cd8c03fd2ca84566ba0244eabefc3736ba',
-    # PyTorch-1.11.0_increase_c10d_gloo_timeout.patch
-    '20cd4a8663f74ab326fdb032b926bf5c7e94d9750c515ab9050927ba00cf1953',
-    # PyTorch-1.11.0_disable_failing_jit_cuda_fuser_tests.patch
-    'e7bfe120a8b3fe2b40dac6839852a5fbab3cb3429fbe44a0fc3a1800adaaee51',
+    # PyTorch-1.11.0_increase-tolerance-test_ops.patch
+    'ceec745c68a405bba79efb4dc61c662ca84eb950cd0163c7104330f4bf614cf5',
+    'f2e6b9625733d9a471bb75e1ea20e28814cf1380b4f9089aa838ee35ddecf07d',  # PyTorch-1.11.0_install-vsx-vec-headers.patch
+    '8eaca92d64fcadb0552d28e9c7ea5c4bc669d2fe33004e45a3519ce8d0d136a2',  # PyTorch-1.11.0_skip_failing_ops_tests.patch
+    # PyTorch-1.11.1_skip-test_init_from_local_shards.patch
+    '4aeb1b0bc863d4801b0095cbce69f8794066748f0df27c6aaaf729c5ecba04b7',
+    # PyTorch-1.11.1_skip-test_sibling_fusion.patch
+    '3d6f6395d98e8e4ad76b0b63c625fddf082cf7f066eb97d4d82401f96dab2555',
 ]
 
 osdependencies = [OS_PKG_IBVERBS_DEV]
@@ -93,16 +117,8 @@ excluded_tests = {
         # Bad tests: https://github.com/pytorch/pytorch/issues/60260
         'distributed/elastic/utils/distributed_test',
         'distributed/elastic/multiprocessing/api_test',
-        # These tests fail on A10s at the very least, they time out forever no matter how long the timeout is.
-        # Possibly related to NCCL 2.8.3: https://docs.nvidia.com/deeplearning/nccl/release-notes/rel_2-8-3.html
-        # 'distributed/test_distributed_fork',
-        'distributed/test_distributed_spawn',
         # Fails on A10s: https://github.com/pytorch/pytorch/issues/63079
         'test_optim',
-        # Test from this suite timeout often. The process group backend is deprecated anyway
-        # 'distributed/rpc/test_process_group_agent',
-        # This test fails constently when run as part of the test suite, but succeeds when run interactively
-        'test_model_dump',
         # These tests appear flaky, possibly related to number of GPUs that are used
         'distributed/fsdp/test_fsdp_memory',
         'distributed/fsdp/test_fsdp_overlap',
@@ -112,10 +128,10 @@ excluded_tests = {
 runtest = 'cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py --continue-through-error  --verbose %(excluded_tests)s'
 
 # several tests are known to be flaky, and fail in some contexts (like having multiple GPUs available),
-# so we allow up to 10 (out of ~90k) tests to fail before treating the installation to be faulty
-max_failed_tests = 10
+# so we allow some (out of ~90k) tests to fail before treating the installation as faulty
+max_failed_tests = 1
 
-# The readelf sanity check command can be taken out once the TestRPATH test from 
+# The readelf sanity check command can be taken out once the TestRPATH test from
 # https://github.com/pytorch/pytorch/pull/68912 is accepted, since it is then checked as part of the PyTorch test suite
 local_libcaffe2 = "$EBROOTPYTORCH/lib/python%%(pyshortver)s/site-packages/torch/lib/libcaffe2_nvrtc.%s" % SHLIB_EXT
 sanity_check_commands = [
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.0_fix-attention_cpp-compilation.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.0_fix-attention_cpp-compilation.patch
new file mode 100644
index 000000000000..751e954f75fb
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.0_fix-attention_cpp-compilation.patch
@@ -0,0 +1,61 @@
+Fix failure to compile attention.cpp on some platforms, e.g. POWER.
+I.e. those 3 errors:
+
+../aten/src/ATen/native/attention.cpp:145:46: error: call to deleted function 'double& at::vec::CPU_CAPABILITY::Vectorized<double>::operator[](int)'
+  145 |                   hmax = std::max(max_input[i], hmax);
+[snip]
+../aten/src/ATen/cpu/vec/vec256/vsx/vec256_double_vsx.h:190:11: note: declared here
+  190 |   double& operator[](int idx) = delete;
+
+../aten/src/ATen/native/attention.cpp:152:66: error: call to deleted function 'double& at::vec::CPU_CAPABILITY::Vectorized<double>::operator[](int)'
+  152 |                     hsum += std::exp(static_cast<accscalar_t>(v[i]) - hmax);
+
+../aten/src/ATen/native/attention.cpp:168:100: error: call to deleted function 'double& at::vec::CPU_CAPABILITY::Vectorized<double>::operator[](int)'
+  168 |                     input_data[t + i] = static_cast<scalar_t>(std::exp(static_cast<accscalar_t>(v[i]) - hmax) * inv_denominator);
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/aten/src/ATen/native/attention.cpp b/aten/src/ATen/native/attention.cpp
+index 188de5e2cd..7f3e8e5af2 100644
+--- a/aten/src/ATen/native/attention.cpp
++++ b/aten/src/ATen/native/attention.cpp
+@@ -140,22 +140,18 @@ void masked_softmax_dropout(
+                   max_input = vec::maximum(max_input, v);
+                 }
+ 
++                __at_align__ scalar_t arr_max_input[V];
++                max_input.store(arr_max_input);
+                 auto hmax = std::numeric_limits<scalar_t>::lowest();
+                 for (auto i = 0; i < V; ++i) {
+-                  hmax = std::max(max_input[i], hmax);
++                  hmax = std::max(arr_max_input[i], hmax);
+                 }
+                 accscalar_t hsum = 0;
+-                for (auto t = 0; t < T; t += V) {
+-                  auto v = Vec::loadu(&input_data[t]);
+-                  // TODO: vectorize in accscalar_t?
+-                  for (auto i = 0; i < V; ++i) {
+-                    hsum += std::exp(static_cast<accscalar_t>(v[i]) - hmax);
+-                  }
++                for (auto t = 0; t < T; ++t) {
++                  hsum += std::exp(static_cast<accscalar_t>(input_data[t]) - hmax);
+                 }
+                 auto inv_denominator = 1.0 / hsum;
+-                for (auto t = 0; t < T; t += V) {
+-                  Vec v = Vec::loadu(&input_data[t]);
+-
++                for (auto t = 0; t < T; ++t) {
+                   // TODO: vectorize in accscalar_t?
+                   // TODO this faster solution does not work on Android build
+                   /*
+@@ -164,9 +160,7 @@ void masked_softmax_dropout(
+                   }
+                   v.store(&input_data[t]);
+                   */
+-                  for (auto i = 0; i < V; ++i) {
+-                    input_data[t + i] = static_cast<scalar_t>(std::exp(static_cast<accscalar_t>(v[i]) - hmax) * inv_denominator);
+-                  }
++                  input_data[t] = static_cast<scalar_t>(std::exp(static_cast<accscalar_t>(input_data[t]) - hmax) * inv_denominator);
+                 }
+               }
+             });
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.0_fix-fsdp-fp16-test.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.0_fix-fsdp-fp16-test.patch
new file mode 100644
index 000000000000..dc9159e5cf92
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.0_fix-fsdp-fp16-test.patch
@@ -0,0 +1,35 @@
+The test fails on a node with more than 5 V100 GPUs or more than 4 A100 GPUs.
+Hence limit the world_size to 4
+See https://github.com/pytorch/pytorch/issues/78975
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/torch/testing/_internal/common_fsdp.py b/torch/testing/_internal/common_fsdp.py
+index ff0fbb69e3..6bf3e71fde 100644
+--- a/torch/testing/_internal/common_fsdp.py
++++ b/torch/testing/_internal/common_fsdp.py
+@@ -324,7 +324,7 @@ class FSDPTest(MultiProcessTestCase):
+ 
+     @property
+     def world_size(self):
+-        return torch.cuda.device_count() if torch.cuda.is_available() else 4
++        return min(4, torch.cuda.device_count()) if torch.cuda.is_available() else 4
+ 
+     @property
+     def init_method(self):
+diff --git a/test/distributed/fsdp/test_fsdp_pure_fp16.py b/test/distributed/fsdp/test_fsdp_pure_fp16.py
+index eea03bea3d..d3a4bb8257 100644
+--- a/test/distributed/fsdp/test_fsdp_pure_fp16.py
++++ b/test/distributed/fsdp/test_fsdp_pure_fp16.py
+@@ -32,6 +32,11 @@ if TEST_WITH_DEV_DBG_ASAN:
+ 
+ class TestPureFP16(FSDPTest):
+ 
++    @property
++    def world_size(self):
++        # Test fails due to inaccuracies when using more than 4 GPUs
++        return min(4, super().world_size)
++
+     @skip_if_lt_x_gpu(2)
+     @parametrize(
+         "cpu_offload",
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.0_fix-test_utils.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.0_fix-test_utils.patch
new file mode 100644
index 000000000000..a42166c927c1
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.0_fix-test_utils.patch
@@ -0,0 +1,20 @@
+The function tested is dependent on the current locale and may fail on non-English systems.
+See https://github.com/pytorch/pytorch/pull/85891
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py
+index 9c6635f111..d99aae1ca2 100644
+--- a/torch/utils/cpp_extension.py
++++ b/torch/utils/cpp_extension.py
+@@ -292,7 +292,9 @@ def check_compiler_ok_for_platform(compiler: str) -> bool:
+     if any(name in compiler_path for name in _accepted_compilers_for_platform()):
+         return True
+     # If compiler wrapper is used try to infer the actual compiler by invoking it with -v flag
+-    version_string = subprocess.check_output([compiler, '-v'], stderr=subprocess.STDOUT).decode(*SUBPROCESS_DECODE_ARGS)
++    env = os.environ.copy()
++    env['LC_ALL'] = 'C'  # Don't localize output
++    version_string = subprocess.check_output([compiler, '-v'], stderr=subprocess.STDOUT, env=env).decode(*SUBPROCESS_DECODE_ARGS)
+     if IS_LINUX:
+         # Check for 'gcc' or 'g++' for sccache warpper
+         pattern = re.compile("^COLLECT_GCC=(.*)$", re.MULTILINE)
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.0_increase-tolerance-test_ops.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.0_increase-tolerance-test_ops.patch
new file mode 100644
index 000000000000..69d58f7a3349
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.0_increase-tolerance-test_ops.patch
@@ -0,0 +1,22 @@
+`test_out` may fail due to slightly different values caused by a different order of matrices in SGEMM:
+
+> Mismatched elements: 1 / 50 (2.0%)
+> Greatest absolute difference: 1.430511474609375e-05 at index (4, 5) (up to 1e-05 allowed)
+> Greatest relative difference: 4.65393206065873e-06 at index (4, 5) (up to 1.3e-06 allowed)
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/test/test_ops.py b/test/test_ops.py
+index 291361d53d..2e0597376d 100644
+--- a/test/test_ops.py
++++ b/test/test_ops.py
+@@ -418,6 +418,9 @@ class TestCommon(TestCase):
+             self.skipTest("Skipped! Op has not supported dtypes on this device.")
+         dtype = torch.float32 if torch.float32 in supported_dtypes else list(supported_dtypes)[0]
+ 
++        if dtype is torch.float32:
++            self.precision, self.rel_tol = (1.5e-05, 1e-05)
++
+         # NOTE: only tests on first sample
+         samples = op.sample_inputs(device, dtype)
+         sample = first_sample(self, samples)
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.0_install-vsx-vec-headers.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.0_install-vsx-vec-headers.patch
new file mode 100644
index 000000000000..628a59ec0a2e
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.0_install-vsx-vec-headers.patch
@@ -0,0 +1,38 @@
+Add missing headers to the installation which fixes e.g. test_cpp_extensions_aot_ninja
+See https://github.com/pytorch/pytorch/pull/85547
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt
+index 5710eca27c..0cd15c2ad2 100644
+--- a/aten/src/ATen/CMakeLists.txt
++++ b/aten/src/ATen/CMakeLists.txt
+@@ -56,7 +56,7 @@ if(NOT BUILD_CAFFE2 AND NOT BUILD_LITE_INTERPRETER)
+   EXCLUDE(ATen_CORE_TEST_SRCS "${ATen_CORE_TEST_SRCS}" ${ATen_CORE_EXCLUDED_TEST_SRCS})
+ endif()
+ 
+-file(GLOB base_h "*.h" "detail/*.h" "cpu/*.h" "cpu/vec/vec512/*.h" "cpu/vec/vec256/*.h" "cpu/vec/*.h" "quantized/*.h")
++file(GLOB base_h "*.h" "detail/*.h" "cpu/*.h" "cpu/vec/vec512/*.h" "cpu/vec/vec256/*.h" "cpu/vec/vec256/vsx/*.h" "cpu/vec/*.h" "quantized/*.h")
+ file(GLOB base_cpp "*.cpp" "detail/*.cpp" "cpu/*.cpp")
+ file(GLOB cuda_h "cuda/*.h" "cuda/detail/*.h" "cuda/*.cuh" "cuda/detail/*.cuh")
+ file(GLOB cuda_cpp "cuda/*.cpp" "cuda/detail/*.cpp")
+diff --git a/setup.py b/setup.py
+index f1ebab7391..92de0367ac 100644
+--- a/setup.py
++++ b/setup.py
+@@ -942,6 +942,7 @@ if __name__ == '__main__':
+                 'include/ATen/*.h',
+                 'include/ATen/cpu/*.h',
+                 'include/ATen/cpu/vec/vec256/*.h',
++                'include/ATen/cpu/vec/vec256/vsx/*.h',
+                 'include/ATen/cpu/vec/vec512/*.h',
+                 'include/ATen/cpu/vec/*.h',
+                 'include/ATen/core/*.h',
+@@ -1043,6 +1044,7 @@ if __name__ == '__main__':
+                 'include/THH/*.cuh',
+                 'include/THH/*.h*',
+                 'include/THH/generic/*.h',
++                'include/sleef.h',
+                 'share/cmake/ATen/*.cmake',
+                 'share/cmake/Caffe2/*.cmake',
+                 'share/cmake/Caffe2/public/*.cmake',
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.1_skip-test_init_from_local_shards.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.1_skip-test_init_from_local_shards.patch
new file mode 100644
index 000000000000..c63d7193b7ec
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.1_skip-test_init_from_local_shards.patch
@@ -0,0 +1,25 @@
+The test often times out and seems to be considered flaky by PyTorch:
+https://github.com/pytorch/pytorch/issues/78068
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/test/distributed/_shard/sharded_tensor/test_sharded_tensor.py b/test/distributed/_shard/sharded_tensor/test_sharded_tensor.py
+index cbad9458ae..72a36e0e9a 100644
+--- a/test/distributed/_shard/sharded_tensor/test_sharded_tensor.py
++++ b/test/distributed/_shard/sharded_tensor/test_sharded_tensor.py
+@@ -6,6 +6,7 @@ import io
+ import itertools
+ import pickle
+ import sys
++from unittest import skip
+ import torch
+ import torch.distributed as dist
+ from torch.distributed import rpc
+@@ -1817,6 +1818,7 @@ class TestShardedTensorFromLocalShards(ShardedTensorTestBase):
+     @with_comms
+     @skip_if_lt_x_gpu(4)
+     @requires_nccl()
++    @skip("Times out often")
+     def test_init_from_local_shards(self):
+         local_shard_metadata = ShardMetadata(
+             shard_offsets=[(self.rank // 2) * 5, (self.rank % 2) * 5],
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.1_skip-test_sibling_fusion.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.1_skip-test_sibling_fusion.patch
new file mode 100644
index 000000000000..6845706f4848
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.11.1_skip-test_sibling_fusion.patch
@@ -0,0 +1,17 @@
+The test fails on pre-Volta devices because fusing at::sum/at::add
+is skipped if the device does not have compute capability >= 7.
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/test/test_jit_cuda_fuser.py b/test/test_jit_cuda_fuser.py
+index c03ff0b311..feb6e281f9 100644
+--- a/test/test_jit_cuda_fuser.py
++++ b/test/test_jit_cuda_fuser.py
+@@ -3053,6 +3054,7 @@ class TestCudaFuser(JitTestCase):
+             self.assertGraphContainsExactly(graph, op, 0)
+ 
+     @unittest.skipIf(not RUN_CUDA, "requires CUDA")
++    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
+     @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                      "Requires fusion optimization pass to be effective")
+     def test_sibling_fusion(self):