diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.0_disable-cudnn-tf32-for-too-strict-tests.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.0_disable-cudnn-tf32-for-too-strict-tests.patch
new file mode 100644
index 00000000000..e40b2cccf53
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.0_disable-cudnn-tf32-for-too-strict-tests.patch
@@ -0,0 +1,41 @@
+Disallow TF32 on tests with thresholds too strict for this data type. Nvidia
+GPUs with TF32 support default to this data type instead of regular FP32 to
+improve performance at the expense of precision.
+author: Alex Domingo (Vrije Universiteit Brussel)
+--- test/test_nn.py.orig	2024-01-15 14:07:35.421908795 +0100
++++ test/test_nn.py	2024-01-15 14:54:00.867537101 +0100
+@@ -3762,6 +3761,7 @@
+         self.assertEqual(weight_data, all_vars[4].data)
+ 
+     @unittest.skipIf(not TEST_CUDNN, 'CUDNN not available')
++    @torch.backends.cudnn.flags(enabled=True, allow_tf32=False)
+     def test_cudnn_weight_tying(self):
+         rnns = [
+             nn.LSTM(10, 20, batch_first=True, bidirectional=True),
+@@ -4461,6 +4461,7 @@
+         self._test_RNN_cpu_vs_cudnn(1)
+ 
+     @unittest.skipIf(not TEST_CUDNN, "needs cudnn")
++    @torch.backends.cudnn.flags(enabled=True, allow_tf32=False)
+     def test_RNN_cudnn_weight_norm(self):
+         input_size = 10
+         hidden_size = 6
+@@ -4492,6 +4493,7 @@
+             check_weight_norm(nn.LSTM(input_size, hidden_size, num_layers, proj_size=3), 'weight_hr_l0')
+ 
+     @unittest.skipIf(not TEST_CUDA, 'CUDA not available')
++    @torch.backends.cudnn.flags(enabled=True, allow_tf32=False)
+     def test_partial_flat_weights(self):
+         input_size = 10
+         hidden_size = 6
+--- ../PyTorch/2.1.2/foss-2023a-CUDA-12.1.1/pytorch-v2.1.2/test/nn/test_convolution.py	2023-12-15 03:03:27.000000000 +0100
++++ test/nn/test_convolution.py	2024-01-15 15:03:15.606208376 +0100
+@@ -518,7 +518,7 @@
+     # Covering special case when group > 1, input-channel / group < 16 and output-channel is multiple of 16
+     # See also https://github.com/pytorch/pytorch/pull/18463#issuecomment-476563686
+     # and https://github.com/pytorch/pytorch/pull/18463#issuecomment-477001024
+-    @torch.backends.cudnn.flags(enabled=True, benchmark=False)
++    @torch.backends.cudnn.flags(enabled=True, benchmark=False, allow_tf32=False)
+     def test_Conv2d_groups_nobias_v2(self):
+         torch.manual_seed(123)
+         dev_dtypes = [("cpu", torch.float)]
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.0_skip-test-linalg-svd-complex.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.0_skip-test-linalg-svd-complex.patch
new file mode 100644
index 00000000000..92ea36337eb
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.0_skip-test-linalg-svd-complex.patch
@@ -0,0 +1,19 @@
+Skip test_python_ref_meta__refs_linalg_svd_cpu_complex
+Result varies depending on underlying device
+see https://github.com/pytorch/pytorch/issues/105068
+author: Alex Domingo (Vrije Universiteit Brussel)
+--- test/test_ops.py.orig	2024-01-16 15:37:02.596411122 +0100
++++ test/test_ops.py	2024-01-16 15:39:02.824489395 +0100
+@@ -311,6 +311,12 @@
+                 return out
+             return x
+ 
++        # Skip test_python_ref_meta__refs_linalg_svd_cpu_complex
++        # Result varies depending on underlying device
++        # see https://github.com/pytorch/pytorch/issues/105068
++        if op.name == '_refs.linalg.svd' and dtype in (torch.complex64, torch.complex128):
++            self.skipTest("Unreliable on certain devices, see issue #105068")
++
+         # TODO: iterate over requires_grad true/false
+         for sample in op.reference_inputs(device, dtype, requires_grad=False):
+             result = op(sample.input, *sample.args, **sample.kwargs)
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2-foss-2023a-CUDA-12.1.1.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2-foss-2023a-CUDA-12.1.1.eb
new file mode 100644
index 00000000000..cb05d00c73e
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2-foss-2023a-CUDA-12.1.1.eb
@@ -0,0 +1,221 @@
+name = 'PyTorch'
+version = '2.1.2'
+versionsuffix = '-CUDA-%(cudaver)s'
+
+homepage = 'https://pytorch.org/'
+description = """Tensors and Dynamic neural networks in Python with strong GPU acceleration.
+PyTorch is a deep learning framework that puts Python first."""
+
+toolchain = {'name': 'foss', 'version': '2023a'}
+
+source_urls = [GITHUB_RELEASE]
+sources = ['%(namelower)s-v%(version)s.tar.gz']
+patches = [
+    'PyTorch-1.7.0_disable-dev-shm-test.patch',
+    'PyTorch-1.11.1_skip-test_init_from_local_shards.patch',
+    'PyTorch-1.12.1_add-hypothesis-suppression.patch',
+    'PyTorch-1.12.1_fix-test_cpp_extensions_jit.patch',
+    'PyTorch-1.12.1_fix-TestTorch.test_to.patch',
+    'PyTorch-1.12.1_skip-test_round_robin.patch',
+    'PyTorch-1.13.1_fix-gcc-12-warning-in-fbgemm.patch',
+    'PyTorch-1.13.1_fix-protobuf-dependency.patch',
+    'PyTorch-1.13.1_fix-warning-in-test-cpp-api.patch',
+    'PyTorch-1.13.1_skip-failing-singular-grad-test.patch',
+    'PyTorch-1.13.1_skip-tests-without-fbgemm.patch',
+    'PyTorch-2.0.1_avoid-test_quantization-failures.patch',
+    'PyTorch-2.0.1_fix-skip-decorators.patch',
+    'PyTorch-2.0.1_fix-ub-in-inductor-codegen.patch',
+    'PyTorch-2.0.1_fix-vsx-loadu.patch',
+    'PyTorch-2.0.1_no-cuda-stubs-rpath.patch',
+    'PyTorch-2.0.1_skip-failing-gradtest.patch',
+    'PyTorch-2.0.1_skip-test_shuffle_reproducibility.patch',
+    'PyTorch-2.0.1_skip-tests-skipped-in-subprocess.patch',
+    'PyTorch-2.1.0_disable-gcc12-warning.patch',
+    'PyTorch-2.1.0_disable-cudnn-tf32-for-too-strict-tests.patch',
+    'PyTorch-2.1.0_fix-bufferoverflow-in-oneDNN.patch',
+    'PyTorch-2.1.0_fix-test_numpy_torch_operators.patch',
+    'PyTorch-2.1.0_fix-validationError-output-test.patch',
+    'PyTorch-2.1.0_fix-vsx-vector-shift-functions.patch',
+    'PyTorch-2.1.0_increase-tolerance-functorch-test_vmapvjpvjp.patch',
+    'PyTorch-2.1.0_remove-sparse-csr-nnz-overflow-test.patch',
+    'PyTorch-2.1.0_remove-test-requiring-online-access.patch',
+    'PyTorch-2.1.0_skip-diff-test-on-ppc.patch',
+    'PyTorch-2.1.0_skip-dynamo-test_predispatch.patch',
+    'PyTorch-2.1.0_skip-test_jvp_linalg_det_singular.patch',
+    'PyTorch-2.1.0_skip-test_linear_fp32-without-MKL.patch',
+    'PyTorch-2.1.0_skip-test_wrap_bad.patch',
+    'PyTorch-2.1.2_add-cuda-skip-markers.patch',
+    'PyTorch-2.1.2_fix-conj-mismatch-test-failures.patch',
+    'PyTorch-2.1.2_fix-device-mesh-check.patch',
+    'PyTorch-2.1.2_fix-test_extension_backend-without-vectorization.patch',
+    'PyTorch-2.1.2_fix-test_memory_profiler.patch',
+    'PyTorch-2.1.2_fix-test_torchinductor-rounding.patch',
+    'PyTorch-2.1.2_fix-vsx-vector-abs.patch',
+    'PyTorch-2.1.2_fix-vsx-vector-div.patch',
+    'PyTorch-2.1.2_relax-cuda-tolerances.patch',
+    'PyTorch-2.1.2_remove-nccl-backend-default-without-gpus.patch',
+    'PyTorch-2.1.2_skip-cpu_repro-test-without-vectorization.patch',
+    'PyTorch-2.1.2_skip-failing-test_dtensor_ops-subtests.patch',
+    'PyTorch-2.1.2_workaround_dynamo_failure_without_nnpack.patch',
+]
+checksums = [
+    {'pytorch-v2.1.2.tar.gz': '85effbcce037bffa290aea775c9a4bad5f769cb229583450c40055501ee1acd7'},
+    {'PyTorch-1.7.0_disable-dev-shm-test.patch': '622cb1eaeadc06e13128a862d9946bcc1f1edd3d02b259c56a9aecc4d5406b8a'},
+    {'PyTorch-1.11.1_skip-test_init_from_local_shards.patch':
+     '4aeb1b0bc863d4801b0095cbce69f8794066748f0df27c6aaaf729c5ecba04b7'},
+    {'PyTorch-1.12.1_add-hypothesis-suppression.patch':
+     'e71ffb94ebe69f580fa70e0de84017058325fdff944866d6bd03463626edc32c'},
+    {'PyTorch-1.12.1_fix-test_cpp_extensions_jit.patch':
+     '1efc9850c431d702e9117d4766277d3f88c5c8b3870997c9974971bce7f2ab83'},
+    {'PyTorch-1.12.1_fix-TestTorch.test_to.patch': '75f27987c3f25c501e719bd2b1c70a029ae0ee28514a97fe447516aee02b1535'},
+    {'PyTorch-1.12.1_skip-test_round_robin.patch': '63d4849b78605aa088fdff695637d9473ea60dee603a3ff7f788690d70c55349'},
+    {'PyTorch-1.13.1_fix-gcc-12-warning-in-fbgemm.patch':
+     '5c7be91a6096083a0b1315efe0001537499c600f1f569953c6a2c7f4cc1d0910'},
+    {'PyTorch-1.13.1_fix-protobuf-dependency.patch':
+     '8bd755a0cab7233a243bc65ca57c9630dfccdc9bf8c9792f0de4e07a644fcb00'},
+    {'PyTorch-1.13.1_fix-warning-in-test-cpp-api.patch':
+     'bdde0f2105215c95a54de64ec4b1a4520528510663174fef6d5b900eb1db3937'},
+    {'PyTorch-1.13.1_skip-failing-singular-grad-test.patch':
+     '72688a57b2bb617665ad1a1d5e362c5111ae912c10936bb38a089c0204729f48'},
+    {'PyTorch-1.13.1_skip-tests-without-fbgemm.patch':
+     '481e595f673baf8ae58b41697a6792b83048b0264aa79b422f48cd8c22948bb7'},
+    {'PyTorch-2.0.1_avoid-test_quantization-failures.patch':
+     '02e3f47e4ed1d7d6077e26f1ae50073dc2b20426269930b505f4aefe5d2f33cd'},
+    {'PyTorch-2.0.1_fix-skip-decorators.patch': '2039012cef45446065e1a2097839fe20bb29fe3c1dcc926c3695ebf29832e920'},
+    {'PyTorch-2.0.1_fix-ub-in-inductor-codegen.patch':
+     '1b37194f55ae678f3657b8728dfb896c18ffe8babe90987ce468c4fa9274f357'},
+    {'PyTorch-2.0.1_fix-vsx-loadu.patch': 'a0ffa61da2d47c6acd09aaf6d4791e527d8919a6f4f1aa7ed38454cdcadb1f72'},
+    {'PyTorch-2.0.1_no-cuda-stubs-rpath.patch': '8902e58a762240f24cdbf0182e99ccdfc2a93492869352fcb4ca0ec7e407f83a'},
+    {'PyTorch-2.0.1_skip-failing-gradtest.patch': '8030bdec6ba49b057ab232d19a7f1a5e542e47e2ec340653a246ec9ed59f8bc1'},
+    {'PyTorch-2.0.1_skip-test_shuffle_reproducibility.patch':
+     '7047862abc1abaff62954da59700f36d4f39fcf83167a638183b1b7f8fec78ae'},
+    {'PyTorch-2.0.1_skip-tests-skipped-in-subprocess.patch':
+     '166c134573a95230e39b9ea09ece3ad8072f39d370c9a88fb2a1e24f6aaac2b5'},
+    {'PyTorch-2.1.0_disable-gcc12-warning.patch': 'c858b8db0010f41005dc06f9a50768d0d3dc2d2d499ccbdd5faf8a518869a421'},
+    {'PyTorch-2.1.0_disable-cudnn-tf32-for-too-strict-tests.patch':
+     'd895018ebdfd46e65d9f7645444a3b4c5bbfe3d533a08db559a04be34e01e478'},
+    {'PyTorch-2.1.0_fix-bufferoverflow-in-oneDNN.patch':
+     'b15b1291a3c37bf6a4982cfbb3483f693acb46a67bc0912b383fd98baf540ccf'},
+    {'PyTorch-2.1.0_fix-test_numpy_torch_operators.patch':
+     '84bb51a719abc677031a7a3dfe4382ff098b0cbd8b39b8bed2a7fa03f80ac1e9'},
+    {'PyTorch-2.1.0_fix-validationError-output-test.patch':
+     '7eba0942afb121ed92fac30d1529447d892a89eb3d53c565f8e9d480e95f692b'},
+    {'PyTorch-2.1.0_fix-vsx-vector-shift-functions.patch':
+     '3793b4b878be1abe7791efcbd534774b87862cfe7dc4774ca8729b6cabb39e7e'},
+    {'PyTorch-2.1.0_increase-tolerance-functorch-test_vmapvjpvjp.patch':
+     'aef38adf1210d0c5455e91d7c7a9d9e5caad3ae568301e0ba9fc204309438e7b'},
+    {'PyTorch-2.1.0_remove-sparse-csr-nnz-overflow-test.patch':
+     '0ac36411e76506b3354c85a8a1260987f66af947ee52ffc64230aee1fa02ea8b'},
+    {'PyTorch-2.1.0_remove-test-requiring-online-access.patch':
+     '35184b8c5a1b10f79e511cc25db3b8a5585a5d58b5d1aa25dd3d250200b14fd7'},
+    {'PyTorch-2.1.0_skip-diff-test-on-ppc.patch': '394157dbe565ffcbc1821cd63d05930957412156cc01e949ef3d3524176a1dda'},
+    {'PyTorch-2.1.0_skip-dynamo-test_predispatch.patch':
+     '6298daf9ddaa8542850eee9ea005f28594ab65b1f87af43d8aeca1579a8c4354'},
+    {'PyTorch-2.1.0_skip-test_jvp_linalg_det_singular.patch':
+     '5229ca88a71db7667a90ddc0b809b2c817698bd6e9c5aaabd73d3173cf9b99fe'},
+    {'PyTorch-2.1.0_skip-test_linear_fp32-without-MKL.patch':
+     '5dcc79883b6e3ec0a281a8e110db5e0a5880de843bb05653589891f16473ead5'},
+    {'PyTorch-2.1.0_skip-test_wrap_bad.patch': 'b8583125ee94e553b6f77c4ab4bfa812b89416175dc7e9b7390919f3b485cb63'},
+    {'PyTorch-2.1.2_add-cuda-skip-markers.patch': 'd007d6d0cdb533e7d01f503e9055218760123a67c1841c57585385144be18c9a'},
+    {'PyTorch-2.1.2_fix-conj-mismatch-test-failures.patch':
+     'c164357efa4ce88095376e590ba508fc1daa87161e1e59544eda56daac7f2847'},
+    {'PyTorch-2.1.2_fix-device-mesh-check.patch': 'c0efc288bf3d9a9a3c8bbd2691348a589a2677ea43880a8c987db91c8de4806b'},
+    {'PyTorch-2.1.2_fix-test_extension_backend-without-vectorization.patch':
+     'cd1455495886a7d6b2d30d48736eb0103fded21e2e36de6baac719b9c52a1c92'},
+    {'PyTorch-2.1.2_fix-test_memory_profiler.patch':
+     '30b0c9355636c0ab3dedae02399789053825dc3835b4d7dac6e696767772b1ce'},
+    {'PyTorch-2.1.2_fix-test_torchinductor-rounding.patch':
+     'a0ef99192ee2ad1509c78a8377023d5be2b5fddb16f84063b7c9a0b53d979090'},
+    {'PyTorch-2.1.2_fix-vsx-vector-abs.patch': 'd67d32407faed7dc1dbab4bba0e2f7de36c3db04560ced35c94caf8d84ade886'},
+    {'PyTorch-2.1.2_fix-vsx-vector-div.patch': '11f497a6892eb49b249a15320e4218e0d7ac8ae4ce67de39e4a018a064ca1acc'},
+    {'PyTorch-2.1.2_relax-cuda-tolerances.patch': '554ad09787f61080fafdb84216e711e32327aa357e2a9c40bb428eb6503dee6e'},
+    {'PyTorch-2.1.2_remove-nccl-backend-default-without-gpus.patch':
+     'e6a1efe3d127fcbf4723476a7a1c01cfcf2ccb16d1fb250f478192623e8b6a15'},
+    {'PyTorch-2.1.2_skip-cpu_repro-test-without-vectorization.patch':
+     '7ace835af60c58d9e0754a34c19d4b9a0c3a531f19e5d0eba8e2e49206eaa7eb'},
+    {'PyTorch-2.1.2_skip-failing-test_dtensor_ops-subtests.patch':
+     '6cf711bf26518550903b09ed4431de9319791e79d61aab065785d6608fd5cc88'},
+    {'PyTorch-2.1.2_workaround_dynamo_failure_without_nnpack.patch':
+     'fb96eefabf394617bbb3fbd3a7a7c1aa5991b3836edc2e5d2a30e708bfe49ba1'},
+]
+
+osdependencies = [OS_PKG_IBVERBS_DEV]
+
+builddependencies = [
+    ('CMake', '3.26.3'),
+    ('hypothesis', '6.82.0'),
+    # For tests
+    ('pytest-flakefinder', '1.1.0'),
+    ('pytest-rerunfailures', '12.0'),
+    ('pytest-shard', '0.1.2'),
+]
+
+dependencies = [
+    ('CUDA', '12.1.1', '', SYSTEM),
+    ('cuDNN', '8.9.2.26', '-CUDA-%(cudaver)s', SYSTEM),
+    ('magma', '2.7.2', '-CUDA-%(cudaver)s'),
+    ('NCCL', '2.18.3', '-CUDA-%(cudaver)s'),
+    ('Ninja', '1.11.1'),  # Required for JIT compilation of C++ extensions
+    ('Python', '3.11.3'),
+    ('Python-bundle-PyPI', '2023.06'),
+    ('protobuf', '24.0'),
+    ('protobuf-python', '4.24.0'),
+    ('pybind11', '2.11.1'),
+    ('SciPy-bundle', '2023.07'),
+    ('PyYAML', '6.0'),
+    ('MPFR', '4.2.0'),
+    ('GMP', '6.2.1'),
+    ('numactl', '2.0.16'),
+    ('FFmpeg', '6.0'),
+    ('Pillow', '10.0.0'),
+    ('expecttest', '0.1.5'),
+    ('networkx', '3.1'),
+    ('sympy', '1.12'),
+    ('Z3', '4.12.2', '-Python-%(pyver)s'),
+]
+
+use_pip = True
+buildcmd = '%(python)s setup.py build'  # Run the (long) build in the build step
+
+excluded_tests = {
+    '': [
+        # This test seems to take too long on NVIDIA Ampere at least.
+        'distributed/test_distributed_spawn',
+        # Broken on CUDA 11.6/11.7: https://github.com/pytorch/pytorch/issues/75375
+        'distributions/test_constraints',
+        # no xdoctest
+        'doctests',
+        # failing on Broadwell
+        # See https://github.com/easybuilders/easybuild-easyconfigs/issues/17712
+        'test_native_mha',
+        # intermittent failures on various systems
+        # See https://github.com/easybuilders/easybuild-easyconfigs/issues/17712
+        'distributed/rpc/test_tensorpipe_agent',
+        # failures on OmniPath systems, which don't support some optional InfiniBand features
+        # See https://github.com/pytorch/tensorpipe/issues/413
+        'distributed/pipeline/sync/skip/test_gpipe',
+        'distributed/pipeline/sync/skip/test_leak',
+        'distributed/pipeline/sync/test_bugs',
+        'distributed/pipeline/sync/test_inplace',
+        'distributed/pipeline/sync/test_pipe',
+        'distributed/pipeline/sync/test_transparency',
+    ]
+}
+
+runtest = 'cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py --continue-through-error --verbose %(excluded_tests)s'
+
+# Especially test_quantization has a few corner cases that are triggered by the random input values,
+# which cannot be easily avoided, see https://github.com/pytorch/pytorch/issues/107030
+# So allow a low number of tests to fail as the tests "usually" succeed
+max_failed_tests = 50
+
+# The readelf sanity check command can be taken out once the TestRPATH test from
+# https://github.com/pytorch/pytorch/pull/109493 is accepted, since it is then checked as part of the PyTorch test suite
+local_libcaffe2 = "$EBROOTPYTORCH/lib/python%%(pyshortver)s/site-packages/torch/lib/libcaffe2_nvrtc.%s" % SHLIB_EXT
+sanity_check_commands = [
+    "readelf -d %s | egrep 'RPATH|RUNPATH' | grep -v stubs" % local_libcaffe2,
+]
+
+tests = ['PyTorch-check-cpp-extension.py']
+
+moduleclass = 'ai'
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_add-cuda-skip-markers.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_add-cuda-skip-markers.patch
new file mode 100644
index 00000000000..4bf18dde737
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_add-cuda-skip-markers.patch
@@ -0,0 +1,59 @@
+distributed/test_inductor_collectives & distributed/test_dynamo_distributed fail when run without GPUs
+with "RuntimeError: ProcessGroupNCCL is only supported with GPUs, no GPUs found!"
+Skip those in that case.
+See https://github.com/pytorch/pytorch/pull/117741
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/test/distributed/test_dynamo_distributed.py b/test/distributed/test_dynamo_distributed.py
+index c8fae62bd62..6220b62a9a9 100644
+--- a/test/distributed/test_dynamo_distributed.py
++++ b/test/distributed/test_dynamo_distributed.py
+@@ -30,6 +30,7 @@ from torch.testing._internal.common_distributed import (
+     requires_nccl,
+     _dynamo_dist_per_rank_init,
+ )
++from torch.testing._internal.common_utils import requires_cuda
+ import torch._dynamo.logging
+ from torch._dynamo.comptime import comptime
+ 
+@@ -452,6 +453,7 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase):
+ 
+ 
+ @requires_nccl()
++@requires_cuda
+ class TestSingleProc(DynamoDistributedSingleProcTestCase):
+     """
+     Test harness initializes dist process group.
+diff --git a/test/distributed/test_inductor_collectives.py b/test/distributed/test_inductor_collectives.py
+index 9183e2e9ce4..37149865fd9 100644
+--- a/test/distributed/test_inductor_collectives.py
++++ b/test/distributed/test_inductor_collectives.py
+@@ -19,6 +19,7 @@ from torch.testing._internal.common_distributed import (
+     requires_nccl,
+     skip_if_lt_x_gpu,
+ )
++from torch.testing._internal.common_utils import requires_cuda
+ from torch._inductor.compile_fx import compile_fx as inductor_compile_fx
+ from torch._inductor.utils import has_triton, run_and_get_triton_code
+ import torch._dynamo.logging
+@@ -216,6 +217,7 @@ class TestCollectivesMultiProc(DynamoDistributedMultiProcTestCase):
+ 
+ 
+ @requires_nccl()
++@requires_cuda
+ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
+     """
+     Prefer single-proc test runner for basic tests as it is easier to work with.
+diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
+index 1e18ca2afec..bad5af5212f 100644
+--- a/torch/testing/_internal/common_utils.py
++++ b/torch/testing/_internal/common_utils.py
+@@ -1111,6 +1111,7 @@ if TEST_CUDA and 'NUM_PARALLEL_PROCS' in os.environ:
+     # other libraries take up about 11% of space per process
+     torch.cuda.set_per_process_memory_fraction(round(1 / num_procs - .11, 2))
+ 
++requires_cuda = unittest.skipUnless(torch.cuda.is_available(), "Requires CUDA")
+ 
+ def skipIfCrossRef(fn):
+     @wraps(fn)
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_fix-conj-mismatch-test-failures.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_fix-conj-mismatch-test-failures.patch
new file mode 100644
index 00000000000..0fcb86b1624
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_fix-conj-mismatch-test-failures.patch
@@ -0,0 +1,66 @@
+In test_ops.py the tests test_python_ref_meta__refs_linalg_svd_cpu_complex* fail
+when PyTorch is compiled with CUDA support with
+> RuntimeError: Conj mismatch! is_conj is set to False and True
+
+This is a known issue (https://github.com/pytorch/pytorch/issues/105068)
+so don't check the flag in the test.
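+
+As background (an illustrative aside, not part of the fix itself): is_conj() reports a
+lazily-set conjugation bit on the tensor rather than a property of the materialized data, e.g.:
+
+    import torch
+    z = torch.tensor([1 + 2j, 3 - 4j])
+    zc = torch.conj(z)                      # lazy view: only the conj bit is set
+    assert zc.is_conj() and not z.is_conj()
+    assert not zc.resolve_conj().is_conj()  # resolve_conj() materializes the data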
+
+diff --git a/test/test_ops.py b/test/test_ops.py
+index 08a8185d346..dc98e7c7439 100644
+--- a/test/test_ops.py
++++ b/test/test_ops.py
+@@ -302,6 +302,10 @@ class TestCommon(TestCase):
+     @ops(python_ref_db)
+     @skipIfTorchInductor("Takes too long for inductor")
+     def test_python_ref_meta(self, device, dtype, op):
++        CHECK_CONJ_SKIPS = {
++            torch._refs.linalg.svd,
++        }
++
+         with FakeTensorMode() as mode:
+             pass
+ 
+@@ -328,12 +332,12 @@ class TestCommon(TestCase):
+ 
+         if isinstance(result, torch.Tensor):
+             self.assertTrue(isinstance(meta_result, FakeTensor))
+-            prims.utils.compare_tensor_meta(result, meta_result)
++            prims.utils.compare_tensor_meta(result, meta_result, check_conj=op.op not in CHECK_CONJ_SKIPS)
+         elif isinstance(result, Sequence):
+             for a, b in zip(result, meta_result):
+                 if isinstance(a, torch.Tensor) or isinstance(b, torch.Tensor):
+                     self.assertTrue(isinstance(b, FakeTensor))
+-                    prims.utils.compare_tensor_meta(a, b)
++                    prims.utils.compare_tensor_meta(a, b, check_conj=op.op not in CHECK_CONJ_SKIPS)
+ 
+     def _ref_test_helper(
+         self,
+diff --git a/torch/_prims_common/__init__.py b/torch/_prims_common/__init__.py
+index d60931162da..da217470930 100644
+--- a/torch/_prims_common/__init__.py
++++ b/torch/_prims_common/__init__.py
+@@ -90,7 +90,7 @@ def same_shape(a: ShapeType, b: ShapeType) -> bool:
+ 
+ # TODO: look at using torch.testing.assert_close instead with an option
+ # to just compare metadata
+-def compare_tensor_meta(a: TensorLikeType, b: TensorLikeType, check_strides=False):
++def compare_tensor_meta(a: TensorLikeType, b: TensorLikeType, check_strides=False, check_conj=True):
+     """
+     Checks that two tensor likes have the same shape,
+     dtype and device.
+@@ -131,10 +131,11 @@ def compare_tensor_meta(a: TensorLikeType, b: TensorLikeType, check_strides=Fals
+             msg = f"Storage offset mismatch! Storage offsets are {a.storage_offset()} and {b.storage_offset()}!"
+             raise RuntimeError(msg)
+ 
+-    if a.is_conj() != b.is_conj():
+-        raise RuntimeError(
+-            f"Conj mismatch! is_conj is set to {a.is_conj()} and {b.is_conj()}"
+-        )
++    if check_conj:
++        if a.is_conj() != b.is_conj():
++            raise RuntimeError(
++                f"Conj mismatch! is_conj is set to {a.is_conj()} and {b.is_conj()}"
++            )
+ 
+     if a.is_neg() != b.is_neg():
+         raise RuntimeError(
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_fix-device-mesh-check.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_fix-device-mesh-check.patch
new file mode 100644
index 00000000000..e3458341436
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_fix-device-mesh-check.patch
@@ -0,0 +1,22 @@
+Fix error when there are more GPUs than ranks:
+> RuntimeError: DeviceMesh only support homogeneous hardware, but found 4 ranks and 8 cuda devices!
+
+See https://github.com/pytorch/pytorch/pull/111091
+
+diff --git a/torch/distributed/_tensor/device_mesh.py b/torch/distributed/_tensor/device_mesh.py
+index b5e30eeca82..21ba82503a8 100644
+--- a/torch/distributed/_tensor/device_mesh.py
++++ b/torch/distributed/_tensor/device_mesh.py
+@@ -165,7 +165,10 @@ class DeviceMesh:
+         # automatically set the current cuda/cuda-like device base on num of gpu devices available in each host
+         # NOTE: This device selection would only work for homogeneous hardware.
+         num_devices_per_host = device_handle.device_count()
+-        if world_size % num_devices_per_host != 0:
++        if (
++            world_size > num_devices_per_host
++            and world_size % num_devices_per_host != 0
++        ):
+             raise RuntimeError(
+                 f"DeviceMesh only support homogeneous hardware, but found "
+                 f"{world_size} ranks and {num_devices_per_host} {self.device_type} devices!"
+             )
+
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_relax-cuda-tolerances.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_relax-cuda-tolerances.patch
new file mode 100644
index 00000000000..7301efcdd10
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_relax-cuda-tolerances.patch
@@ -0,0 +1,22 @@
+test_jit fails in test_freeze_conv_relu_fusion with
+Mismatched elements: 7 / 30 (23.3%)
+Greatest absolute difference: 3.053247928619385e-05 at index (1, 1, 0, 0, 0) (up to 1e-05 allowed)
+Greatest relative difference: 0.0004548609140329063 at index (3, 1, 0, 0, 0) (up to 1.3e-06 allowed)
+
+Increase the tolerance to allow this to pass.
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/test/jit/test_freezing.py b/test/jit/test_freezing.py
+index c8c1441adbf..e0feffd6bb5 100644
+--- a/test/jit/test_freezing.py
++++ b/test/jit/test_freezing.py
+@@ -2733,7 +2733,7 @@ class TestFrozenOptimizations(JitTestCase):
+         else:
+             FileCheck().check("aten::cudnn_convolution_relu").run(frozen_mod.graph)
+ 
+-        self.assertEqual(mod_eager(inp), frozen_mod(inp))
++        self.assertEqual(mod_eager(inp), frozen_mod(inp), atol=1e-4, rtol=4e-3)
+ 
+     @unittest.skipIf(not (TEST_CUDNN or TEST_WITH_ROCM), "requires CUDNN")
+     def test_freeze_conv_relu_fusion_not_forward(self):
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_remove-nccl-backend-default-without-gpus.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_remove-nccl-backend-default-without-gpus.patch
new file mode 100644
index 00000000000..6506508c916
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_remove-nccl-backend-default-without-gpus.patch
@@ -0,0 +1,41 @@
+In some code paths a ProcessGroupNCCL is created when PyTorch was compiled with NCCL support.
+However, without any GPUs present at runtime its creation fails with
+> RuntimeError: ProcessGroupNCCL is only supported with GPUs, no GPUs found!
+
+Remove NCCL as an available default backend if CUDA isn't available.
+See https://github.com/pytorch/pytorch/issues/117746
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py
+index 098e209264c..80962466bff 100644
+--- a/torch/distributed/distributed_c10d.py
++++ b/torch/distributed/distributed_c10d.py
+@@ -271,9 +271,11 @@ class BackendConfig:
+         if backend == Backend.UNDEFINED:
+             # default config when backend is not specified
+             # supported since PyTorch 2.0
+-            for device in Backend.default_device_backend_map:
+-                if is_backend_available(Backend.default_device_backend_map[device]):
+-                    self.device_backend_map[device] = Backend.default_device_backend_map[device]
++            for device, default_backend in Backend.default_device_backend_map.items():
++                if is_backend_available(default_backend):
++                    if default_backend == Backend.NCCL and not torch.cuda.is_available():
++                        continue
++                    self.device_backend_map[device] = default_backend
+         elif backend.lower() in Backend.backend_list:
+             # Cases for when backend is a single string (without device types)
+             # e.g. "nccl", "gloo", "ucc", "mpi"
+diff --git a/test/distributed/test_c10d_common.py b/test/distributed/test_c10d_common.py
+index a717c875e76..b382ba760f4 100644
+--- a/test/distributed/test_c10d_common.py
++++ b/test/distributed/test_c10d_common.py
+@@ -1775,7 +1775,7 @@ class ProcessGroupWithDispatchedCollectivesTests(MultiProcessTestCase):
+                 if not dist.is_mpi_available():
+                     continue
+             elif backend == dist.Backend.NCCL:
+-                if not dist.is_nccl_available():
++                if not dist.is_nccl_available() or not torch.cuda.is_available():
+                     continue
+             elif backend == dist.Backend.GLOO:
+                 if not dist.is_gloo_available():
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_skip-failing-test_dtensor_ops-subtests.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_skip-failing-test_dtensor_ops-subtests.patch
new file mode 100644
index 00000000000..41bf3660301
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_skip-failing-test_dtensor_ops-subtests.patch
@@ -0,0 +1,36 @@
+test_dtensor_op_db_nn_functional_pad_circular_cpu_float32 may unexpectedly succeed, so just skip it.
+Failure is expected until https://github.com/pytorch/pytorch/commit/9378a2ceda8
+test_dtensor_op_db_nn_functional_multi_head_attention_forward_cpu_float32 fails with
+> NotImplementedError: Operator aten.constant_pad_nd.default does not have a sharding strategy registered.
+Marked xfail in https://github.com/pytorch/pytorch/commit/49d826bcd3de952eb84a33c89ed399a1a2821c15
+test_dtensor_op_db_empty_strided_cpu_float32 doesn't make sense to run in the first place,
+see https://github.com/pytorch/pytorch/issues/118094
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/test/distributed/_tensor/test_dtensor_ops.py b/test/distributed/_tensor/test_dtensor_ops.py
+index b7d453e56be..5a27c7f84da 100644
+--- a/test/distributed/_tensor/test_dtensor_ops.py
++++ b/test/distributed/_tensor/test_dtensor_ops.py
+@@ -147,6 +147,7 @@ dtensor_fails = {
+     xfail("dot"),
+     xfail("einsum"),
+     xfail("empty"),
++    skip("empty_strided"),
+     xfail("empty_like"),
+     xfail("empty_permuted"),
+     xfail("exponential"),
+@@ -359,11 +360,12 @@ dtensor_fails = {
+     xfail("nn.functional.mish"),
+     xfail("nn.functional.mse_loss"),
+     xfail("nn.functional.multi_margin_loss"),
++    skip("nn.functional.multi_head_attention_forward"),
+     xfail("nn.functional.multilabel_margin_loss"),
+     xfail("nn.functional.multilabel_soft_margin_loss"),
+     xfail("nn.functional.nll_loss"),
+     xfail("nn.functional.normalize"),
+-    xfail("nn.functional.pad", "circular"),
++    skip("nn.functional.pad", "circular"),
+     xfail("nn.functional.pad", "constant"),
+     xfail("nn.functional.pad", "reflect"),
+     xfail("nn.functional.pad", "replicate"),