diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.12.1-foss-2022a-CUDA-11.7.0.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.12.1-foss-2022a-CUDA-11.7.0.eb
index 528fb6715575..a62ed5dddbea 100644
--- a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.12.1-foss-2022a-CUDA-11.7.0.eb
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.12.1-foss-2022a-CUDA-11.7.0.eb
@@ -44,6 +44,7 @@ patches = [
     'PyTorch-1.12.1_skip-ao-sparsity-test-without-fbgemm.patch',
     'PyTorch-1.12.1_skip-failing-grad-test.patch',
     'PyTorch-1.12.1_skip-test_round_robin.patch',
+    'PyTorch-1.12.1_use-predefined-data-in-test-optim.patch',
 ]
 checksums = [
     '031c71073db73da732b5d01710220564ce6dd88d812ba053f0cc94296401eccb',  # pytorch-v1.12.1.tar.gz
@@ -99,6 +100,8 @@ checksums = [
     '1c89e7e67287fe6b9a95480a4178d3653b94d0ab2fe68edf227606c8ae548fdc',  # PyTorch-1.12.1_skip-failing-grad-test.patch
     # PyTorch-1.12.1_skip-test_round_robin.patch
     '63d4849b78605aa088fdff695637d9473ea60dee603a3ff7f788690d70c55349',
+    # PyTorch-1.12.1_use-predefined-data-in-test-optim.patch
+    'a55f5465f5324cddae44416d67ef7506acb3513df7c4efb47db2f19eaa169054',
 ]
 
 osdependencies = [OS_PKG_IBVERBS_DEV]
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.12.1_use-predefined-data-in-test-optim.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.12.1_use-predefined-data-in-test-optim.patch
new file mode 100644
index 000000000000..5dc393c17fd7
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.12.1_use-predefined-data-in-test-optim.patch
@@ -0,0 +1,132 @@
+The test test_optim.test_nadam has a high error rate; this lets the test use
+predefined data instead to be more stable. See
+https://github.com/pytorch/pytorch/issues/98414
+
+Viktor Rehnberg
+diff --git a/test/test_optim.py b/test/test_optim.py
+index 6d587b4b352..c8ec9db87e1 100644
+--- a/test/test_optim.py
++++ b/test/test_optim.py
+@@ -244,8 +244,14 @@ class TestOptim(TestCase):
+             return set(k for k in obj.__dict__ if not k.startswith('_'))
+         self.assertEqual(getPublicAttr(optimizer), getPublicAttr(deepcopy(optimizer)))
+ 
+-    def _test_basic_cases(self, constructor, scheduler_constructors=None,
+-                          ignore_multidevice=False, constructor_accepts_maximize=False):
++    def _test_basic_cases(
++        self,
++        constructor,
++        scheduler_constructors=None,
++        ignore_multidevice=False,
++        constructor_accepts_maximize=False,
++        use_predefined_data=False,
++    ):
+         if scheduler_constructors is None:
+             scheduler_constructors = []
+ 
+@@ -254,26 +260,60 @@
+                 return lambda weight, bias: constructor(weight, bias, maximize)
+             return constructor
+ 
++        def make_weight_tensor():
++            if use_predefined_data:
++                return torch.Tensor([
++                    [ 0.6390, -0.5524, -0.1877, -1.1132,  0.3412],
++                    [-0.6489, -0.6220, -1.2537, -0.0966,  0.5481],
++                    [-0.6923,  0.5768, -0.9141,  1.9410,  1.0036],
++                    [ 0.5842,  1.1618, -0.1871,  1.0344,  0.5668],
++                    [ 0.2123,  2.3076,  0.7522, -0.7059,  1.3849],
++                    [-0.1537,  0.5159, -1.2004,  0.2017, -0.0903],
++                    [ 0.9434, -0.7030,  0.0618, -1.2951,  1.7721],
++                    [ 0.5890, -1.0763, -1.2541, -0.8403, -0.4343],
++                    [-0.2065, -0.6883,  0.8464, -0.7792,  0.6750],
++                    [-1.6577,  0.4532,  0.0791,  0.2243,  0.1148],
++                ])
++            else:
++                return torch.randn(10, 5)
++
++        def make_bias_tensor():
++            if use_predefined_data:
++                return torch.Tensor([
++                    -2.4031, -0.9295, -1.0762, 0.4600, -1.8620, -0.6234, 0.1999, -0.0612, 0.8319, -1.6673,
++                ])
++            else:
++                return torch.randn(10)
++
++        def make_input_tensor():
++            if use_predefined_data:
++                return torch.Tensor([1.1119, -0.4309, -0.7759, -0.0659, 0.4746])
++            else:
++                return torch.randn(5)
++
++        def make_non_contiguous(tensor):
++            return torch.stack([tensor, tensor]).view(*tensor.size(), 2)[..., 0]
++
+         for maximize in (True, False):
+             self._test_state_dict(
+-                torch.randn(10, 5),
+-                torch.randn(10),
+-                torch.randn(5),
++                make_weight_tensor(),
++                make_bias_tensor(),
++                make_input_tensor(),
+                 make_two_arg_constructor(constructor, maximize),
+             )
+             self._test_basic_cases_template(
+-                torch.randn(10, 5),
+-                torch.randn(10),
+-                torch.randn(5),
++                make_weight_tensor(),
++                make_bias_tensor(),
++                make_input_tensor(),
+                 constructor,
+                 scheduler_constructors,
+                 constructor_accepts_maximize,
+             )
+             # non-contiguous parameters
+             self._test_basic_cases_template(
+-                torch.randn(10, 5, 2)[..., 0],
+-                torch.randn(10, 2)[..., 0],
+-                torch.randn(5),
++                make_non_contiguous(make_weight_tensor()),
++                make_non_contiguous(make_bias_tensor()),
++                make_input_tensor(),
+                 constructor,
+                 scheduler_constructors,
+                 constructor_accepts_maximize,
+@@ -282,9 +322,9 @@
+             if not torch.cuda.is_available():
+                 return
+             self._test_basic_cases_template(
+-                torch.randn(10, 5).cuda(),
+-                torch.randn(10).cuda(),
+-                torch.randn(5).cuda(),
++                make_weight_tensor().cuda(),
++                make_bias_tensor().cuda(),
++                make_input_tensor().cuda(),
+                 constructor,
+                 scheduler_constructors,
+                 constructor_accepts_maximize,
+@@ -293,9 +333,9 @@
+             if not torch.cuda.device_count() > 1 or ignore_multidevice:
+                 return
+             self._test_basic_cases_template(
+-                torch.randn(10, 5).cuda(0),
+-                torch.randn(10).cuda(1),
+-                torch.randn(5).cuda(0),
++                make_weight_tensor().cuda(0),
++                make_bias_tensor().cuda(1),
++                make_input_tensor().cuda(0),
+                 constructor,
+                 scheduler_constructors,
+                 constructor_accepts_maximize,
+@@ -668,7 +708,8 @@ class TestOptim(TestCase):
+         self._test_basic_cases(
+             lambda weight, bias: optimizer(
+                 self._build_params_dict(weight, bias, lr=1e-2),
+-                lr=1e-3)
++                lr=1e-3),
++            use_predefined_data=True,
+         )
+         self._test_basic_cases(
+             lambda weight, bias: optimizer([weight, bias], lr=1e-3, weight_decay=0.1, momentum_decay=6e-3)
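The technique in the patch is essentially dependency injection of fixed test data: test_nadam was flaky because each run drew fresh torch.randn inputs, and some draws pushed the optimizer trajectory outside the test's comparison tolerance. The sketch below is a minimal standalone illustration of the same idea; it is not part of the patch, and names like run_nadam_steps are hypothetical. With predefined tensors, repeated runs of an NAdam loop are bit-identical, so a tolerance-based test cannot flake.

    # Hypothetical sketch, not part of the patch: fixed input data makes the
    # optimizer trajectory deterministic, removing the RNG dependence that
    # made test_nadam flaky.
    import torch

    def make_weight():
        # Predefined data, mirroring make_weight_tensor() in the patch,
        # in place of torch.randn(2, 3)
        return torch.tensor([[0.6390, -0.5524, -0.1877],
                             [-0.6489, -0.6220, -1.2537]], requires_grad=True)

    def run_nadam_steps(n_steps=5):
        weight = make_weight()
        opt = torch.optim.NAdam([weight], lr=1e-3)
        for _ in range(n_steps):
            opt.zero_grad()
            (weight ** 2).sum().backward()  # simple quadratic loss
            opt.step()
        return weight.detach().clone()

    # Two runs over the same predefined data are bit-identical; with
    # torch.randn inputs each run would follow a different trajectory,
    # and unlucky draws could exceed the test's tolerance.
    assert torch.equal(run_nadam_steps(), run_nadam_steps())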