diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.0_disable-cudnn-tf32-for-too-strict-tests.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.0_disable-cudnn-tf32-for-too-strict-tests.patch new file mode 100644 index 000000000000..f991b44530b4 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.0_disable-cudnn-tf32-for-too-strict-tests.patch @@ -0,0 +1,56 @@ +Disallow TF32 on tests with thresholds too strict for this data type. Nvidia +GPUs with TF32 support default to this data type instead of regular FP32 to +improve performance at the expense of precision. +author: Alex Domingo (Vrije Universiteit Brussel) +--- test/test_nn.py.orig 2024-01-15 14:07:35.421908795 +0100 ++++ test/test_nn.py 2024-01-15 14:54:00.867537101 +0100 +@@ -3762,6 +3761,7 @@ + self.assertEqual(weight_data, all_vars[4].data) + + @unittest.skipIf(not TEST_CUDNN, 'CUDNN not available') ++ @torch.backends.cudnn.flags(enabled=True, allow_tf32=False) + def test_cudnn_weight_tying(self): + rnns = [ + nn.LSTM(10, 20, batch_first=True, bidirectional=True), +@@ -4461,6 +4461,7 @@ + self._test_RNN_cpu_vs_cudnn(1) + + @unittest.skipIf(not TEST_CUDNN, "needs cudnn") ++ @torch.backends.cudnn.flags(enabled=True, allow_tf32=False) + def test_RNN_cudnn_weight_norm(self): + input_size = 10 + hidden_size = 6 +@@ -4492,6 +4493,7 @@ + check_weight_norm(nn.LSTM(input_size, hidden_size, num_layers, proj_size=3), 'weight_hr_l0') + + @unittest.skipIf(not TEST_CUDA, 'CUDA not available') ++ @torch.backends.cudnn.flags(enabled=True, allow_tf32=False) + def test_partial_flat_weights(self): + input_size = 10 + hidden_size = 6 +--- test/jit/test_freezing.py.orig 2024-01-15 14:38:11.054125484 +0100 ++++ test/jit/test_freezing.py 2024-01-15 14:49:41.689011617 +0100 +@@ -2733,7 +2733,11 @@ + else: + FileCheck().check("aten::cudnn_convolution_relu").run(frozen_mod.graph) + +- self.assertEqual(mod_eager(inp), frozen_mod(inp)) ++ if not TEST_WITH_ROCM: ++ with torch.backends.cudnn.flags(enabled=True, allow_tf32=False): ++ self.assertEqual(mod_eager(inp), frozen_mod(inp)) ++ else: ++ self.assertEqual(mod_eager(inp), frozen_mod(inp)) + + @unittest.skipIf(not (TEST_CUDNN or TEST_WITH_ROCM), "requires CUDNN") + def test_freeze_conv_relu_fusion_not_forward(self): +--- ../PyTorch/2.1.2/foss-2023a-CUDA-12.1.1/pytorch-v2.1.2/test/nn/test_convolution.py 2023-12-15 03:03:27.000000000 +0100 ++++ test/nn/test_convolution.py 2024-01-15 15:03:15.606208376 +0100 +@@ -518,7 +518,7 @@ + # Covering special case when group > 1, input-channel / group < 16 and output-channel is multiple of 16 + # See also https://github.com/pytorch/pytorch/pull/18463#issuecomment-476563686 + # and https://github.com/pytorch/pytorch/pull/18463#issuecomment-477001024 +- @torch.backends.cudnn.flags(enabled=True, benchmark=False) ++ @torch.backends.cudnn.flags(enabled=True, benchmark=False, allow_tf32=False) + def test_Conv2d_groups_nobias_v2(self): + torch.manual_seed(123) + dev_dtypes = [("cpu", torch.float)] diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.0_skip-test-linalg-svd-complex.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.0_skip-test-linalg-svd-complex.patch new file mode 100644 index 000000000000..92ea36337ebb --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.0_skip-test-linalg-svd-complex.patch @@ -0,0 +1,19 @@ +Skip test_python_ref_meta__refs_linalg_svd_cpu_complex +Result varies depending on underlying device +see https://github.com/pytorch/pytorch/issues/105068 +author: Alex Domingo (Vrije Universiteit Brussel) +--- test/test_ops.py.orig 2024-01-16 15:37:02.596411122 +0100 ++++ test/test_ops.py 2024-01-16 15:39:02.824489395 +0100 +@@ -311,6 +311,12 @@ + return out + return x + ++ # Skip test_python_ref_meta__refs_linalg_svd_cpu_complex ++ # Result varies depending on underlying device ++ # see https://github.com/pytorch/pytorch/issues/105068 ++ if op.name == '_refs.linalg.svd' and dtype in (torch.complex64, torch.complex128): ++ self.skipTest("Unreliable on certain devices, see issue #105068") ++ + # TODO: iterate over requires_grad true/false + for sample in op.reference_inputs(device, dtype, requires_grad=False): + result = op(sample.input, *sample.args, **sample.kwargs) diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2-foss-2023a-CUDA-12.1.1.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2-foss-2023a-CUDA-12.1.1.eb index 102b2735fec0..6ffe56d33ed2 100644 --- a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2-foss-2023a-CUDA-12.1.1.eb +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2-foss-2023a-CUDA-12.1.1.eb @@ -43,6 +43,8 @@ patches = [ 'PyTorch-2.1.0_skip-test_jvp_linalg_det_singular.patch', 'PyTorch-2.1.0_skip-test_linear_fp32-without-MKL.patch', 'PyTorch-2.1.0_skip-test_wrap_bad.patch', + 'PyTorch-2.1.0_skip-test-linalg-svd-complex.patch', + 'PyTorch-2.1.0_disable-cudnn-tf32-for-too-strict-tests.patch', ] checksums = [ {'pytorch-v2.1.2.tar.gz': '85effbcce037bffa290aea775c9a4bad5f769cb229583450c40055501ee1acd7'}, @@ -100,6 +102,10 @@ checksums = [ {'PyTorch-2.1.0_skip-test_linear_fp32-without-MKL.patch': '5dcc79883b6e3ec0a281a8e110db5e0a5880de843bb05653589891f16473ead5'}, {'PyTorch-2.1.0_skip-test_wrap_bad.patch': 'b8583125ee94e553b6f77c4ab4bfa812b89416175dc7e9b7390919f3b485cb63'}, + {'PyTorch-2.1.0_skip-test-linalg-svd-complex.patch': + '5ba7e0b4203ea8c27b55b5231de024004697aca7bbae30aa248524babb451dc7'}, + {'PyTorch-2.1.0_disable-cudnn-tf32-for-too-strict-tests.patch': + '7abccc94f0ae6c317d5d08d4db4e3724eedde8d1d00707e78cf57d8cbf858be5'}, ] osdependencies = [OS_PKG_IBVERBS_DEV]