diff --git a/easybuild/easyconfigs/c/cuDNN/cuDNN-9.10.2.21-CUDA-12.6.0.eb b/easybuild/easyconfigs/c/cuDNN/cuDNN-9.10.2.21-CUDA-12.6.0.eb new file mode 100644 index 00000000000..427e5309a94 --- /dev/null +++ b/easybuild/easyconfigs/c/cuDNN/cuDNN-9.10.2.21-CUDA-12.6.0.eb @@ -0,0 +1,34 @@ +name = 'cuDNN' +version = '9.10.2.21' +versionsuffix = '-CUDA-%(cudaver)s' +homepage = 'https://developer.nvidia.com/cudnn' +description = """The NVIDIA CUDA Deep Neural Network library (cuDNN) is +a GPU-accelerated library of primitives for deep neural networks.""" + +toolchain = SYSTEM + +source_urls = [ + 'https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-%(cudnnarch)s/' +] +# note: cuDNN is tied to specific CUDA versions, +# see also https://docs.nvidia.com/deeplearning/cudnn/support-matrix/index.html#cudnn-cuda-hardware-versions +sources = ['%(namelower)s-linux-%(cudnnarch)s-%(version)s_cuda%(cudamajver)s-archive.tar.xz'] +checksums = [{ + '%(namelower)s-linux-sbsa-%(version)s_cuda%(cudamajver)s-archive.tar.xz': + '4d57dceba3be27a68b078ce8630525bf40ab7f1b546eb45d0b363c3eeb55f8fa', + '%(namelower)s-linux-x86_64-%(version)s_cuda%(cudamajver)s-archive.tar.xz': + 'd0defcbc4c6dad711ff4cb66d254036a300c9071b07c7b64199aacab534313c1', +}] + +dependencies = [('CUDA', '12.6.0')] + +sanity_check_paths = { + 'files': [ + 'include/cudnn.h', 'lib64/libcudnn_adv_static.a', 'lib64/libcudnn_cnn_static.a', + 'lib64/libcudnn_engines_precompiled_static.a', 'lib64/libcudnn_engines_runtime_compiled_static.a', + 'lib64/libcudnn_graph_static.a', 'lib64/libcudnn_heuristic_static.a', 'lib64/libcudnn_ops_static.a', + ], + 'dirs': ['include', 'lib64'], +} + +moduleclass = 'numlib' diff --git a/easybuild/easyconfigs/n/NCCL/NCCL-2.27.5-GCCcore-13.3.0-CUDA-12.6.0.eb b/easybuild/easyconfigs/n/NCCL/NCCL-2.27.5-GCCcore-13.3.0-CUDA-12.6.0.eb new file mode 100644 index 00000000000..4b1bd8f94a1 --- /dev/null +++ b/easybuild/easyconfigs/n/NCCL/NCCL-2.27.5-GCCcore-13.3.0-CUDA-12.6.0.eb @@ -0,0 +1,26 @@ +name = 'NCCL' +version = '2.27.5' +versionsuffix = '-CUDA-%(cudaver)s' + +homepage = 'https://developer.nvidia.com/nccl' +description = """The NVIDIA Collective Communications Library (NCCL) implements multi-GPU and multi-node collective +communication primitives that are performance optimized for NVIDIA GPUs.""" + +toolchain = {'name': 'GCCcore', 'version': '13.3.0'} + +github_account = 'NVIDIA' +source_urls = [GITHUB_SOURCE] +sources = ['v%(version)s-1.tar.gz'] +checksums = ['e8a8972fc7f7517703510ef23608d41f6484db5331fca37827b4af3f66995344'] + +builddependencies = [('binutils', '2.42')] + +dependencies = [ + ('CUDA', '12.6.0', '', SYSTEM), + ('UCX-CUDA', '1.16.0', versionsuffix), +] + +# default CUDA compute capabilities to use (override via --cuda-compute-capabilities) +cuda_compute_capabilities = ['5.0', '6.0', '7.0', '7.5', '8.0', '8.6', '9.0'] + +moduleclass = 'lib' diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_disable-test_nan_assert.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_disable-test_nan_assert.patch new file mode 100644 index 00000000000..0f60a483e5a --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_disable-test_nan_assert.patch @@ -0,0 +1,57 @@ +Disable a test that has an incomplete skip condition.
+See https://github.com/pytorch/pytorch/pull/167971 + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py +index 0a0f3ee4ca2..aff8ba0156f 100644 +--- a/test/distributed/test_c10d_nccl.py ++++ b/test/distributed/test_c10d_nccl.py +@@ -11,6 +11,7 @@ import sys + import tempfile + import threading + import time ++import unittest + import warnings + from contextlib import contextmanager + from datetime import datetime, timedelta +@@ -295,12 +296,7 @@ class ProcessGroupNCCLGroupTest(MultiProcessTestCase): + # But if we are in Sandcastle, `skip_but_pass_in_sandcastle` would return 0. + TEST_NAN_ASSERT_RETURN = 0 if IS_SANDCASTLE else signal.SIGABRT + self.special_return_code_checks = { +- self.test_nan_assert_float16.__wrapped__: TEST_NAN_ASSERT_RETURN, +- self.test_nan_assert_float32.__wrapped__: TEST_NAN_ASSERT_RETURN, +- self.test_nan_assert_float64.__wrapped__: TEST_NAN_ASSERT_RETURN, +- self.test_nan_assert_bfloat16.__wrapped__: TEST_NAN_ASSERT_RETURN, +- self.test_nan_assert_float8_e4m3fn.__wrapped__: TEST_NAN_ASSERT_RETURN, +- self.test_nan_assert_float8_e5m2.__wrapped__: TEST_NAN_ASSERT_RETURN, ++ + } + + # TORCH_NCCL_BLOCKING_WAIT overrides TORCH_NCCL_ASYNC_ERROR_HANDLING hence tests +@@ -489,24 +485,7 @@ class ProcessGroupNCCLGroupTest(MultiProcessTestCase): + torch.version.cuda is not None and int(torch.version.cuda.split(".")[0]) >= 12 + ) + +- @requires_nccl() +- @skip_but_pass_in_sandcastle_if( +- # skip for cu126 as well due to https://github.com/pytorch/pytorch/issues/153479 +- not (TEST_MULTIGPU and CUDA_12_AND_ABOVE), +- "NCCL test requires 2+ GPUs and Device side assert could cause unexpected errors in lower versions of CUDA", +- ) +- @parametrize( +- "type", +- [ +- torch.float16, +- torch.float32, +- torch.float64, +- torch.bfloat16, +- torch.float8_e4m3fn, +- torch.float8_e5m2, +- ], +- ) +- @skip_if_rocm_multiprocess ++ @unittest.skip("Wrong conditions") + def test_nan_assert(self, type): + # Expecting a device-side error when NaN is detected + os.environ["TORCH_NCCL_NAN_CHECK"] = "1" diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_enable-symbolizer-in-test_workspace_allocation_error.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_enable-symbolizer-in-test_workspace_allocation_error.patch new file mode 100644 index 00000000000..5c35b586ac8 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_enable-symbolizer-in-test_workspace_allocation_error.patch @@ -0,0 +1,28 @@ +CudaGraphTreeTests.test_workspace_allocation_error fails if TORCH_DISABLE_ADDR2LINE=1 is set +> File "/pytorch-v2.9.0/test/inductor/test_cudagraph_trees.py", line 1568, in test_workspace_allocation_error +> self.assertTrue( +> AssertionError: False is not true + +See https://github.com/pytorch/pytorch/issues/103369 + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/inductor/test_cudagraph_trees.py b/test/inductor/test_cudagraph_trees.py +--- a/test/inductor/test_cudagraph_trees.py ++++ b/test/inductor/test_cudagraph_trees.py +@@ -5,6 +5,7 @@ import functools + import gc + import importlib + import itertools ++import os + import re + import sys + import unittest +@@ -1543,6 +1544,7 @@ if HAS_CUDA_AND_TRITON: + @skipIfRocm + @unittest.skipUnless(IS_X86 and IS_LINUX, "cpp contexts are linux only") + @torch._inductor.config.patch("triton.cudagraph_trees_history_recording", True) ++ @unittest.mock.patch.dict(os.environ, {"TORCH_DISABLE_ADDR2LINE": "0"}) + def test_workspace_allocation_error(self): + 
torch._C._cuda_clearCublasWorkspaces() + diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_fix-FP16-CPU-tests-in-test_torchinductor_opinfo.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_fix-FP16-CPU-tests-in-test_torchinductor_opinfo.patch new file mode 100644 index 00000000000..0bf2d29a745 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_fix-FP16-CPU-tests-in-test_torchinductor_opinfo.patch @@ -0,0 +1,28 @@ +Many tests using Float16 on CPU fail with reference_in_float=False +See https://github.com/pytorch/pytorch/issues/169809 + +E.g.: +> TestInductorOpInfoCPU.test_comprehensive_grid_sampler_2d_cpu_float16 +> [...] +> Mismatched elements: 125 / 780 (16.0%) +> Greatest absolute difference: 0.02001953125 at index (0, 1, 3, 2) (up to 1e-05 allowed) +> Greatest relative difference: 2.34375 at index (1, 1, 2, 4) (up to 0.001 allowed) + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/inductor/test_torchinductor_opinfo.py b/test/inductor/test_torchinductor_opinfo.py +index 807ccb48a79..7e5740e0177 100644 +--- a/test/inductor/test_torchinductor_opinfo.py ++++ b/test/inductor/test_torchinductor_opinfo.py +@@ -1329,8 +1329,10 @@ class TestInductorOpInfo(TestCase): + # Triton + if has_triton(): + adjusted_kwargs.update( +- copy_to_gpu=False, reference_in_float=False ++ copy_to_gpu=False, + ) ++ if device_type == GPU_TYPE: ++ adjusted_kwargs['reference_in_float'] = False + + # skip checking gradient on CPU for now + if device_type == GPU_TYPE: diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_fix-attention-squeeze.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_fix-attention-squeeze.patch new file mode 100644 index 00000000000..851ac1f34bd --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_fix-attention-squeeze.patch @@ -0,0 +1,59 @@ +From d55c9d52cda889850484968fc55ee73bf40540ec Mon Sep 17 00:00:00 2001 +From: Chien-Chin Huang +Date: Wed, 17 Sep 2025 18:14:51 -0700 +Subject: [PATCH] [CP] Fix cuDNN CP LSE dimension bug (#163231) + +We should only unsqueeze if necessary. + +Fix https://github.com/pytorch/pytorch/issues/162743 + +Pull Request resolved: https://github.com/pytorch/pytorch/pull/163231 +Approved by: https://github.com/eqy +ghstack dependencies: #162539, #162540, #162541, #163115, #163131 +--- + .../tensor/experimental/_attention.py | 18 +++++++++++++++--- + 1 file changed, 15 insertions(+), 3 deletions(-) + +diff --git a/torch/distributed/tensor/experimental/_attention.py b/torch/distributed/tensor/experimental/_attention.py +index 6336967582429..a3345f37a170d 100644 +--- a/torch/distributed/tensor/experimental/_attention.py ++++ b/torch/distributed/tensor/experimental/_attention.py +@@ -134,6 +134,7 @@ def __init__(self, convert_to_f32: bool, seq_dim: int): + self._seq_dim = seq_dim + self._out: Optional[torch.Tensor] = None + self._lse: Optional[torch.Tensor] = None ++ self._should_lse_squeeze = False + self._convert_to_f32 = convert_to_f32 + self._out_dtype = torch.float32 + self._lse_dtype = torch.float32 +@@ -141,7 +142,14 @@ def __init__(self, convert_to_f32: bool, seq_dim: int): + def _merge_one( + self, block_out: torch.Tensor, block_lse: torch.Tensor, partial: bool + ) -> None: +- block_lse = block_lse.unsqueeze(dim=-1) ++ # The cuDNN backend preserves the last dimension for LSE. ++ # Apply unsqueeze only if the input does not already have ++ # the required dimensionality. 
++ if len(block_lse.shape) < len(block_out.shape): ++ block_lse = block_lse.unsqueeze(dim=-1) ++ self._should_lse_squeeze = True ++ assert len(block_lse.shape) == len(block_out.shape) ++ + if self._lse is None: + self._lse = block_lse + self._out = block_out +@@ -199,8 +207,12 @@ def step(self, out: torch.Tensor, lse: torch.Tensor, partial: bool) -> None: + def results(self) -> tuple[torch.Tensor, torch.Tensor]: + assert self._out is not None + assert self._lse is not None +- out, lse = self._out, self._lse.squeeze(-1) +- return out.to(self._out_dtype), lse.to(self._lse_dtype) ++ out = self._out.to(self._out_dtype) ++ if self._should_lse_squeeze: ++ lse = self._lse.squeeze(-1).to(self._lse_dtype) ++ else: ++ lse = self._lse.to(self._lse_dtype) ++ return out, lse + + + class _AttentionOp(Protocol): diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_fix-nccl-test-env.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_fix-nccl-test-env.patch new file mode 100644 index 00000000000..248d6d934b7 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_fix-nccl-test-env.patch @@ -0,0 +1,55 @@ +From 6702f545d880fd82700811e4a3508cdd76da9a69 Mon Sep 17 00:00:00 2001 +From: Alexander Grund +Date: Tue, 16 Sep 2025 17:37:06 +0000 +Subject: [PATCH] Restore environment after NcclUserBufferRegistrationTest + (#163063) + +This test sets "NCCL_ALGO=NVLS" in NcclUserBufferRegistrationTest which affects tests run in the same process such as `test_on_completion_hook_*` that fail with +> invalid usage (run with NCCL_DEBUG=WARN for details), NCCL version 2.26.2 +> ncclInvalidUsage: This usually reflects invalid usage of NCCL library. +> Last error: +> Error : no algorithm/protocol available for function Broadcast with datatype ncclInt8. NCCL_ALGO was set to NVLS. + +Pull Request resolved: https://github.com/pytorch/pytorch/pull/163063 +Approved by: https://github.com/ezyang +--- + test/distributed/test_c10d_nccl.py | 21 +++++++++++++-------- + 1 file changed, 13 insertions(+), 8 deletions(-) + +diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py +index 0d55845228da..f44394e3148c 100644 +--- a/test/distributed/test_c10d_nccl.py ++++ b/test/distributed/test_c10d_nccl.py +@@ -3145,19 +3145,24 @@ def test_invalid_nccl_blocking_wait_env(self): + class NcclUserBufferRegistrationTest(MultiProcessTestCase): + def setUp(self): + super().setUp() +- # TORCH_NCCL_BLOCKING_WAIT overrides TORCH_NCCL_ASYNC_ERROR_HANDLING hence tests +- # that use TORCH_NCCL_BLOCKING_WAIT will test it as expected. +- os.environ["TORCH_NCCL_ASYNC_ERROR_HANDLING"] = "1" + nccl_debug_file = tempfile.NamedTemporaryFile() +- os.environ["NCCL_ALGO"] = "NVLS" +- os.environ["NCCL_DEBUG"] = "INFO" +- os.environ["NCCL_DEBUG_SUBSYS"] = "NVLS" ++ nccl_env = { ++ # TORCH_NCCL_BLOCKING_WAIT overrides TORCH_NCCL_ASYNC_ERROR_HANDLING hence tests ++ # that use TORCH_NCCL_BLOCKING_WAIT will test it as expected. 
++ "TORCH_NCCL_ASYNC_ERROR_HANDLING": "1", ++ "NCCL_ALGO": "NVLS", ++ "NCCL_DEBUG": "INFO", ++ "NCCL_DEBUG_SUBSYS": "NVLS", ++ "NCCL_DEBUG_FILE": nccl_debug_file.name, ++ } + if torch.cuda.nccl.version() >= (2, 24, 3): +- os.environ["NCCL_DEBUG_SUBSYS"] = "REG,TUNING" +- os.environ["NCCL_DEBUG_FILE"] = nccl_debug_file.name ++ nccl_env["NCCL_DEBUG_SUBSYS"] = "REG,TUNING" ++ self.env_patcher = mock.patch.dict(os.environ, nccl_env) ++ self.env_patcher.start() + self._spawn_processes() + + def tearDown(self): ++ self.env_patcher.stop() + super().tearDown() + try: + os.remove(self.file_name) diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_fix-test_exclude_padding.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_fix-test_exclude_padding.patch new file mode 100644 index 00000000000..b74d565bc51 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_fix-test_exclude_padding.patch @@ -0,0 +1,33 @@ +PadMMTest.test_exclude_padding fails on H100 with +> self.assertTrue(len(local_cache) == 2) +> AssertionError: False is not true + +Increasing the size triggers the intended code. +See https://github.com/pytorch/pytorch/pull/169177 + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/inductor/test_pad_mm.py b/test/inductor/test_pad_mm.py +--- a/test/inductor/test_pad_mm.py ++++ b/test/inductor/test_pad_mm.py +@@ -425,7 +426,10 @@ class PadMMTest(TestCase): + def mm(a, b): + return a @ b + +- mm(torch.rand([25, 25], device="cuda"), torch.rand([25, 25], device="cuda")) ++ # Size must be big enough such that `is_mm_compute_bound` returns True and we need padding to 4 elements ++ # machine balance is ~8.3 (A100), 14.1 (H100), size must be 3x that, see arithmetic_intensity for M=N=K ++ size = [59, 59] ++ mm(torch.rand(size, device="cuda"), torch.rand(size, device="cuda")) + local_cache = get_pad_cache().get_local_cache() + self.assertTrue(len(local_cache) == 2) + FileCheck().check_count("exclude_pad:False", 2, exactly=True).run( +@@ -436,7 +440,7 @@ class PadMMTest(TestCase): + def mm(a, b): + return (a + 1) @ b + +- mm(torch.rand([25, 25], device="cuda"), torch.rand([25, 25], device="cuda")) ++ mm(torch.rand(size, device="cuda"), torch.rand(size, device="cuda")) + local_cache = get_pad_cache().get_local_cache() + # reuse original base timing + self.assertTrue(len(local_cache) == 3) diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_fix-test_version_error.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_fix-test_version_error.patch new file mode 100644 index 00000000000..819b8577356 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_fix-test_version_error.patch @@ -0,0 +1,27 @@ +TestSaveLoad.test_version_error causes a failure due to TEMPDIR being set by EasyBuild: + +> Ran into the following error when deserializing: [enforce fail at inline_container.cc:332] . 
file in archive is not in a subdirectory tmpi40i4vmn/: easybuild-tmp/archive_version + +Fix the code to handle that, see https://github.com/pytorch/pytorch/pull/169936 + +diff --git a/test/export/test_serialize.py b/test/export/test_serialize.py +index faef9b455a0..e3a463014fb 100644 +--- a/test/export/test_serialize.py ++++ b/test/export/test_serialize.py +@@ -7,6 +7,7 @@ with test_sym_bool) + import copy + import io + import math ++import os + import tempfile + import unittest + import zipfile +@@ -1915,7 +1916,7 @@ class TestSaveLoad(TestCase): + with tempfile.NamedTemporaryFile(suffix=".pt2") as f: + save(ep, f.name) + f.seek(0) +- file_prefix = f.name.split("/")[2].split(".")[0] ++ file_prefix = os.path.splitext(os.path.basename(f.name))[0] + + # Create a new file and copy things over, but modify the + # archive version diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_honor-XDG_CACHE_HOME.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_honor-XDG_CACHE_HOME.patch new file mode 100644 index 00000000000..e2a096dd8b9 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_honor-XDG_CACHE_HOME.patch @@ -0,0 +1,29 @@ +Avoid PyTorch trying to use $HOME if XDG_CACHE_HOME is set. +See https://github.com/pytorch/pytorch/pull/168232 + +Author: Alexander Grund (TU Dresden) + +diff --git a/torch/csrc/distributed/c10d/FlightRecorder.cpp b/torch/csrc/distributed/c10d/FlightRecorder.cpp +--- a/torch/csrc/distributed/c10d/FlightRecorder.cpp ++++ b/torch/csrc/distributed/c10d/FlightRecorder.cpp +@@ -36,8 +36,18 @@ DebugInfoWriter& DebugInfoWriter::getWriter(int rank) { + if (writer_ == nullptr) { + // Attempt to write to running user's HOME directory cache folder - if it + // exists. +- auto homeDir = getCvarString({"HOME"}, "/tmp"); +- auto cacheDirPath = std::filesystem::path(homeDir + "/.cache/torch"); ++ #ifdef _WIN32 ++ const char* cacheHome = nullptr; ++ #else ++ // Uses XDG_CACHE_HOME if it's set ++ const char* cacheHome = std::getenv("XDG_CACHE_HOME"); ++ #endif ++ std::string cacheRoot; ++ if (cacheHome) ++ cacheRoot = cacheHome; ++ else ++ cacheRoot = getCvarString({"HOME"}, "/tmp") + "/.cache"; ++ auto cacheDirPath = std::filesystem::path(cacheRoot + "/torch"); + // Create the .cache directory if it doesn't exist + std::filesystem::create_directories(cacheDirPath); + auto defaultLocation = cacheDirPath / "comm_lib_trace_rank_"; diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_increase-tolerance-in-test_transformers.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_increase-tolerance-in-test_transformers.patch new file mode 100644 index 00000000000..76180cb4481 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_increase-tolerance-in-test_transformers.patch @@ -0,0 +1,21 @@ +When not using Intel MKL this shows a tolerance error in +TestSDPACpuOnlyCPU.test_scaled_dot_product_fused_attention_mask_vs_math_cpu_fused_kernel0_float32_batch_size_12_q_seq_len_1030_kv_seq_len_17_n_head_1_head_dim_8_mask_dim_2_bool_mask_True_train_True_casual_False_set_attn_mask_True_cpu_float32 + +> self.assertEqual(grad_k_actual, grad_k_ref, atol=tol_grad.atol, rtol=tol_grad.rtol) +> Mismatched elements: 1 / 1632 (0.1%) +> Greatest absolute difference: 1.245737075805664e-05 at index (9, 0, 15, 1) (up to 1e-05 allowed) +> Greatest relative difference: 5.157565828994848e-05 at index (9, 0, 15, 1) (up to 5e-06 allowed) + +diff --git a/test/test_transformers.py b/test/test_transformers.py +index 5b240e1f046..2e1b4091d35 100644 +--- a/test/test_transformers.py ++++ 
b/test/test_transformers.py +@@ -2153,6 +2153,8 @@ class TestSDPACpuOnly(NNTestCase): + tol_grad = Tolerances(5e-2, 5e-2) + if dtype is torch.float16: + tol_grad = Tolerances(1e-1, 1e-1) ++ if dtype is torch.float32: ++ tol_grad = Tolerances(1.3e-5, 5.2e-5) + for mask_shape in itertools.product( + [q_seq_len, 1], [kv_seq_len, 1] + ) if mask_dim == 2 else itertools.product( diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_readd-support-for-nvidia-cutlass-python-package.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_readd-support-for-nvidia-cutlass-python-package.patch new file mode 100644 index 00000000000..0e2848280d1 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_readd-support-for-nvidia-cutlass-python-package.patch @@ -0,0 +1,124 @@ +Allow use of the NVIDIA CUTLASS Python package if installed. +See https://github.com/pytorch/pytorch/pull/160180 + +Author: Alexander Grund (TU Dresden) + +diff -ur a/torch/_inductor/codecache.py b/torch/_inductor/codecache.py +--- a/torch/_inductor/codecache.py 2025-10-15 19:15:08.000000000 +0200 ++++ b/torch/_inductor/codecache.py 2025-10-24 18:07:49.519431015 +0200 +@@ -3628,13 +3628,15 @@ + return "nvcc" + + +-def _cutlass_path() -> str: ++def _cutlass_path() -> Optional[str]: + if config.is_fbcode(): + from libfb.py import parutil + + return parutil.get_dir_path("cutlass-4-headers") + else: +- return config.cuda.cutlass_dir ++ from torch._inductor.codegen.cuda.cutlass_utils import try_import_cutlass ++ ++ return config.cuda.cutlass_dir if try_import_cutlass() else None + + + def _cutlass_paths() -> list[str]: +@@ -3649,6 +3651,8 @@ + def _clone_cutlass_paths(build_root: str) -> list[str]: + paths = _cutlass_paths() + cutlass_root = _cutlass_path() ++ if cutlass_root is None: ++ return [] + for path in _cutlass_paths(): + old_path = os.path.join(cutlass_root, path) + new_path = os.path.join(build_root, path) +@@ -3657,10 +3661,12 @@ + + + def _cutlass_include_paths() -> list[str]: +- cutlass_path = _cutlass_path() ++ cutlass_root = _cutlass_path() ++ if cutlass_root is None: ++ return [] + return [ + # Use realpath to get canonical absolute paths, in order not to mess up cache keys +- os.path.realpath(os.path.join(cutlass_path, path)) ++ os.path.realpath(os.path.join(cutlass_root, path)) + for path in _cutlass_paths() + ] + +diff -ur a/torch/_inductor/codegen/cuda/cutlass_utils.py b/torch/_inductor/codegen/cuda/cutlass_utils.py +--- a/torch/_inductor/codegen/cuda/cutlass_utils.py 2025-10-15 19:15:08.000000000 +0200 ++++ b/torch/_inductor/codegen/cuda/cutlass_utils.py 2025-10-24 18:07:49.520431003 +0200 +@@ -1,6 +1,7 @@ + # mypy: allow-untyped-defs + import atexit + import functools ++import importlib.metadata + import logging + import os + import shutil +@@ -15,6 +16,7 @@ + import torch + from torch._inductor.runtime.runtime_utils import dynamo_timed + from torch._inductor.utils import clear_on_fresh_cache ++from torch._vendor.packaging.version import Version + from torch.utils._ordered_set import OrderedSet + + from ... import config +@@ -73,7 +75,9 @@ + """ + We want to support three ways of passing in CUTLASS: + 1. fbcode, handled by the internal build system. +- 2. User specifies cutlass_dir. The default is ../third_party/cutlass/, ++ 2. pip install nvidia-cutlass, which provides the cutlass_library package ++ and the header files in the cutlass_library/source directory. ++ 3. User specifies cutlass_dir. The default is ../third_party/cutlass/, + which is the directory when developers build from source. 
+ """ + if config.is_fbcode(): +@@ -89,6 +93,34 @@ + + return True + ++ try: ++ cutlass_version = Version(importlib.metadata.version("cutlass")) ++ if cutlass_version < Version("3.7"): ++ log.warning("CUTLASS version < 3.7 is not recommended.") ++ ++ import cutlass_library # type: ignore[import-not-found] # noqa: F811 ++ ++ log.debug( ++ "Found cutlass_library in python search path, overriding config.cuda.cutlass_dir" ++ ) ++ cutlass_library_dir = os.path.dirname(cutlass_library.__file__) ++ assert os.path.isdir(cutlass_library_dir), ( ++ f"{cutlass_library_dir} is not a directory" ++ ) ++ config.cuda.cutlass_dir = os.path.abspath( ++ os.path.join( ++ cutlass_library_dir, ++ "source", ++ ) ++ ) ++ ++ return True ++ except (ModuleNotFoundError, importlib.metadata.PackageNotFoundError): ++ log.debug( ++ "cutlass_library not found in sys.path, trying to import from config.cuda.cutlass_dir", ++ exc_info=True, ++ ) ++ + # Copy CUTLASS python scripts to a temp dir and add the temp dir to Python search path. + # This is a temporary hack to avoid CUTLASS module naming conflicts. + # TODO(ipiszy): remove this hack when CUTLASS solves Python scripts packaging structure issues. +@@ -156,7 +188,7 @@ + ) + + try: +- import cutlass # noqa: F401, F811 ++ import cutlass # noqa: F401 + import cutlass_library.generator # noqa: F401 + import cutlass_library.library # noqa: F401 + import cutlass_library.manifest # noqa: F401 diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_remove-faulty-close.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_remove-faulty-close.patch new file mode 100644 index 00000000000..0eeea901157 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_remove-faulty-close.patch @@ -0,0 +1,48 @@ +commit d3d62ad44284abff4fcd0c70e245739c976bf5e1 +Author: Alexander Grund +Date: Tue Nov 25 13:54:26 2025 +0100 + + Avoid closing random file handles in Inductor + + `CppCodeCache.load` returns a `ctypes.CDLL`. + That does not have a (Python class) `close` function, so calling + `self.DLL.close()` calls whatever C function with name `close` happens + to exist. This is usually the glibc `close` that closes (file) handles. + As the argument is missing it closes whatever happens to be in the + register at that point. + + In some tests this seems to close "fd=1", i.e. stdout. Subsequent + writes/prints then fail with + > OSError: [Errno 9] Bad file descriptor + + Simply remove the `close` call for now. + +diff --git a/torch/_inductor/autotune_process.py b/torch/_inductor/autotune_process.py +index 1d1687141fb..66b741fafe2 100644 +--- a/torch/_inductor/autotune_process.py ++++ b/torch/_inductor/autotune_process.py +@@ -882,14 +882,6 @@ class CppBenchmarkRequest(CPUDeviceBenchmarkMixin, BenchmarkRequest): + *self.extra_args, + ) + +- def cleanup_run_fn(self) -> None: +- if self.DLL is not None: +- """ +- Check close attr due to it crash on Windows.
+- """ +- if hasattr(self.DLL, "close"): +- self.DLL.close() +- + def __str__(self) -> str: + return f"{self.kernel_name=}" + +@@ -939,9 +931,6 @@ class CuteDSLBenchmarkRequest(GPUDeviceBenchmarkMixin, BenchmarkRequest): + + return run_kernel + +- def cleanup_run_fn(self) -> None: +- """Clean up any resources used by the kernel.""" +- + + @functools.cache + def get_tuning_process_pool() -> TuningProcessPool: diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_revert-pybind11-3-change.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_revert-pybind11-3-change.patch new file mode 100644 index 00000000000..1b831f45fa5 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_revert-pybind11-3-change.patch @@ -0,0 +1,68 @@ +Revert https://github.com/pytorch/pytorch/pull/161063 + +The PR introduced changes required for the pybind11 3.x API which makes it incompatible with pybind11 2.x + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/inductor/test_cpu_cpp_wrapper.py b/test/inductor/test_cpu_cpp_wrapper.py +index 47a8f3aa063..4b4daaef5c4 100644 +--- a/test/inductor/test_cpu_cpp_wrapper.py ++++ b/test/inductor/test_cpu_cpp_wrapper.py +@@ -268,7 +268,7 @@ if RUN_CPU: + "test_multi_threading", + condition=not IS_WINDOWS, + # Two threads compile, so we expect the output code to be printed twice. +- code_string_count={"py::gil_scoped_release_simple release;": 2}, ++ code_string_count={"py::gil_scoped_release release;": 2}, + ), + BaseTest("test_profiler_mark_wrapper_call"), + BaseTest( +diff --git a/torch/_inductor/codegen/cpp_wrapper_cpu.py b/torch/_inductor/codegen/cpp_wrapper_cpu.py +index 83d1d061467..77f9c368ed3 100644 +--- a/torch/_inductor/codegen/cpp_wrapper_cpu.py ++++ b/torch/_inductor/codegen/cpp_wrapper_cpu.py +@@ -585,7 +585,7 @@ class CppWrapperCpu(PythonWrapperCodegen): + # Weights are promoted in the JIT mode + num_args = len(V.graph.graph_inputs) + len(V.graph.constants) + # release GIL to support multiple instances inference (in different threads of the same process) +- self.prefix.splice("py::gil_scoped_release_simple release;") ++ self.prefix.splice("py::gil_scoped_release release;") + + self.prefix.splice( + f""" +@@ -2310,7 +2310,7 @@ class CppWrapperCpu(PythonWrapperCodegen): + + scoped_lines.writeline("{") + with scoped_lines.indent(): +- scoped_lines.writeline("py::gil_scoped_acquire_simple acquire;") ++ scoped_lines.writeline("py::gil_scoped_acquire acquire;") + scoped_lines.writelines(lines_in_scope.split("\n")) + scoped_lines.writelines("}") + return scoped_lines._lines +diff --git a/torch/_inductor/codegen/cpp_wrapper_cpu_array_ref.py b/torch/_inductor/codegen/cpp_wrapper_cpu_array_ref.py +index 63c5bc2debe..fd145ece606 100644 +--- a/torch/_inductor/codegen/cpp_wrapper_cpu_array_ref.py ++++ b/torch/_inductor/codegen/cpp_wrapper_cpu_array_ref.py +@@ -297,7 +297,7 @@ class CppWrapperCpuArrayRef(CppWrapperCpu): + # Weights are promoted in the JIT mode + num_args = len(V.graph.graph_inputs) + len(V.graph.constants) + # release GIL to support multiple instances inference (in different threads of the same process) +- self.prefix.splice("py::gil_scoped_release_simple release;") ++ self.prefix.splice("py::gil_scoped_release release;") + + self.prefix.splice( + f""" +diff --git a/torch/csrc/inductor/cpp_wrapper/common.h b/torch/csrc/inductor/cpp_wrapper/common.h +index a2eebfcc860..9d9ae16462c 100644 +--- a/torch/csrc/inductor/cpp_wrapper/common.h ++++ b/torch/csrc/inductor/cpp_wrapper/common.h +@@ -6,7 +6,8 @@ + #include + + #include +-#include 
++#define PYBIND11_SIMPLE_GIL_MANAGEMENT ++#include + + // Include some often-used cpp_wrapper headers, for precompiling. + #include diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_benchmark_on_non_zero_device.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_benchmark_on_non_zero_device.patch new file mode 100644 index 00000000000..b0a55ad4912 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_benchmark_on_non_zero_device.patch @@ -0,0 +1,23 @@ +inductor/test_benchmark_fusion.py BenchmarkingTest.test_benchmark_on_non_zero_device fails with +> self.assertTrue(hit_count > 0) +> AssertionError: False is not true + +Related: https://github.com/pytorch/pytorch/issues/160514 + +Author: Alexander Grund (TU Dresden) + +--- a/test/inductor/test_benchmark_fusion.py ++++ b/test/inductor/test_benchmark_fusion.py +@@ -206,10 +206,7 @@ if HAS_CUDA_AND_TRITON: + copy_tests(BenchmarkFusionTestTemplate, BenchmarkFusionCudaTest, "cuda") + + class BenchmarkingTest(TestCase): +- @unittest.skipIf( +- torch.cuda.device_count() < 2, "The test need at least 2 devices" +- ) +- @skip_if_cpp_wrapper("This tests triton scheduling directly") ++ @unittest.skip("Mocking fails") + def test_benchmark_on_non_zero_device(self): + hit_count = 0 + with torch.cuda.device("cuda:0"): + diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_convolution1-on-H100.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_convolution1-on-H100.patch new file mode 100644 index 00000000000..e0c0a45b341 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_convolution1-on-H100.patch @@ -0,0 +1,30 @@ +test_select_algorithm.py TestSelectAlgorithm.test_convolution1 fails on H100 with: + +> Mismatched elements: 19584 / 23120 (84.7%) +> Greatest absolute difference: 132.32015991210938 at index (0, 22, 4, 13) (up to 0.0001 allowed) +> Greatest relative difference: inf at index (0, 0, 1, 0) (up to 0.0001 allowed) + +See https://github.com/pytorch/pytorch/issues/143412 + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/inductor/test_select_algorithm.py b/test/inductor/test_select_algorithm.py +index b30cdc2d946..25d3c068133 100644 +--- a/test/inductor/test_select_algorithm.py ++++ b/test/inductor/test_select_algorithm.py +@@ -27,6 +27,7 @@ from torch.testing._internal.common_utils import IS_LINUX, skipIfRocm, skipIfXpu + from torch.testing._internal.inductor_utils import ( + GPU_TYPE, + HAS_GPU, ++ IS_H100, + requires_gpu, + requires_triton, + ) +@@ -295,6 +296,7 @@ class TestSelectAlgorithm(TestCase): + foo(torch.randn(64, 64, device=GPU_TYPE)) + self.assertEqual(counters["inductor"]["select_algorithm_autotune"], 1) + ++ @unittest.skipIf(IS_H100, "Fails on H100, see #143412") + @expectedFailureDynamicWrapper + @patches + def test_convolution1(self): diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_inductor_all_gather_into_tensor_coalesced.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_inductor_all_gather_into_tensor_coalesced.patch new file mode 100644 index 00000000000..fe992ece4f5 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_inductor_all_gather_into_tensor_coalesced.patch @@ -0,0 +1,19 @@ +The test fails with +> RuntimeError: Expected to find "buf0 = torch.ops._c10d_functional.all_gather_into_tensor_coalesced.default([arg3_1, arg2_1, arg1_1, arg0_1]" but did not find it + +Also upstream: https://github.com/pytorch/pytorch/issues/146806 + +Author: Alexander Grund 
(TU Dresden) +diff --git a/test/distributed/test_c10d_functional_native.py b/test/distributed/test_c10d_functional_native.py +index bafc781b591..60fc47f63e4 100644 +--- a/test/distributed/test_c10d_functional_native.py ++++ b/test/distributed/test_c10d_functional_native.py +@@ -997,7 +997,7 @@ class CompileTest(TestCase): + AOTIRunnerUtil.run(func, (arg,)) + torch.cuda.synchronize() + +- @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") ++ @unittest.skip("Fails") + @fresh_cache() + def test_inductor_all_gather_into_tensor_coalesced(self): + def func(args: list[torch.Tensor]) -> torch.Tensor: diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_original_aten_preserved_pad_mm.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_original_aten_preserved_pad_mm.patch new file mode 100644 index 00000000000..88d176f6051 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_original_aten_preserved_pad_mm.patch @@ -0,0 +1,19 @@ +Skip test_pad_mm.py PadMMTest.test_original_aten_preserved_pad_mm failing on: +> File "/dev/shm/pytorch-v2.9.1/test/inductor/test_pad_mm.py", line 538, in test_original_aten_preserved_pad_mm +> self.assertEqual(counters["inductor"]["pattern_matcher_count"], 1) + +See https://github.com/pytorch/pytorch/issues/170562 + +Author: Alexander Grund (TU Dresden) +diff --git a/test/inductor/test_pad_mm.py b/test/inductor/test_pad_mm.py +index 781f4588e14..b6f0fcebb3c 100644 +--- a/test/inductor/test_pad_mm.py ++++ b/test/inductor/test_pad_mm.py +@@ -508,6 +508,7 @@ class PadMMTest(TestCase): + + assert torch.allclose(res2, mm_expected_result), "MM results are not identical" + ++ @unittest.skip("Fails") + @fresh_cache() + @inductor_config.patch( + { diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_override-without-CUDA.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_override-without-CUDA.patch new file mode 100644 index 00000000000..bc2b927e0a0 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_override-without-CUDA.patch @@ -0,0 +1,35 @@ +This test fails during creation of the tests at startup: +> File "/var/lib/jenkins/workspace/test/test_overrides.py", line 683, in _simple_type_parser +> return torch.Stream() +> RuntimeError: CUDA error: CUDA driver version is insufficient for CUDA runtime version + +See https://github.com/pytorch/pytorch/pull/166625 + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/test_overrides.py b/test/test_overrides.py +index 8454677856d..8df233e279f 100644 +--- a/test/test_overrides.py ++++ b/test/test_overrides.py +@@ -9,9 +9,9 @@ import pprint + import pickle + import collections + import unittest +-import os ++import contextlib + +-from torch.testing._internal.common_utils import TestCase, run_tests, TEST_WITH_CROSSREF ++from torch.testing._internal.common_utils import TestCase, run_tests, TEST_CUDA, TEST_WITH_CROSSREF + from torch.overrides import ( + handle_torch_function, + has_torch_function, +@@ -30,8 +30,7 @@ from torch.utils._pytree import tree_map + + Tensor = torch.Tensor + +-if os.getenv("ATEN_CPU_CAPABILITY") in ("default", "avx2"): +- # This test is not supported on ARM ++if not TEST_CUDA: + print( + "Skipping due to failing when cuda build runs on non cuda machine, " + + "see https://github.com/pytorch/pytorch/pull/150059 for example" diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_unbacked_reduction.patch 
b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_unbacked_reduction.patch new file mode 100644 index 00000000000..bfb54615bf5 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-test_unbacked_reduction.patch @@ -0,0 +1,18 @@ +TestInductorDynamicCPU.test_unbacked_reduction_cpu doesn't only fail on ROCM with: +> AssertionError: expected to fail, but actually passed + + +See https://github.com/pytorch/pytorch/issues/154217 + +Author: Alexander Grund (TU Dresden) + +--- a/test/inductor/test_torchinductor_dynamic_shapes.py ++++ b/test/inductor/test_torchinductor_dynamic_shapes.py +@@ -513,6 +513,7 @@ class TestInductorDynamic(TestCase): + ).sum().backward() + self.assertEqual(t.grad, expect) + ++ @unittest.skip("Fails on CPU") + @torch._dynamo.config.patch(capture_scalar_outputs=True) + def test_unbacked_reduction(self, device): + expect_fail = ( diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-tests-requiring-CUDA-12.8.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-tests-requiring-CUDA-12.8.patch new file mode 100644 index 00000000000..a4aadc780df --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-tests-requiring-CUDA-12.8.patch @@ -0,0 +1,122 @@ +These tests use Triton to generate PTX code and then compile that with NVCC. + +As Triton 3.5 uses PTXAS from CUDA 12.8 it cannot be compiled with NVCC from CUDA 12.6. + +Failures look like: +> ptxas /tmp/torchinductor_s3248973/bvqcnu2o7/2mwinejhnbvqcnu2o73mk3zrx6.ptx, line 5; fatal : Unsupported .version 8.7; current version is '8.5' + +in following tests: +- test_simple_multi_arch +- test_compile_after_package_multi_arch +- test_compile_after_package_static +- test_compile_standalone_cos +- test_compile_with_exporter +- test_compile_with_exporter_weights + +See https://github.com/pytorch/pytorch/issues/168353 + +Author: Alexander Grund (TU Dresden) + +--- a/test/inductor/test_aot_inductor.py ++++ b/test/inductor/test_aot_inductor.py +@@ -39,7 +39,7 @@ from torch.export.pt2_archive._package import load_pt2 + from torch.testing import FileCheck + from torch.testing._internal import common_utils + from torch.testing._internal.common_cuda import ( +- _get_torch_cuda_version, ++ requires_triton_ptxas_compat, + PLATFORM_SUPPORTS_FLASH_ATTENTION, + PLATFORM_SUPPORTS_FP8, + PLATFORM_SUPPORTS_MEM_EFF_ATTENTION, +@@ -239,9 +239,7 @@ class AOTInductorTestsTemplate: + # Skip embed_kernel_binary == True for now as it shows random + # failure on CI + @common_utils.parametrize("embed_kernel_binary", [False]) +- @unittest.skipIf( +- _get_torch_cuda_version() < (12, 6), "Test is only supported on CUDA 12.6+" +- ) ++ @requires_triton_ptxas_compat + def test_simple_multi_arch(self, embed_kernel_binary): + if self.device != GPU_TYPE: + raise unittest.SkipTest("requires GPU_TYPE") +diff --git a/test/inductor/test_aot_inductor_package.py b/test/inductor/test_aot_inductor_package.py +index 0eb1057c802..843f63ff17d 100644 +--- a/test/inductor/test_aot_inductor_package.py ++++ b/test/inductor/test_aot_inductor_package.py +@@ -27,7 +27,7 @@ from torch.export.pt2_archive._package import ( + load_pt2, + load_weights_to_pt2_contents, + ) +-from torch.testing._internal.common_cuda import _get_torch_cuda_version ++from torch.testing._internal.common_cuda import _get_torch_cuda_version, requires_triton_ptxas_compat + from torch.testing._internal.common_utils import ( + IS_FBCODE, + skipIfRocm, +@@ -319,9 +319,7 @@ class TestAOTInductorPackage(TestCase): + actual = optimized(*example_inputs) + 
self.assertTrue(torch.allclose(actual, expected)) + +- @unittest.skipIf( +- _get_torch_cuda_version() < (12, 6), "Test is only supported on CUDA 12.6+" +- ) ++ @requires_triton_ptxas_compat + @unittest.skipIf(IS_FBCODE, "cmake won't work in fbcode") + @skipIfRocm # doesn't support multi-arch binary + @skipIfXpu # doesn't support multi-arch binary +@@ -366,9 +364,7 @@ class TestAOTInductorPackage(TestCase): + actual = optimized(*example_inputs) + self.assertTrue(torch.allclose(actual, expected)) + +- @unittest.skipIf( +- _get_torch_cuda_version() < (12, 6), "Test is only supported on CUDA 12.6+" +- ) ++ @requires_triton_ptxas_compat + @unittest.skipIf(IS_FBCODE, "cmake won't work in fbcode") + @skipIfXpu # build system may be different + @torch._inductor.config.patch("test_configs.use_libtorch", True) +@@ -429,6 +425,7 @@ class TestAOTInductorPackage(TestCase): + self.cmake_compile(model, example_inputs, options, "") + + @unittest.skipIf(IS_FBCODE, "cmake won't work in fbcode") ++ @requires_triton_ptxas_compat + @skipIfXpu # build system may be different + @torch._inductor.config.patch("test_configs.use_libtorch", True) + def test_compile_standalone_cos(self): +@@ -461,9 +458,7 @@ class TestAOTInductorPackage(TestCase): + a_path = build_path / "libcos.a" + self.assertTrue(a_path.exists()) + +- @unittest.skipIf( +- _get_torch_cuda_version() < (12, 6), "Test is only supported on CUDA 12.6+" +- ) ++ @requires_triton_ptxas_compat + @unittest.skipIf(IS_FBCODE, "cmake won't work in fbcode") + @skipIfRocm # doesn't support multi-arch binary + @skipIfXpu # doesn't support multi-arch binary +@@ -519,9 +514,7 @@ class TestAOTInductorPackage(TestCase): + " 0 0 0\n 0 0 0\n[ CPUFloatType{3,3} ]\n", + ) + +- @unittest.skipIf( +- _get_torch_cuda_version() < (12, 6), "Test is only supported on CUDA 12.6+" +- ) ++ @requires_triton_ptxas_compat + @unittest.skipIf(IS_FBCODE, "cmake won't work in fbcode") + @skipIfRocm # doesn't support multi-arch binary + @skipIfXpu # doesn't support multi-arch binary +diff --git a/torch/testing/_internal/common_cuda.py b/torch/testing/_internal/common_cuda.py +index be284429114..3bd0e0a904f 100644 +--- a/torch/testing/_internal/common_cuda.py ++++ b/torch/testing/_internal/common_cuda.py +@@ -373,6 +373,11 @@ def xfailIfSM120OrLater(func): + def xfailIfDistributedNotSupported(func): + return func if not (IS_MACOS or IS_JETSON) else unittest.expectedFailure(func) + ++# When using nvcc from the CUDA toolkit, its version must be at least the one from the ptxas bundled with Triton ++TRITON_PTXAS_VERSION = (12, 8) ++requires_triton_ptxas_compat = unittest.skipIf(torch.version.hip is None and _get_torch_cuda_version() < TRITON_PTXAS_VERSION, ++ "Requires CUDA 12.8 to match Triton's ptxas version") ++ + # Importing this module should NOT eagerly initialize CUDA + if not CUDA_ALREADY_INITIALIZED_ON_IMPORT: + assert not torch.cuda.is_initialized() diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-unexpected-success-in-test_fake_export.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-unexpected-success-in-test_fake_export.patch new file mode 100644 index 00000000000..3667657cc17 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_skip-unexpected-success-in-test_fake_export.patch @@ -0,0 +1,104 @@ +Unexpected success in e.g.
TestExportOpInfoCPU.test_fake_export___getitem___cpu_float32 + +Same with PYPI package and reported in https://github.com/pytorch/pytorch/pull/164166 + +Skip all instead of XFailing + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/export/test_export_opinfo.py b/test/export/test_export_opinfo.py +index 35d8b2895bd..409a305a3aa 100644 +--- a/test/export/test_export_opinfo.py ++++ b/test/export/test_export_opinfo.py +@@ -22,54 +22,54 @@ from torch.utils import _pytree as pytree + + # following are failing with regular torch.export.export + export_failures = { +- xfail("allclose"), +- xfail("combinations"), +- xfail("corrcoef"), +- xfail("cov"), +- xfail("equal"), +- xfail("linalg.lstsq"), +- xfail("linalg.lstsq", "grad_oriented"), +- xfail("nn.functional.ctc_loss"), +- xfail("nn.functional.gaussian_nll_loss"), +- xfail("sparse.sampled_addmm"), +- xfail("tensor_split"), ++ skip("allclose"), ++ skip("combinations"), ++ skip("corrcoef"), ++ skip("cov"), ++ skip("equal"), ++ skip("linalg.lstsq"), ++ skip("linalg.lstsq", "grad_oriented"), ++ skip("nn.functional.ctc_loss"), ++ skip("nn.functional.gaussian_nll_loss"), ++ skip("sparse.sampled_addmm"), ++ skip("tensor_split"), + } + + # following are failing fake export on cuda device + fake_export_failures = { +- xfail("geqrf"), +- xfail("histogram"), +- xfail("masked.amax"), +- xfail("masked.amin"), +- xfail("masked.argmax"), +- xfail("masked.argmin"), +- xfail("masked.logaddexp"), +- xfail("masked.logsumexp"), +- xfail("masked.mean"), +- xfail("masked.prod"), +- xfail("masked.std"), +- xfail("masked.sum"), +- xfail("masked.var"), +- xfail("nn.functional.grid_sample"), +- xfail("to_sparse"), ++ skip("geqrf"), ++ skip("histogram"), ++ skip("masked.amax"), ++ skip("masked.amin"), ++ skip("masked.argmax"), ++ skip("masked.argmin"), ++ skip("masked.logaddexp"), ++ skip("masked.logsumexp"), ++ skip("masked.mean"), ++ skip("masked.prod"), ++ skip("masked.std"), ++ skip("masked.sum"), ++ skip("masked.var"), ++ skip("nn.functional.grid_sample"), ++ skip("to_sparse"), + # cannot xfail as it is passing for cpu-only build + skip("nn.functional.conv2d"), + skip("nn.functional.scaled_dot_product_attention"), + # following are failing due to OptionalDeviceGuard +- xfail("__getitem__"), +- xfail("nn.functional.batch_norm"), +- xfail("nn.functional.instance_norm"), +- xfail("nn.functional.multi_margin_loss"), +- xfail("nonzero"), ++ skip("__getitem__"), ++ skip("nn.functional.batch_norm"), ++ skip("nn.functional.instance_norm"), ++ skip("nn.functional.multi_margin_loss"), ++ skip("nonzero"), + } + + fake_decomposition_failures = { +- xfail("linalg.matrix_rank"), +- xfail("nn.functional.binary_cross_entropy_with_logits"), +- xfail("nn.functional.instance_norm"), +- xfail("nn.functional.multi_margin_loss"), +- xfail("repeat_interleave"), +- xfail("take"), ++ skip("linalg.matrix_rank"), ++ skip("nn.functional.binary_cross_entropy_with_logits"), ++ skip("nn.functional.instance_norm"), ++ skip("nn.functional.multi_margin_loss"), ++ skip("repeat_interleave"), ++ skip("take"), + } + + diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_update-exptected-output-for-z3-4.13.0.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_update-exptected-output-for-z3-4.13.0.patch new file mode 100644 index 00000000000..4050e2e148c --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.0_update-exptected-output-for-z3-4.13.0.patch @@ -0,0 +1,49 @@ +Revert part of https://github.com/pytorch/pytorch/pull/158905 + +We use Z3 4.13.0 which has different output 
in this case compared to 4.13.1+. +This causes failures in ExcTests.test_trigger_on_error & ExcTests.test_trigger_bisect_on_error + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/dynamo/test_exc.py b/test/dynamo/test_exc.py +--- a/test/dynamo/test_exc.py ++++ b/test/dynamo/test_exc.py +@@ -253,13 +253,13 @@ translation validation failed. + + Model: + ==> L['shape'][0]: 0 +- ==> L['shape'][1]: 0 +- ==> L['shape'][2]: 0 ++ ==> L['shape'][1]: 1 ++ ==> L['shape'][2]: 1 + ==> L['x'].size()[0]: 3 + ==> L['x'].storage_offset(): 0 + ==> L['x'].stride()[0]: 1 +- ==> s3: 0 +- ==> s52: 0 ++ ==> s3: 1 ++ ==> s52: 1 + ==> s77: 3 + ==> s86: 0 + +@@ -317,16 +317,16 @@ Failure occurred while running node: + %split : [num_users=3] = call_method[target=split](args = (%l_x_, (%l_shape_0_, %l_shape_1_, %l_shape_2_)), kwargs = {}) + + Model: +- ==> L['shape'][0]: 0 +- ==> L['shape'][1]: 0 ++ ==> L['shape'][0]: 1 ++ ==> L['shape'][1]: 1 + ==> L['shape'][2]: 0 + ==> L['x'].size()[0]: 3 + ==> L['x'].storage_offset(): 0 + ==> L['x'].stride()[0]: 1 + ==> s3: 0 +- ==> s52: 0 ++ ==> s52: 1 + ==> s77: 3 +- ==> s86: 0 ++ ==> s86: 1 + + Assertions: + ==> (== 0 L['x'].storage_offset()) diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2024a-CUDA-12.6.0.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2024a-CUDA-12.6.0.eb new file mode 100644 index 00000000000..bf1333f3221 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2024a-CUDA-12.6.0.eb @@ -0,0 +1,279 @@ +name = 'PyTorch' +version = '2.9.1' +versionsuffix = '-CUDA-%(cudaver)s' + +homepage = 'https://pytorch.org/' +description = """Tensors and Dynamic neural networks in Python with strong GPU acceleration. +PyTorch is a deep learning framework that puts Python first.""" + +toolchain = {'name': 'foss', 'version': '2024a'} + +local_six_version = '1.11.0' +# This is specific to a (tagged) release. 
+# Extract from `get_disabled_tests` in tools/stats/import_test_stats.py +local_disabled_tests_S3_ID = 'UsscdNP.2GMOzUxAvqIx8GAj4MuhX1Xi' +source_urls = [GITHUB_RELEASE] +sources = [ + '%(namelower)s-v%(version)s.tar.gz', + { + 'filename': '%(name)s-%(version)s-disabled-tests.json', + 'download_filename': f'disabled-tests-condensed.json?versionId={local_disabled_tests_S3_ID}', + 'source_urls': ['https://ossci-metrics.s3.amazonaws.com'], + # See `DEFAULT_DISABLED_TESTS_FILE` in torch/testing/_internal/common_utils.py + 'extract_cmd': 'cp %s %(builddir)s/pytorch-v%(version)s/test/.pytorch-disabled-tests.json', + }, + { + # Avoid downloading this during the build, see third_party/NNPACK/cmake/DownloadSix.cmake for the version + 'filename': f'six-{local_six_version}.tar.gz', + 'source_urls': ['https://pypi.python.org/packages/source/s/six'], + } +] +patches = [ + 'PyTorch-1.12.1_add-hypothesis-suppression.patch', + 'PyTorch-1.7.0_disable-dev-shm-test.patch', + 'PyTorch-2.0.1_skip-tests-skipped-in-subprocess.patch', + 'PyTorch-2.1.0_remove-test-requiring-online-access.patch', + 'PyTorch-2.6.0_fix-server-in-test_control_plane.patch', + 'PyTorch-2.6.0_show-test-duration.patch', + 'PyTorch-2.6.0_skip-test_segfault.patch', + 'PyTorch-2.7.0_avoid_caffe2_test_cpp_jit.patch', + 'PyTorch-2.7.1_avoid-caffe2-sandcastle-test-lib.patch', + 'PyTorch-2.7.1_skip-test_data_parallel_rnn.patch', + 'PyTorch-2.7.1_skip-test_gds_fails_in_ci.patch', + 'PyTorch-2.7.1_skip-test_mixed_mm_exhaustive_dtypes.patch', + 'PyTorch-2.7.1_skip-test_outside_linear_module_free.patch', + 'PyTorch-2.7.1_suport-64bit-BARs.patch', + 'PyTorch-2.7.1_tolerance-test_partial_flat_weights.patch', + 'PyTorch-2.9.0_disable-test_nan_assert.patch', + 'PyTorch-2.9.0_enable-symbolizer-in-test_workspace_allocation_error.patch', + 'PyTorch-2.9.0_fix-attention-squeeze.patch', + 'PyTorch-2.9.0_fix-FP16-CPU-tests-in-test_torchinductor_opinfo.patch', + 'PyTorch-2.9.0_fix-nccl-test-env.patch', + 'PyTorch-2.9.0_fix-test_exclude_padding.patch', + 'PyTorch-2.9.0_fix-test_version_error.patch', + 'PyTorch-2.9.0_honor-XDG_CACHE_HOME.patch', + 'PyTorch-2.9.0_increase-tolerance-in-test_transformers.patch', + 'PyTorch-2.9.0_remove-faulty-close.patch', + 'PyTorch-2.9.0_revert-pybind11-3-change.patch', + 'PyTorch-2.9.0_skip-test_benchmark_on_non_zero_device.patch', + 'PyTorch-2.9.0_skip-test_convolution1-on-H100.patch', + 'PyTorch-2.9.0_skip-test_inductor_all_gather_into_tensor_coalesced.patch', + 'PyTorch-2.9.0_skip-test_original_aten_preserved_pad_mm.patch', + 'PyTorch-2.9.0_skip-test_override-without-CUDA.patch', + 'PyTorch-2.9.0_skip-tests-requiring-CUDA-12.8.patch', + 'PyTorch-2.9.0_skip-test_unbacked_reduction.patch', + 'PyTorch-2.9.0_skip-unexpected-success-in-test_fake_export.patch', + 'PyTorch-2.9.0_update-exptected-output-for-z3-4.13.0.patch', + 'PyTorch-2.9.1_avoid-multiprocess-tests-hanging-forever.patch', + 'PyTorch-2.9.1_check-device-avail-test_schedule.patch', + 'PyTorch-2.9.1_disable-slow-tests.patch', + 'PyTorch-2.9.1_fix-hypothesis-deadline.patch', + 'PyTorch-2.9.1_fix-iteration-in-fligh-reporter.patch', + 'PyTorch-2.9.1_fix-test_dist2-decorators.patch', + 'PyTorch-2.9.1_fix-TestExportOpInfoCPU-with-single-GPU.patch', + 'PyTorch-2.9.1_GCC14-ARM-workaround.patch', + 'PyTorch-2.9.1_ignore-warning-incompatible-pointer-types.patch', + 'PyTorch-2.9.1_normalize_tree_output.patch', + 'PyTorch-2.9.1_set-test-timeout.patch', + 'PyTorch-2.9.1_skip-flex-attention-test_block_mask_non_divisible.patch', + 
'PyTorch-2.9.1_skip-flex-attention-tests-on-unsupported-cpus.patch', + 'PyTorch-2.9.1_skip-RingFlexAttentionTest.patch', + 'PyTorch-2.9.1_skip-test_dtensor_op_db_nn_functional_multi_head_attention_forward_cpu_float32.patch', + 'PyTorch-2.9.1_skip-tests-requiring-SM90.patch', +] +checksums = [ + {'pytorch-v2.9.1.tar.gz': 'e17504700ebc4c87f9b57059df1c4d790b769458c04db144c7a92aea90f2c92b'}, + {'PyTorch-2.9.1-disabled-tests.json': '471f8aa36e056173d09ffd421ead45539a8d35fec6e61a8a0050d92a5fcd9f04'}, + {'six-1.11.0.tar.gz': '70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9'}, + {'PyTorch-1.12.1_add-hypothesis-suppression.patch': + 'e71ffb94ebe69f580fa70e0de84017058325fdff944866d6bd03463626edc32c'}, + {'PyTorch-1.7.0_disable-dev-shm-test.patch': '622cb1eaeadc06e13128a862d9946bcc1f1edd3d02b259c56a9aecc4d5406b8a'}, + {'PyTorch-2.0.1_skip-tests-skipped-in-subprocess.patch': + '166c134573a95230e39b9ea09ece3ad8072f39d370c9a88fb2a1e24f6aaac2b5'}, + {'PyTorch-2.1.0_remove-test-requiring-online-access.patch': + '35184b8c5a1b10f79e511cc25db3b8a5585a5d58b5d1aa25dd3d250200b14fd7'}, + {'PyTorch-2.6.0_fix-server-in-test_control_plane.patch': + '1337689ff28ecaa8d1d0edf60d322bcdd7846fec040925325d357b19eb6e4342'}, + {'PyTorch-2.6.0_show-test-duration.patch': '5508f2f9619204d9f3c356dbd4000a00d58f452ab2d64ae920eb8bc8b5484d75'}, + {'PyTorch-2.6.0_skip-test_segfault.patch': '26806bd62e6b61b56ebaa52d68ca44c415a28124f684bd2fb373557ada68ef52'}, + {'PyTorch-2.7.0_avoid_caffe2_test_cpp_jit.patch': + '2f3255e067f5c6f0d78b4fbce94784c41bddf3d01bab9673856b0d0bbc4e3fec'}, + {'PyTorch-2.7.1_avoid-caffe2-sandcastle-test-lib.patch': + 'aaf22cb431357dc78e4db895d64febf1c7ee187e8ad27bd13544d011127354d4'}, + {'PyTorch-2.7.1_skip-test_data_parallel_rnn.patch': + 'aa85b678e89db4bb41d2c5f4990f0d05959be92e61918291cb5609685b7f1841'}, + {'PyTorch-2.7.1_skip-test_gds_fails_in_ci.patch': + '503030c3591196510a3c2d95db30b28a0b396adb8b50ff0d221f6bdb1f939935'}, + {'PyTorch-2.7.1_skip-test_mixed_mm_exhaustive_dtypes.patch': + '709288abc802c9eb687c15f2677ebaf408d8325a4cb470d23cb72447ee0b8e13'}, + {'PyTorch-2.7.1_skip-test_outside_linear_module_free.patch': + '4916a256b2b9914e4fdb930681b80df93ea561ddee2fc9978c4973a5650be5e9'}, + {'PyTorch-2.7.1_suport-64bit-BARs.patch': '317c3d220aa87426d86e137a6c1a8f910adf9580ca0848371e0f6800c05dbde1'}, + {'PyTorch-2.7.1_tolerance-test_partial_flat_weights.patch': + 'f304440a57e00b8052a5ffbf285adad8d0fdc5a812a659420b59a20deb5a9942'}, + {'PyTorch-2.9.0_disable-test_nan_assert.patch': '98e9f98ce8fb89ae368739bc039be69040ed446a1c74ee5c2a1ef8ba60986c7d'}, + {'PyTorch-2.9.0_enable-symbolizer-in-test_workspace_allocation_error.patch': + 'ba4032b967c0393c916a26fb2b117ba40670ae8e809cb34399a6379b4e523d72'}, + {'PyTorch-2.9.0_fix-attention-squeeze.patch': '8f040e74780cab391bb4c84f86390a13230e1a309ddf65db9900d9a1c66e1288'}, + {'PyTorch-2.9.0_fix-FP16-CPU-tests-in-test_torchinductor_opinfo.patch': + 'b696d7be8c55ff1ccf8731dccf119b8792cd9593eaff457f37e76114e52346d2'}, + {'PyTorch-2.9.0_fix-nccl-test-env.patch': '9326223c400262788734ec608f6134c5d240f4d5315a8d294179a28f885d6845'}, + {'PyTorch-2.9.0_fix-test_exclude_padding.patch': + '349850874fb75d57a24437d871a4994a773e501632ce66a2adca613380a152dc'}, + {'PyTorch-2.9.0_fix-test_version_error.patch': 'b10bb10d0a353e4ba7dbef28ca5fef03a8ba552896e1982708aa90ab6f24f34f'}, + {'PyTorch-2.9.0_honor-XDG_CACHE_HOME.patch': '239631258431174e4aed8947ae6096e003a3213bfbfa112cd0cdebae89469164'}, + {'PyTorch-2.9.0_increase-tolerance-in-test_transformers.patch': + 
'c27ab34900835c2a15edc26d481343a16433bfa52f635a80cbab252c1320a545'}, + {'PyTorch-2.9.0_remove-faulty-close.patch': '32ca744d68dcfa669e46ced9d2776af3dcc380dd9c3458ba7c1c432e5c5295b3'}, + {'PyTorch-2.9.0_revert-pybind11-3-change.patch': + '5289894011fefc67482b1e19c9d1c502e94a943fc7a2d5ed5a6a1eaf444570a0'}, + {'PyTorch-2.9.0_skip-test_benchmark_on_non_zero_device.patch': + '85e236431d1a5da3fb7fccc2554640898c29f5fab46a41d15b3ab61dd1f924fc'}, + {'PyTorch-2.9.0_skip-test_convolution1-on-H100.patch': + '704750c7cc08b58779907d608cd4b7505043e394fb27530b16d72a0dc27c277e'}, + {'PyTorch-2.9.0_skip-test_inductor_all_gather_into_tensor_coalesced.patch': + '644153d4c1d8267c0631df2902a6dfe8ec2a197f3374f2a2f5654e6bd0edc05e'}, + {'PyTorch-2.9.0_skip-test_original_aten_preserved_pad_mm.patch': + 'ac9e05d296cd5ff938a44662cd022efcc8133c744ca82b045c6a15bc64f67cf4'}, + {'PyTorch-2.9.0_skip-test_override-without-CUDA.patch': + '967512d1487bf1ad06982cc5b976c0b38ba062c3f3473cb4542c4b9ac0740662'}, + {'PyTorch-2.9.0_skip-tests-requiring-CUDA-12.8.patch': + '6d79aff5291627b86d8fea025bf2379e4065c7d9cbef5cf83452c35922848728'}, + {'PyTorch-2.9.0_skip-test_unbacked_reduction.patch': + 'b51dd5d7c9cfeed946cbc5c7fc22f2e78e1fa52dda55569b957c20ca4ed01fe8'}, + {'PyTorch-2.9.0_skip-unexpected-success-in-test_fake_export.patch': + '2e73f71ea0f09e613cc4a108893e7948b6daf239e3fe42fd2d3ae5d43c3cf9de'}, + {'PyTorch-2.9.0_update-exptected-output-for-z3-4.13.0.patch': + '5c68e0de73212ed266879f4528a6041ef7ab2f1ac83c6cf7142c4baa78e7664c'}, + {'PyTorch-2.9.1_avoid-multiprocess-tests-hanging-forever.patch': + '86ce380e69b3b20e010d817889cb1b825b05b4054a045b00f2ac12161b77d7e4'}, + {'PyTorch-2.9.1_check-device-avail-test_schedule.patch': + '64c28d38ce69147565509add36d310473ce46f14a0a876d38b5049cb7fce9817'}, + {'PyTorch-2.9.1_disable-slow-tests.patch': '76e6d8f7366b91a0ddc65f73685f2b09988bb5537d10d294f9bb6a48c7fec3d0'}, + {'PyTorch-2.9.1_fix-hypothesis-deadline.patch': 'f7a130669eee9924a303df9e2bd5743ff023a7d994b7a3e43c86dcccf0206c49'}, + {'PyTorch-2.9.1_fix-iteration-in-fligh-reporter.patch': + 'ab408275ec66e836112a50054acc4e789ef38196efeb6137c6061d60d9ac9ead'}, + {'PyTorch-2.9.1_fix-test_dist2-decorators.patch': + 'bf4ed805f00775ed33351de7bce40ebf4eac16aff6c61d2e91790982bc43d73b'}, + {'PyTorch-2.9.1_fix-TestExportOpInfoCPU-with-single-GPU.patch': + 'bdddf5a9ba47d57ec96f4bbefc3b85c4904e44de93dc5c7a65bc03e343035ae9'}, + {'PyTorch-2.9.1_GCC14-ARM-workaround.patch': 'ea8a8662e20fae2fb3a74c7f8bf390aba80a598ab37f9131c720d25ebb14965d'}, + {'PyTorch-2.9.1_ignore-warning-incompatible-pointer-types.patch': + 'c4dad43a5d76e292bb0cada56ea05e8cbd522e3e83749cf3b2c15cd1e4ff6d7b'}, + {'PyTorch-2.9.1_normalize_tree_output.patch': '7d5994580339b73c28de595d9e5a0448db97b7d284f17efd18909e4613d170df'}, + {'PyTorch-2.9.1_set-test-timeout.patch': '15fa1149c250b1333b0bc491f659aaf89d5d6eaf6df5ebc81eea545478c1239c'}, + {'PyTorch-2.9.1_skip-flex-attention-test_block_mask_non_divisible.patch': + 'd8489c192da549083569e09e5f94d2a83c9e41e111b1322f86512a9c5a58c0d9'}, + {'PyTorch-2.9.1_skip-flex-attention-tests-on-unsupported-cpus.patch': + 'e544f765beac7bdb3fc0ada98a3f92fd7e511ed8874de085aa2f213cca769d40'}, + {'PyTorch-2.9.1_skip-RingFlexAttentionTest.patch': + '3cf0b11136fb18c45072687eafd3024d91b504d231a4fa40e04bc62d8d6019c7'}, + {'PyTorch-2.9.1_skip-test_dtensor_op_db_nn_functional_multi_head_attention_forward_cpu_float32.patch': + 'e57486cc42f3dbcae29753168febc251d070a283229e2d76ccbdf19fee53f06e'}, + {'PyTorch-2.9.1_skip-tests-requiring-SM90.patch': + 
'7db02152db2ae70c0fd4c4602fe381e26a74b8e4f7b16b1a3554b2353d761b10'}, +] + +osdependencies = [OS_PKG_IBVERBS_DEV] + +builddependencies = [ + ('CMake', '3.29.3'), + ('hypothesis', '6.103.1'), + ('setuptools', '80.9.0'), + # For tests + ('parameterized', '0.9.0'), + ('pytest-flakefinder', '1.1.0'), + ('pytest-rerunfailures', '15.0'), + ('pytest-shard', '0.1.2'), + ('pytest-subtests', '0.13.1'), + ('tlparse', '0.4.0'), + ('optree', '0.14.1'), + ('unittest-xml-reporting', '3.1.0'), +] + +dependencies = [ + ('CUDA', '12.6.0', '', SYSTEM), + # PyTorch is very sensitive to the NCCL & cuDNN versions. (Maybe the same for cuSPARSELt) + # Prefer those (listed per CUDA version) in + # https://github.com/pytorch/pytorch/blob/main/.github/scripts/generate_binary_build_matrix.py + # or https://github.com/pytorch/pytorch/blob/main/.ci/docker/common/install_cuda.sh + ('NCCL', '2.27.5', versionsuffix), + ('cuDNN', '9.10.2.21', versionsuffix, SYSTEM), + ('magma', '2.9.0', versionsuffix), + ('cuSPARSELt', '0.6.3.2', versionsuffix, SYSTEM), + # Version from .ci/docker/triton_version.txt + ('Triton', '3.5.0', versionsuffix), + ('Ninja', '1.12.1'), # Required for JIT compilation of C++ extensions + ('Python', '3.12.3'), + ('Python-bundle-PyPI', '2024.06'), + ('expecttest', '0.2.1'), + ('GMP', '6.3.0'), + ('MPFR', '4.2.1'), + ('networkx', '3.4.2'), + ('numactl', '2.0.18'), + ('Pillow', '10.4.0'), + ('protobuf-python', '5.28.0'), + ('protobuf', '28.0'), + ('pybind11', '2.12.0'), + ('PuLP', '2.8.0'), + ('PyYAML', '6.0.2'), + ('pyzstd', '0.16.2'), + ('SciPy-bundle', '2024.05'), + ('sympy', '1.13.3'), + ('Z3', '4.13.0',), +] + +prebuildopts = (f"""sed -i '1i set(PYTHON_SIX_SOURCE_DIR "%(builddir)s/six-{local_six_version}")' """ + "cmake/Dependencies.cmake && ") +buildcmd = '%(python)s setup.py build' # Run the (long) build in the build step + +excluded_tests = { + '': [ + # This test seems to take too long on NVIDIA Ampere at least. + 'distributed/test_distributed_spawn', + # no xdoctest + 'doctests', + # intermittent failures on various systems + # See https://github.com/easybuilders/easybuild-easyconfigs/issues/17712 + 'distributed/rpc/test_tensorpipe_agent', + # This test is expected to fail when run in their CI, but won't in our case. 
+ # It just checks for a "CI" env variable + 'test_ci_sanity_check_fail', + # Requires pwlf Python package + 'distributed/_tools/test_sac_ilp', 'distributed/_tools/test_sac_estimator', + # 9 failures in H100, 7 are present in PYPI package, 2 are related to GC in Python < 3.12.4 + 'dynamo/test_dynamic_shapes', + # Broken test: https://github.com/pytorch/pytorch/issues/162179 + 'distributed/_composable/fsdp/test_fully_shard_logging', + # Broken: https://github.com/pytorch/pytorch/issues/137027 + 'inductor/test_extension_backend', + # Requires optional Python packages + 'test_public_bindings', + # 1 Failure and not important + 'dynamo/test_utils', + # Packaging test only, not important for us + 'test_license', + ] +} + +runtest = ( + # Disable symbol resolution in stack traces that can cause hangs and slowdowns + ' TORCH_DISABLE_ADDR2LINE=1' + ' TORCHINDUCTOR_CUTLASS_DIR=%(start_dir)s/third_party/cutlass' + ' PYTEST_ADDOPTS=--full-trace' + ' PYTHONUNBUFFERED=1' + ' %(python)s test/run_test.py' + ' --continue-through-error --pipe-logs --verbose' + ' %(excluded_tests)s' +) + +postinstallcmds = [ + "mkdir %(installdir)s/extra", + "cp -r third_party/cutlass %(installdir)s/extra/", +] + +modextrapaths = {'TORCHINDUCTOR_CUTLASS_DIR': 'extra/cutlass'} + +tests = ['PyTorch-check-cpp-extension.py', 'PyTorch-check-cutlass.py'] + +moduleclass = 'ai' diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_GCC14-ARM-workaround.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_GCC14-ARM-workaround.patch new file mode 100644 index 00000000000..e0504c90d06 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_GCC14-ARM-workaround.patch @@ -0,0 +1,53 @@ +From 8fd509399e25cb4b265dff663d3f777406001f2e Mon Sep 17 00:00:00 2001 +From: Nikita Shulga <2453524+malfet@users.noreply.github.com> +Date: Tue, 10 Feb 2026 04:35:39 +0000 +Subject: [PATCH] Blunter GCC 14.2.0 workaround for SVE compilation (#174647) + +Updated preprocessor directive for GCC version check and removed BF16 condition. I.e. 
right now SVE256 compilation with gcc-14.2 on Debian13 for ` -march=armv8-a+sve+bf16` + +Without the fix, compilation fails with +``` +In file included from /home/dev/git/pytorch/pytorch/build/aten/src/ATen/native/cpu/Unfold2d.cpp.SVE256.cpp:1: +/home/dev/git/pytorch/pytorch/aten/src/ATen/native/cpu/Unfold2d.cpp: In function 'void at::native::{anonymous}::unfolded2d_acc_kernel(c10::ScalarType, void*, void*, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, bool)': +/home/dev/git/pytorch/pytorch/aten/src/ATen/native/cpu/Unfold2d.cpp:225:1: error: unrecognizable insn: + 225 | } + | ^ +(insn 1371 1370 1372 101 (set (reg:VNx16BI 3235) + (unspec:VNx16BI [ + (reg:VNx16BI 3232) + (reg:VNx8BI 3234) + (const_vector:VNx4BI [ + (const_int 0 [0]) repeated x8 + ]) + ] UNSPEC_TRN1_CONV)) "/home/dev/git/pytorch/pytorch/torch/headeronly/util/bit_cast.h":40:14 -1 + (nil)) +during RTL pass: vregs +/home/dev/git/pytorch/pytorch/aten/src/ATen/native/cpu/Unfold2d.cpp:225:1: internal compiler error: in extract_insn, at recog.cc:2812 +``` + +Not sure what compelled me to put such a narrow restriction in https://github.com/pytorch/pytorch/pull/157867 + +Fixes https://github.com/pytorch/pytorch/issues/172630 + +Pull Request resolved: https://github.com/pytorch/pytorch/pull/174647 +Approved by: https://github.com/seemethere +--- + aten/src/ATen/native/cpu/Unfold2d.cpp | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/aten/src/ATen/native/cpu/Unfold2d.cpp b/aten/src/ATen/native/cpu/Unfold2d.cpp +index ed69998e99f79..9ae1391e2603e 100644 +--- a/aten/src/ATen/native/cpu/Unfold2d.cpp ++++ b/aten/src/ATen/native/cpu/Unfold2d.cpp +@@ -169,8 +169,9 @@ void unfolded2d_acc_channels_last( + + /* note: due to write issues, this one cannot be parallelized as well as + * unfolded2d_copy */ +-#if defined(__GNUC__) && __GNUC__ == 14 && defined(__ARM_FEATURE_SVE) && !defined(__ARM_FEATURE_BF16) +-// Workaround for gcc-14.2.0 ICE during RTL pass: vregs when compiling for SVE without BF16 ++#if defined(__GNUC__) && __GNUC__ == 14 && defined(__ARM_FEATURE_SVE) ++// Workaround for gcc-14.2.0 ICE during RTL pass: vregs when compiling for SVE ++// NS: With or without BF16, see https://github.com/pytorch/pytorch/issues/172630 + __attribute__((optimize("no-tree-vectorize"))) + #endif + void unfolded2d_acc_kernel( diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_avoid-multiprocess-tests-hanging-forever.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_avoid-multiprocess-tests-hanging-forever.patch new file mode 100644 index 00000000000..75e8fa00ca0 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_avoid-multiprocess-tests-hanging-forever.patch @@ -0,0 +1,57 @@ +A crashed child process in a test might cause the parent to never complete. +Use a timeout to avoid that. +See https://github.com/pytorch/pytorch/pull/171972 + +Author: Alexander Grund (TU Dresden) + +diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py +index c1f75697fe8..47661c7a1fa 100644 +--- a/torch/testing/_internal/common_distributed.py ++++ b/torch/testing/_internal/common_distributed.py +@@ -621,6 +621,33 @@ def cleanup_temp_dir() -> None: + tmp_dir.cleanup() + + ++def retrieve_result_from_process_queue( ++ process: torch.multiprocessing.Process, ++ completion_queue: torch.multiprocessing.Queue, ++ timeout: Optional[int] = None, ++) -> Any: ++ """Get result from queue associated with process. 
++ ++ When the process finished without putting a result or the timeout expired an exception instance will be returned""" ++ queue_timeout = 120 if timeout is None else max(10, min(120, timeout // 4)) ++ start_time = time.time() ++ # Periodically check the process for liveness ++ while True: ++ try: ++ return completion_queue.get(timeout=queue_timeout) ++ except queue.Empty: ++ # If not alive do a last check because the timeout might have happened just before completion ++ if not process.is_alive() and completion_queue.empty(): ++ # Clean up process to avoid keeping a zombie process ++ process.terminate() # Just to be sure ++ process.join(600) # Usually completes immediately ++ return RuntimeError(f"Exited with {process.exitcode}") ++ if timeout is not None: ++ elapsed = time.time() - start_time ++ if elapsed > timeout: ++ return RuntimeError(f"Process timeout out after {elapsed}s") ++ ++ + # Most tests operate with this worldsize + DEFAULT_WORLD_SIZE = 4 + +@@ -1786,8 +1813,10 @@ class MultiProcContinuousTest(TestCase): + if self.rank == self.MAIN_PROCESS_RANK: + logger.debug(f"Waiting for workers to finish {self.id()}") # noqa: G004 + # Wait for the workers to finish the test +- for i, completion_queue in enumerate(self.completion_queues): +- rv = completion_queue.get() ++ for i, (p, completion_queue) in enumerate( ++ zip(self.processes, self.completion_queues) ++ ): ++ rv = retrieve_result_from_process_queue(p, completion_queue) + if isinstance(rv, BaseException): + # Hit an exception, re-raise it in the main process. + logger.warning( diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_check-device-avail-test_schedule.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_check-device-avail-test_schedule.patch new file mode 100644 index 00000000000..202d1e4a1fc --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_check-device-avail-test_schedule.patch @@ -0,0 +1,19 @@ +Some tests fail if no accelerator is available. +> RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU [...] + +Check for availability to trigger CPU fallback. + +Author: Alexander Grund (TU Dresden) +diff --git a/test/distributed/pipelining/test_schedule.py b/test/distributed/pipelining/test_schedule.py +index dabf3d78a6f..d3b8bf13168 100644 +--- a/test/distributed/pipelining/test_schedule.py ++++ b/test/distributed/pipelining/test_schedule.py +@@ -53,7 +53,7 @@ from torch.testing._internal.distributed.fake_pg import FakeStore + + ARTIFACTS_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "artifacts") + +-device = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu" ++device = acc.type if (acc := torch.accelerator.current_accelerator(check_available=True)) else "cpu" + logger = logging.getLogger(__name__) + torch.manual_seed(0) + diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_disable-slow-tests.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_disable-slow-tests.patch new file mode 100644 index 00000000000..8f6d6e0c767 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_disable-slow-tests.patch @@ -0,0 +1,40 @@ +On CI it defaults to importing JSON files with slow and disabled tests. +Those are then skipped upon execution. + +Enable the default for non-CI environments to cut down testing time. +Don't check for SANDCASTLE when determining whether to skip disabled tests. +However, the disabled-tests JSON file needs to be downloaded from S3 and placed at "tests/.pytorch-disabled-tests.json". 
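+
+As an editorial illustration (not part of this patch) of one way to provide that file when no
+CI download is available: the exact schema of the condensed disabled-tests JSON is an assumption
+inferred from the unpacking patched below (test id mapped to PR number, issue URL and affected
+platforms), and an empty mapping is always a safe fallback that simply disables nothing.
+
+    import json
+    from pathlib import Path
+
+    # Path taken from the description above.
+    disabled = Path("tests/.pytorch-disabled-tests.json")
+    disabled.parent.mkdir(parents=True, exist_ok=True)
+    # Assumed entry shape (PR number, issue URL, platforms), matching the tuple unpacking below:
+    # {"test_foo (__main__.TestBar)": ["12345", "https://github.com/pytorch/pytorch/issues/...", ["linux"]]}
+    disabled.write_text(json.dumps({}))  # empty mapping: no tests are disabled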
+ +Author: Alexander Grund (TU Dresden) + +diff --git a/test/run_test.py b/test/run_test.py +index 44a15d4ab2c..269d4206f3e 100755 +--- a/test/run_test.py ++++ b/test/run_test.py +@@ -531,7 +531,7 @@ def run_test( + + # NB: These features are not available for C++ tests, but there is little incentive + # to implement it because we have never seen a flaky C++ test before. +- if IS_CI and not is_cpp_test: ++ if not is_cpp_test: + ci_args = ["--import-slow-tests", "--import-disabled-tests"] + if RERUN_DISABLED_TESTS: + ci_args.append("--rerun-disabled-tests") +diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py +index bfc568bc146..7ef37cccccb 100644 +--- a/torch/testing/_internal/common_utils.py ++++ b/torch/testing/_internal/common_utils.py +@@ -2722,11 +2722,11 @@ def check_if_enable(test: unittest.TestCase): + if not TEST_WITH_SLOW: + raise unittest.SkipTest("test is slow; run with PYTORCH_TEST_WITH_SLOW to enable test") + +- if not IS_SANDCASTLE: ++ if True: + should_skip = False + skip_msg = "" + +- for disabled_test, (issue_url, platforms) in disabled_tests_dict.items(): ++ for disabled_test, (pr_num, issue_url, platforms) in disabled_tests_dict.items(): + if matches_test(disabled_test): + platform_to_conditional: dict = { + "mac": IS_MACOS, diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-TestExportOpInfoCPU-with-single-GPU.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-TestExportOpInfoCPU-with-single-GPU.patch new file mode 100644 index 00000000000..ebdfb00e0a3 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-TestExportOpInfoCPU-with-single-GPU.patch @@ -0,0 +1,20 @@ +Fixes a failure on systems with a single GPU. +Error in `init_gpu_context` (fake_tensor.py:744): +> E torch.AcceleratorError: CUDA error: invalid device ordinal + +See: https://github.com/pytorch/pytorch/pull/164184 + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/export/test_export_opinfo.py b/test/export/test_export_opinfo.py +--- a/test/export/test_export_opinfo.py ++++ b/test/export/test_export_opinfo.py +@@ -79,7 +79,7 @@ def _test_export_helper(self, dtype, op): + mode = FakeTensorMode(allow_non_fake_inputs=True) + converter = mode.fake_tensor_converter + # intentionally avoid cuda:0 to flush out some bugs +- target_device = "cuda:1" ++ target_device = "cuda:0" + + def to_fake_device(x): + x = converter.from_real_tensor(mode, x) diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-hypothesis-deadline.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-hypothesis-deadline.patch new file mode 100644 index 00000000000..c526ea336c1 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-hypothesis-deadline.patch @@ -0,0 +1,67 @@ +The assertion at the bottom sometimes fails. + +From c4312b443fed1fd8e0e28dfe049ce61226936e99 Mon Sep 17 00:00:00 2001 +From: FFFrog +Date: Thu, 25 Sep 2025 16:32:19 +0800 +Subject: [PATCH] [Tools] Adapting the Hypothesis library (version 5.x) for use + with the PyTorch framework (#163748) + +Starting from version 5.x, the Hypothesis library removed the timeout setting and only retained the deadline. 
+Pull Request resolved: https://github.com/pytorch/pytorch/pull/163748 +Approved by: https://github.com/albanD, https://github.com/Skylion007 +--- + torch/testing/_internal/hypothesis_utils.py | 24 +++++++++++++++------ + 1 file changed, 18 insertions(+), 6 deletions(-) + +diff --git a/torch/testing/_internal/hypothesis_utils.py b/torch/testing/_internal/hypothesis_utils.py +index f02ef4c9e04b0..a00e1e1a048a0 100644 +--- a/torch/testing/_internal/hypothesis_utils.py ++++ b/torch/testing/_internal/hypothesis_utils.py +@@ -7,6 +7,7 @@ + + import hypothesis + from functools import reduce ++from importlib.metadata import version + from hypothesis import assume + from hypothesis import settings + from hypothesis import strategies as st +@@ -346,22 +347,33 @@ def tensor_conv( + + return X, W, b, groups, tr + ++ + # We set the deadline in the currently loaded profile. + # Creating (and loading) a separate profile overrides any settings the user + # already specified. +-hypothesis_version = hypothesis.version.__version_info__ +-current_settings = settings._profiles[settings._current_profile].__dict__ +-current_settings['deadline'] = None +-if hypothesis_version >= (3, 16, 0) and hypothesis_version < (5, 0, 0): +- current_settings['timeout'] = hypothesis.unlimited ++hypothesis_version = tuple(map(int, version("hypothesis").split(".")[:3])) ++ ++if (3, 16, 0) <= hypothesis_version < (3, 27, 0): ++ # Hypothesis 3.16 → 3.26: use `timeout` instead of `deadline` ++ settings.register_profile("no_deadline", timeout=hypothesis.unlimited) ++else: ++ # Hypothesis >=3.27: use `deadline=None` ++ settings.register_profile("no_deadline", deadline=None) ++ ++# Activate the profile ++settings.load_profile("no_deadline") ++ ++ + def assert_deadline_disabled(): ++ """Check that deadlines are effectively disabled across Hypothesis versions.""" + if hypothesis_version < (3, 27, 0): + import warnings ++ + warning_message = ( + "Your version of hypothesis is outdated. " + "To avoid `DeadlineExceeded` errors, please update. " + f"Current hypothesis version: {hypothesis.__version__}" + ) +- warnings.warn(warning_message) ++ warnings.warn(warning_message, stacklevel=2) + else: + assert settings().deadline is None diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-iteration-in-fligh-reporter.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-iteration-in-fligh-reporter.patch new file mode 100644 index 00000000000..3ff313cbe12 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-iteration-in-fligh-reporter.patch @@ -0,0 +1,17 @@ +Avoid an error caused by modifying dict while iterating it. 
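+
+As a minimal editorial illustration (not part of the patch itself): CPython raises
+"RuntimeError: dictionary changed size during iteration" when entries are added to a dict
+while one of its live views is being iterated; snapshotting with list(), as done for
+globals().values() below, avoids that.
+
+    d = {"a": 1}
+    try:
+        for _ in d.values():
+            d["b"] = 2          # mutating the dict during iteration
+    except RuntimeError as err:
+        print(err)              # dictionary changed size during iteration
+
+    for _ in list(d.values()):  # iterate over a snapshot instead
+        d["c"] = 3              # safe: the copy is unaffected by the mutation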
+ +Author: Alexander Grund (TU Dresden) + +diff --git a/tools/flight_recorder/components/types.py b/tools/flight_recorder/components/types.py +index 20e093688ba..98192aeb92c 100644 +--- a/tools/flight_recorder/components/types.py ++++ b/tools/flight_recorder/components/types.py +@@ -164,7 +164,7 @@ class Database(NamedTuple): + # TODO: We need to add a schema for the following + types = [ + TypeInfo.from_type(t) # type: ignore[type-var] +- for t in globals().values() ++ for t in list(globals().values()) + if ( + isinstance(t, type) + and issubclass(t, tuple) diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-test_dist2-decorators.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-test_dist2-decorators.patch new file mode 100644 index 00000000000..fffd633b451 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-test_dist2-decorators.patch @@ -0,0 +1,62 @@ +The requires_gloo/requires_nccl decorator cause the function to just return. +In the way they are used this skips the initialization done by a helper function. +So the test is not skipped and then fails due to missing variables. + +Decorate the class instead. + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/distributed/test_dist2.py b/test/distributed/test_dist2.py +index b335eff1c21..ff5a1e8c028 100644 +--- a/test/distributed/test_dist2.py ++++ b/test/distributed/test_dist2.py +@@ -256,10 +256,10 @@ class Dist2MultiProcessTestCase(MultiProcessTestCase): + self.assertEqual(merged_pg.group_name, "merged_pg") + + ++@requires_gloo() + class ProcessGroupGlooTest(Dist2MultiProcessTestCase): + device = torch.device("cpu") + +- @requires_gloo() + def new_group(self) -> torch.distributed.ProcessGroup: + os.environ["RANK"] = str(self.rank) + os.environ["WORLD_SIZE"] = str(self.world_size) +@@ -273,8 +273,8 @@ class ProcessGroupGlooTest(Dist2MultiProcessTestCase): + ) + + ++@requires_nccl() + class ProcessGroupNCCLTest(Dist2MultiProcessTestCase): +- @requires_nccl() + @skip_if_lt_x_gpu(2) + def new_group(self) -> torch.distributed.ProcessGroup: + os.environ["RANK"] = str(self.rank) +diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py +index c1f75697fe8..d513510d955 100644 +--- a/torch/testing/_internal/common_distributed.py ++++ b/torch/testing/_internal/common_distributed.py +@@ -331,10 +331,7 @@ def with_dist_debug_levels(levels): + + + def requires_gloo(): +- return skip_but_pass_in_sandcastle_if( +- not c10d.is_gloo_available(), +- "c10d was not compiled with the Gloo backend", +- ) ++ return unittest.skipUnless(c10d.is_gloo_available(), "c10d was not compiled with the Gloo backend") + + + def requires_nccl_version(version, msg): +@@ -361,10 +358,7 @@ def requires_nccl_version(version, msg): + + + def requires_nccl(): +- return skip_but_pass_in_sandcastle_if( +- not c10d.is_nccl_available(), +- "c10d was not compiled with the NCCL backend", +- ) ++ return unittest.skipUnless(c10d.is_nccl_available(), "c10d was not compiled with the NCCL backend") + + + def requires_ucc(): diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_ignore-warning-incompatible-pointer-types.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_ignore-warning-incompatible-pointer-types.patch new file mode 100644 index 00000000000..cebc1478b59 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_ignore-warning-incompatible-pointer-types.patch @@ -0,0 +1,22 @@ +Silence a warning that fails builds with GCC 14, especially in XNNPACK. 
+See https://github.com/pytorch/pytorch/pull/166873 + +Applied more broadly as we don't care about warnings anyway. + +Author: Alexander Grund (TU Dresden) + +diff --git a/CMakeLists.txt b/CMakeLists.txt +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -852,6 +852,11 @@ if(MSVC) + append_cxx_flag_if_supported("/utf-8" CMAKE_CXX_FLAGS) + endif() + ++if(CMAKE_C_COMPILER_ID STREQUAL "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL "14") ++ string(APPEND CMAKE_C_FLAGS " -Wno-incompatible-pointer-types") ++endif() ++ ++ + # Note for ROCM platform: 1. USE_ROCM is always ON until + # include(cmake/Dependencies.cmake) 2. USE_CUDA will become OFF during + # re-configuration Truth Table: CUDA 1st pass: USE_CUDA=True;USE_ROCM=True, diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_normalize_tree_output.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_normalize_tree_output.patch new file mode 100644 index 00000000000..4c708a216cb --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_normalize_tree_output.patch @@ -0,0 +1,24 @@ +Avoid failure in TestProfilerTree.test_profiler_experimental_tree_with_stack_and_modules +with diff: +> - +> + + +See https://github.com/pytorch/pytorch/pull/174768 + +Author: Alexander Grund (TU Dresden) +diff --git a/test/profiler/test_profiler_tree.py b/test/profiler/test_profiler_tree.py +index 670e639c98e..e53fd93b273 100644 +--- a/test/profiler/test_profiler_tree.py ++++ b/test/profiler/test_profiler_tree.py +@@ -240,6 +240,11 @@ class TestProfilerTree(TestCase): + # simply coerce them into a platform independent form. If you made a + # change in the codebase which changes the trace produced, simply use + # EXPECTTEST_ACCEPT=1 to update the tests to reflect the new structure. ++ def normalize(tree): ++ return re.sub(r'of pybind11\w+ object at', 'of PyCapsule object at', tree) ++ ++ actual = normalize(actual) ++ expected = normalize(expected) + + # expecttest will not show the diff view if `len(actual) < len(expected)` + if not expecttest.ACCEPT: diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_set-test-timeout.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_set-test-timeout.patch new file mode 100644 index 00000000000..6bfff62d3d1 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_set-test-timeout.patch @@ -0,0 +1,19 @@ +Some tests might hang forever and the default timeout will only be set when +a) --enable-timeout is passed, and +b) a `.additional_ci_files/test-times.json` exists at the root + +Manually set a timeout of 120min which should be enough for any single test. + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/run_test.py b/test/run_test.py +--- a/test/run_test.py ++++ b/test/run_test.py +@@ -604,6 +604,7 @@ def run_test( + if is_cpp_test + else None + ) ++ timeout = 60 * 120 + print_to_stderr(f"Executing {command} ... [{datetime.now()}]") + + with ExitStack() as stack: diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-RingFlexAttentionTest.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-RingFlexAttentionTest.patch new file mode 100644 index 00000000000..7855d55ddaf --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-RingFlexAttentionTest.patch @@ -0,0 +1,23 @@ +test_ring_flex_attention and test_ring_flex_attention_mask both fail in similar ways: + +> torch._dynamo.exc.Unsupported: Attempted to call function marked as skipped +> ... 
+> Developer debug context: module: _warnings, qualname: warn, skip reason: + +See https://github.com/pytorch/pytorch/pull/161667#issuecomment-3298676991 + & https://github.com/pytorch/pytorch/issues/162843 + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/distributed/tensor/test_attention.py b/test/distributed/tensor/test_attention.py +index a2543d443e4..a28fb45e992 100644 +--- a/test/distributed/tensor/test_attention.py ++++ b/test/distributed/tensor/test_attention.py +@@ -531,6 +531,7 @@ def generate_doc_mask_mod( + return doc_mask_mod + + ++@unittest.skip("FAILS") + class RingFlexAttentionTest(DTensorTestBase): + @property + def world_size(self) -> int: diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-flex-attention-test_block_mask_non_divisible.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-flex-attention-test_block_mask_non_divisible.patch new file mode 100644 index 00000000000..5e26591c68c --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-flex-attention-test_block_mask_non_divisible.patch @@ -0,0 +1,17 @@ +This test shows segfaults, at least on some system. +PyTorch CI HUD indicates some failures with it are known. + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/inductor/test_flex_attention.py b/test/inductor/test_flex_attention.py +index 740faa0b375..ea5e311b7cd 100644 +--- a/test/inductor/test_flex_attention.py ++++ b/test/inductor/test_flex_attention.py +@@ -3474,6 +3474,7 @@ def forward(self, arg0_1, arg1_1, arg2_1, arg3_1, arg4_1): + ) + FileCheck().check("BLOCK_M : tl.constexpr = 16").run(code[0]) + ++ @unittest.skip("Segfaults on CPU") + @supported_platform + def test_block_mask_non_divisible(self, device): + seq = torch.arange(1023, device=device) // 128 diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-flex-attention-tests-on-unsupported-cpus.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-flex-attention-tests-on-unsupported-cpus.patch new file mode 100644 index 00000000000..a6ec831fb1c --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-flex-attention-tests-on-unsupported-cpus.patch @@ -0,0 +1,97 @@ +FlexAttention is only supported on AVX2 CPUs. +However the tests are run on CPU unconditionally when CUDA devices are available leading to: +> torch._inductor.exc.InductorError: LoweringException: NotImplementedError: torch.compile on current platform is not supported for CPU. + +Add a condition to possibly only add CUDA tests. 
+See https://github.com/pytorch/pytorch/pull/174881 + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/inductor/test_flex_attention.py b/test/inductor/test_flex_attention.py +index 740faa0b375..e698939d326 100644 +--- a/test/inductor/test_flex_attention.py ++++ b/test/inductor/test_flex_attention.py +@@ -48,6 +48,9 @@ from torch.testing._internal.common_device_type import ( + dtypesIfXPU, + flex_attention_supported_platform as supported_platform, + instantiate_device_type_tests, ++ IS_FLEX_ATTENTION_CPU_PLATFORM_SUPPORTED as TEST_ON_CPU, ++ IS_FLEX_ATTENTION_CUDA_PLATFORM_SUPPORTED as TEST_ON_CUDA, ++ IS_FLEX_ATTENTION_XPU_PLATFORM_SUPPORTED as TEST_ON_XPU, + largeTensorTest, + skipCPUIf, + skipCUDAIf, +@@ -177,25 +180,21 @@ class DeviceConfig: + dtypes_fast: list[torch.dtype] + + +-TEST_ON_CUDA = ( +- torch.cuda.is_available() +- and torch.utils._triton.has_triton() +- and torch.cuda.get_device_capability() >= (8, 0) +-) +-TEST_ON_XPU = torch.xpu.is_available() and torch.utils._triton.has_triton() +- + device_configs = {} ++# Tests are skipped when no device is supported, so CPU as default is safe ++test_device = ("cpu",) + if HAS_GPU: + if TEST_ON_CUDA: +- test_device = ( +- "cuda", +- "cpu", +- ) ++ if TEST_ON_CPU: ++ test_device = ( ++ "cuda", ++ "cpu", ++ ) ++ else: ++ test_device = ("cuda",) + elif TEST_ON_XPU: + torch._C._set_onednn_allow_tf32(True) + test_device = ("xpu",) +-else: +- test_device = ("cpu",) + + + class SubstringSet: +diff --git a/torch/testing/_internal/common_device_type.py b/torch/testing/_internal/common_device_type.py +index 8971eca1bb2..6b14f9db105 100644 +--- a/torch/testing/_internal/common_device_type.py ++++ b/torch/testing/_internal/common_device_type.py +@@ -1972,23 +1972,25 @@ def get_all_device_types() -> list[str]: + + # skip since currently flex attention requires at least `avx2` support on CPU. 
+ IS_FLEX_ATTENTION_CPU_PLATFORM_SUPPORTED = ( +- not torch.xpu.is_available() +- and not torch.cuda.is_available() +- and not IS_MACOS ++ not IS_MACOS + and torch.cpu._is_avx2_supported() + and os.getenv("ATEN_CPU_CAPABILITY") != "default" + ) + IS_FLEX_ATTENTION_XPU_PLATFORM_SUPPORTED = ( + torch.xpu.is_available() and torch.utils._triton.has_triton() + ) ++IS_FLEX_ATTENTION_CUDA_PLATFORM_SUPPORTED = ( ++ torch.cuda.is_available() ++ and torch.utils._triton.has_triton() ++ and torch.cuda.get_device_capability() >= (8, 0) ++) + flex_attention_supported_platform = unittest.skipUnless( + IS_FLEX_ATTENTION_XPU_PLATFORM_SUPPORTED +- or IS_FLEX_ATTENTION_CPU_PLATFORM_SUPPORTED +- or ( +- torch.cuda.is_available() +- and torch.utils._triton.has_triton() +- and torch.cuda.get_device_capability() >= (8, 0) +- ), ++ or (IS_FLEX_ATTENTION_CPU_PLATFORM_SUPPORTED ++ and not torch.xpu.is_available() ++ and not torch.cuda.is_available() ++ ) ++ or IS_FLEX_ATTENTION_CUDA_PLATFORM_SUPPORTED, + "Requires CUDA and Triton, Intel GPU and triton, or CPU with avx2 and later", + ) + if torch.version.hip and "gfx94" in torch.cuda.get_device_properties(0).gcnArchName: diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-test_dtensor_op_db_nn_functional_multi_head_attention_forward_cpu_float32.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-test_dtensor_op_db_nn_functional_multi_head_attention_forward_cpu_float32.patch new file mode 100644 index 00000000000..3c5dd5523dc --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-test_dtensor_op_db_nn_functional_multi_head_attention_forward_cpu_float32.patch @@ -0,0 +1,12 @@ +diff --git a/test/distributed/tensor/test_dtensor_ops.py b/test/distributed/tensor/test_dtensor_ops.py +index 8c650f6b0ce..04cfa7d4cc2 100644 +--- a/test/distributed/tensor/test_dtensor_ops.py ++++ b/test/distributed/tensor/test_dtensor_ops.py +@@ -463,6 +463,7 @@ dtensor_fails = { + skip("nn.functional.feature_alpha_dropout", "without_train"), + skip("nn.functional.hinge_embedding_loss"), + skip("nn.functional.cosine_embedding_loss"), ++ skip("nn.functional.multi_head_attention_forward"), # randomness + skip("fft.hfft"), + skip("fft.hfft2"), + skip("fft.hfft2"), diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-tests-requiring-SM90.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-tests-requiring-SM90.patch new file mode 100644 index 00000000000..4dea63b7e5f --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-tests-requiring-SM90.patch @@ -0,0 +1,85 @@ +Avoid test_intra_node_comm_all_reduce failing on e.g. A100: + +> [rank1]:E1022 09:55:08.823000 3580472 torch/testing/_internal/common_distributed.py:721] RuntimeError: CUDA error: device-side assert triggered... +> [rank1]:E1022 09:55:08.823000 3580472 torch/testing/_internal/common_distributed.py:721] exiting process 1 with exit code: 10 +> ... +> :318: st_vec: block: [0,0,0], thread: [87,0,0] Assertion `false` failed. +> /pytorch-v2.7.1/torch/csrc/distributed/c10d/CUDASymmetricMemory-inl.h:318: st_vec: block: [0,0,0], thread: [88,0,0] Assertion `false` failed. + +test_fused_all_gather_scaled_matmul fails with a NCCL error due to FP8 usage and hangs forever. 
+See https://github.com/pytorch/pytorch/issues/171796 + +test_fused_scaled_matmul_reduce_scatter fails with +> RuntimeError: torch._scaled_mm is only supported on CUDA devices with compute capability >= 9.0 or 8.9, or ROCm MI300+ + + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py +index 0a0f3ee4ca2..07702566fd8 100644 +--- a/test/distributed/test_c10d_nccl.py ++++ b/test/distributed/test_c10d_nccl.py +@@ -3350,7 +3350,7 @@ class CommTest(test_c10d_common.AbstractCommTest, MultiProcessTestCase): + @runOnRocmArch(MI300_ARCH) + def test_intra_node_comm_all_reduce(self): + from torch._C._distributed_c10d import _get_intra_node_comm_usage_counter +- from torch.testing._internal.common_cuda import SM80OrLater ++ from torch.testing._internal.common_cuda import SM90OrLater + + for peer in range(self.world_size): + if peer == self.rank: +@@ -3358,8 +3358,8 @@ class CommTest(test_c10d_common.AbstractCommTest, MultiProcessTestCase): + if not torch._C._cuda_canDeviceAccessPeer(self.rank, peer): + raise SkipTest("Test requires p2p access") + +- if not SM80OrLater: +- raise SkipTest("Test requires sm>=80") ++ if not SM90OrLater: ++ raise SkipTest("Test requires sm>=90") + + store = c10d.FileStore(self.file_name, self.world_size) + os.environ["ENABLE_INTRA_NODE_COMM"] = "1" +diff --git a/test/distributed/test_symmetric_memory.py b/test/distributed/test_symmetric_memory.py +index eeeb24bec30..9d55b620840 100644 +--- a/test/distributed/test_symmetric_memory.py ++++ b/test/distributed/test_symmetric_memory.py +@@ -4,7 +4,7 @@ import itertools + import os + import random + from contextlib import nullcontext +-from unittest import skip, skipIf ++from unittest import skip, skipIf, skipUnless + + import torch + import torch.distributed as dist +@@ -22,7 +22,7 @@ from torch.distributed._symmetric_memory import ( + restride_A_for_fused_matmul_reduce_scatter, + restride_A_shard_for_fused_all_gather_matmul, + ) +-from torch.testing._internal.common_cuda import _get_torch_cuda_version, SM90OrLater ++from torch.testing._internal.common_cuda import _get_torch_cuda_version, SM90OrLater, IS_SM89 + from torch.testing._internal.common_device_type import e4m3_type + from torch.testing._internal.common_distributed import ( + MultiProcContinuousTest, +@@ -399,6 +399,10 @@ class AsyncTPTest(MultiProcContinuousTest): + + @runOnRocmArch(MI300_ARCH) + @skip_if_lt_x_gpu(2) ++ @skipIf( ++ not SM90OrLater, ++ "_fused_all_gather_scaled_matmul_fallback w/ FP8 only supports sm>=90", ++ ) + @parametrize("gather_dim", [0, 1]) + @parametrize( + "scale_mode", ["tensor-wise", "row-wise-replicated", "row-wise-sharded"] +@@ -512,6 +516,10 @@ class AsyncTPTest(MultiProcContinuousTest): + + @skipIfRocm # AsyncTP support changed _fused_scaled_matmul_reduce_scatter_fallback API, need more changes + @skip_if_lt_x_gpu(2) ++ @skipUnless( ++ SM90OrLater or IS_SM89, ++ "torch._scaled_mm (from fused_scaled_matmul_reduce_scatter) only supports sm>=90 or 8.9", ++ ) + @parametrize("scatter_dim", [0, 1]) + @parametrize("rowwise", [True, False]) + def test_fused_scaled_matmul_reduce_scatter( diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-check-cutlass.py b/easybuild/easyconfigs/p/PyTorch/PyTorch-check-cutlass.py new file mode 100755 index 00000000000..73d9951b78a --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-check-cutlass.py @@ -0,0 +1,19 @@ +#!/usr/bin/env python + +# Verify that PyTorch can load CUTLASS, required for the CUTLASS inductor backend +# Author: 
Alexander Grund (TU Dresden) + +import os +import tempfile +from torch._inductor.codegen.cuda.cutlass_utils import try_import_cutlass, config + +# Isolate from default path used +os.environ['TORCHINDUCTOR_CACHE_DIR'] = tempfile.mkdtemp(suffix='inductor_cache') +# Use empty working directory +os.chdir(tempfile.mkdtemp(suffix='cwd')) + + +if try_import_cutlass(): + print(f"CUTLASS is set up using {config.cuda.cutlass_dir}") +else: + raise RuntimeError("CUTLASS is NOT working")
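+
+# Optional follow-up check (editorial sketch, not part of the original script): when run with the
+# installed module loaded, TORCHINDUCTOR_CUTLASS_DIR should point at the copy placed under
+# extra/cutlass by postinstallcmds. The header path below is an assumption about the CUTLASS
+# source layout, so treat a mismatch as a hint rather than a hard failure of the installation.
+cutlass_dir = os.environ.get('TORCHINDUCTOR_CUTLASS_DIR')
+if cutlass_dir:
+    header = os.path.join(cutlass_dir, 'include', 'cutlass', 'cutlass.h')
+    if os.path.isfile(header):
+        print(f"TORCHINDUCTOR_CUTLASS_DIR looks valid: {header}")
+    else:
+        raise RuntimeError(f"CUTLASS headers not found under {cutlass_dir}")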