diff --git a/easybuild/easyconfigs/c/cuDNN/cuDNN-9.5.1.17-CUDA-12.6.0.eb b/easybuild/easyconfigs/c/cuDNN/cuDNN-9.5.1.17-CUDA-12.6.0.eb new file mode 100644 index 00000000000..77463d0096b --- /dev/null +++ b/easybuild/easyconfigs/c/cuDNN/cuDNN-9.5.1.17-CUDA-12.6.0.eb @@ -0,0 +1,34 @@ +name = 'cuDNN' +version = '9.5.1.17' +versionsuffix = '-CUDA-%(cudaver)s' +homepage = 'https://developer.nvidia.com/cudnn' +description = """The NVIDIA CUDA Deep Neural Network library (cuDNN) is +a GPU-accelerated library of primitives for deep neural networks.""" + +toolchain = SYSTEM + +source_urls = [ + 'https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-%(cudnnarch)s/' +] +# note: cuDNN is tied to specific CUDA versions, +# see also https://docs.nvidia.com/deeplearning/cudnn/support-matrix/index.html#cudnn-cuda-hardware-versions +sources = ['%(namelower)s-linux-%(cudnnarch)s-%(version)s_cuda%(cudamajver)s-archive.tar.xz'] +checksums = [{ + '%(namelower)s-linux-sbsa-%(version)s_cuda%(cudamajver)s-archive.tar.xz': + '340c49b32c133b0321c5c5b00d14fb64887dcac83ee8fd24195d9191061f1ad7', + '%(namelower)s-linux-x86_64-%(version)s_cuda%(cudamajver)s-archive.tar.xz': + '35dd20b9c68324ae1288ac36f66ab1f318d2bfecfafb703a82617aa283272be4', +}] + +dependencies = [('CUDA', '12.6.0')] + +sanity_check_paths = { + 'files': [ + 'include/cudnn.h', 'lib64/libcudnn_adv_static.a', 'lib64/libcudnn_cnn_static.a', + 'lib64/libcudnn_engines_precompiled_static.a', 'lib64/libcudnn_engines_runtime_compiled_static.a', + 'lib64/libcudnn_graph_static.a', 'lib64/libcudnn_heuristic_static.a', 'lib64/libcudnn_ops_static.a', + ], + 'dirs': ['include', 'lib64'], +} + +moduleclass = 'numlib' diff --git a/easybuild/easyconfigs/c/cuSPARSELt/cuSPARSELt-0.6.3.2-CUDA-12.6.0.eb b/easybuild/easyconfigs/c/cuSPARSELt/cuSPARSELt-0.6.3.2-CUDA-12.6.0.eb new file mode 100644 index 00000000000..865a0d77716 --- /dev/null +++ b/easybuild/easyconfigs/c/cuSPARSELt/cuSPARSELt-0.6.3.2-CUDA-12.6.0.eb @@ -0,0 +1,32 @@ 
+easyblock = 'Tarball' + +name = 'cuSPARSELt' +version = '0.6.3.2' +versionsuffix = '-CUDA-%(cudaver)s' + +homepage = 'https://docs.nvidia.com/cuda/cusparselt/index.html' +description = """NVIDIA cuSPARSELt is a high-performance CUDA library dedicated to general matrix-matrix operations in +which at least one operand is a sparse matrix""" + +toolchain = SYSTEM + +local_arch = {'arm64': 'sbsa', 'aarch64': 'sbsa'}.get(ARCH, ARCH) +source_urls = ['https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-%s/' % local_arch] +sources = ['libcusparse_lt-linux-%s-%%(version)s-archive.tar.xz' % local_arch] +checksums = [{ + 'libcusparse_lt-linux-x86_64-%(version)s-archive.tar.xz': + 'a2f856e78943f5c538bdef1c9edc64a5ed30bf8bb7d5fcb615c684ffe776cc31', + 'libcusparse_lt-linux-sbsa-%(version)s-archive.tar.xz': + '3e420ddbff4eb9ac603f57c7aa8b3d5271112816e244eb55ef9f30c4eb6a04b7', +}] + +dependencies = [('CUDA', '12.6.0')] + +sanity_check_paths = { + 'files': ['include/cusparseLt.h', + 'lib/libcusparseLt.%s' % SHLIB_EXT, + 'lib/libcusparseLt_static.a'], + 'dirs': [], +} + +moduleclass = 'lib' diff --git a/easybuild/easyconfigs/n/NCCL/NCCL-2.26.2-GCCcore-13.3.0-CUDA-12.6.0.eb b/easybuild/easyconfigs/n/NCCL/NCCL-2.26.2-GCCcore-13.3.0-CUDA-12.6.0.eb new file mode 100644 index 00000000000..5a167a50e7e --- /dev/null +++ b/easybuild/easyconfigs/n/NCCL/NCCL-2.26.2-GCCcore-13.3.0-CUDA-12.6.0.eb @@ -0,0 +1,26 @@ +name = 'NCCL' +version = '2.26.2' +versionsuffix = '-CUDA-%(cudaver)s' + +homepage = 'https://developer.nvidia.com/nccl' +description = """The NVIDIA Collective Communications Library (NCCL) implements multi-GPU and multi-node collective +communication primitives that are performance optimized for NVIDIA GPUs.""" + +toolchain = {'name': 'GCCcore', 'version': '13.3.0'} + +github_account = 'NVIDIA' +source_urls = [GITHUB_SOURCE] +sources = ['v%(version)s-1.tar.gz'] +checksums = ['74c6ab40c864d79c2139508e9419de5970cb406ec85f001d5f834d5f5c0c4f3b'] + 
+builddependencies = [('binutils', '2.42')] + +dependencies = [ + ('CUDA', '12.6.0', '', SYSTEM), + ('UCX-CUDA', '1.16.0', versionsuffix), +] + +# default CUDA compute capabilities to use (override via --cuda-compute-capabilities) +cuda_compute_capabilities = ['5.0', '6.0', '7.0', '7.5', '8.0', '8.6', '9.0'] + +moduleclass = 'lib' diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.0_avoid_caffe2_test_cpp_jit.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.0_avoid_caffe2_test_cpp_jit.patch new file mode 100644 index 00000000000..f07706b8d37 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.0_avoid_caffe2_test_cpp_jit.patch @@ -0,0 +1,14 @@ +Avoid tripping on //caffe2/test/cpp/jit:test_custom_class_registrations with IS_SANDCASTLE + +Author: Alexander Grund (TU Dresden) +--- a/torch/testing/_internal/torchbind_impls.py ++++ b/torch/testing/_internal/torchbind_impls.py +@@ -116,8 +116,6 @@ def load_torchbind_test_lib(): + + if IS_MACOS: + raise unittest.SkipTest("non-portable load_library call used in test") +- elif IS_SANDCASTLE or IS_FBCODE: +- lib_file_path = Path("//caffe2/test/cpp/jit:test_custom_class_registrations") + elif IS_WINDOWS: + lib_file_path = find_library_location("torchbind_test.dll") + else: diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.0_do-not-checkout-nccl.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.0_do-not-checkout-nccl.patch new file mode 100644 index 00000000000..7b0dd830dd3 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.0_do-not-checkout-nccl.patch @@ -0,0 +1,36 @@ +Don't checkout NCCL when using system NCCL + +Author: Alexander Grund (TU Dresden) + +diff --git a/tools/build_pytorch_libs.py b/tools/build_pytorch_libs.py +index 5dd5a221975..2b8b868eaa8 100644 +--- a/tools/build_pytorch_libs.py ++++ b/tools/build_pytorch_libs.py +@@ -7,7 +7,12 @@ from glob import glob + from pathlib import Path + + from .setup_helpers.cmake import CMake, USE_NINJA +-from .setup_helpers.env import 
check_negative_env_flag, IS_64BIT, IS_WINDOWS ++from .setup_helpers.env import ( ++ check_env_flag, ++ check_negative_env_flag, ++ IS_64BIT, ++ IS_WINDOWS, ++) + + + repo_root = Path(__file__).absolute().parent.parent +@@ -119,7 +124,12 @@ def build_pytorch( + cmake: CMake, + ) -> None: + my_env = _create_build_env() +- checkout_nccl() ++ if ( ++ not check_negative_env_flag("USE_CUDA") ++ and not check_negative_env_flag("USE_NCCL") ++ and not check_env_flag("USE_SYSTEM_NCCL") ++ ): ++ checkout_nccl() + build_test = not check_negative_env_flag("BUILD_TEST") + cmake.generate( + version, cmake_python_library, build_python, build_test, my_env, rerun_cmake diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.0_fix-distributed-tests-without-gpus.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.0_fix-distributed-tests-without-gpus.patch new file mode 100644 index 00000000000..8303cdbaa6a --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.0_fix-distributed-tests-without-gpus.patch @@ -0,0 +1,34 @@ +If there are no GPUs there would be a WORLD_SIZE=0 which doesn't work. +Use a positive number for the NCCL/GLOO tests in that case. 
+ +See https://github.com/pytorch/pytorch/pull/150764 + +Author: Alexander Grund (TU Dresden) +diff --git a/test/run_test.py b/test/run_test.py +index a508d8db4d2..e7bbe6ea086 100755 +--- a/test/run_test.py ++++ b/test/run_test.py +@@ -610,18 +610,19 @@ DISTRIBUTED_TESTS_CONFIG = {} + + + if dist.is_available(): ++ num_gpus = torch.cuda.device_count() + DISTRIBUTED_TESTS_CONFIG["test"] = {"WORLD_SIZE": "1"} + if not TEST_WITH_ROCM and dist.is_mpi_available(): + DISTRIBUTED_TESTS_CONFIG["mpi"] = { + "WORLD_SIZE": "3", + } +- if dist.is_nccl_available(): ++ if dist.is_nccl_available() and num_gpus > 0: + DISTRIBUTED_TESTS_CONFIG["nccl"] = { +- "WORLD_SIZE": f"{torch.cuda.device_count()}", ++ "WORLD_SIZE": f"{num_gpus}", + } +- if dist.is_gloo_available(): ++ if dist.is_gloo_available() and num_gpus > 0: + DISTRIBUTED_TESTS_CONFIG["gloo"] = { + # TODO: retire testing gloo with CUDA +- "WORLD_SIZE": f"{torch.cuda.device_count()}", ++ "WORLD_SIZE": f"{num_gpus}", + } + # Test with UCC backend is deprecated. diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.0_fix-skip-decorators.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.0_fix-skip-decorators.patch new file mode 100644 index 00000000000..f638ac4f843 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.0_fix-skip-decorators.patch @@ -0,0 +1,112 @@ +The decorators are implemented to run when the function is called which is after +the test `setup` method spawned subprocesses which may use NCCL to sync failing when there are +not enough GPUs available. +So replace the custom code by calls to the `unittest` skip decorators. 
+See https://github.com/pytorch/pytorch/pull/109491 + +Author: Alexander Grund (TU Dresden) +diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py +index d34b1ffdb0a..8f9628f209b 100644 +--- a/torch/testing/_internal/common_distributed.py ++++ b/torch/testing/_internal/common_distributed.py +@@ -155,17 +155,7 @@ def skip_if_odd_worldsize(func): + + + def require_n_gpus_for_nccl_backend(n, backend): +- def decorator(func): +- @wraps(func) +- def wrapper(*args, **kwargs): +- if backend == "nccl" and torch.cuda.device_count() < n: +- sys.exit(TEST_SKIPS[f"multi-gpu-{n}"].exit_code) +- else: +- return func(*args, **kwargs) +- +- return wrapper +- +- return decorator ++ return unittest.skipUnless(at_least_x_gpu(n), TEST_SKIPS[f"multi-gpu-{n}"].message) if backend == "nccl" else unittest.skipIf(False, None) + + + def import_transformers_or_skip(): +@@ -197,20 +187,10 @@ def at_least_x_gpu(x): + + + def skip_if_lt_x_gpu(x): +- def decorator(func): +- @wraps(func) +- def wrapper(*args, **kwargs): +- if torch.cuda.is_available() and torch.cuda.device_count() >= x: +- return func(*args, **kwargs) +- if TEST_HPU and torch.hpu.device_count() >= x: +- return func(*args, **kwargs) +- if TEST_XPU and torch.xpu.device_count() >= x: +- return func(*args, **kwargs) +- sys.exit(TEST_SKIPS[f"multi-gpu-{x}"].exit_code) +- +- return wrapper +- +- return decorator ++ return unittest.skipUnless(torch.cuda.device_count() >= x or ( ++ TEST_HPU and torch.hpu.device_count() >= x) or ( ++ TEST_XPU and torch.xpu.device_count() >= x), ++ TEST_SKIPS[f"multi-gpu-{x}"].message) + + + # This decorator helps avoiding initializing cuda while testing other backends +diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py +index a4d6d53b975..0da1d9baddf 100644 +--- a/torch/testing/_internal/distributed/distributed_test.py ++++ 
b/torch/testing/_internal/distributed/distributed_test.py +@@ -66,7 +66,6 @@ from torch.testing._internal.common_distributed import ( + skip_if_small_worldsize, + skip_if_odd_worldsize, + skip_if_lt_x_gpu, +- nccl_skip_if_lt_x_gpu, + skip_if_no_gpu, + require_n_gpus_for_nccl_backend, + requires_nccl_version, +@@ -5299,7 +5298,7 @@ class DistributedTest: + BACKEND != "mpi" and BACKEND != "nccl" and BACKEND != "gloo", + "get_future is only supported on mpi, nccl and gloo", + ) +- @nccl_skip_if_lt_x_gpu(BACKEND, 2) ++ @require_n_gpus_for_nccl_backend(2, BACKEND) + def test_accumulate_gradients_no_sync(self): + """ + Runs _test_accumulate_gradients_no_sync using default inputs +@@ -5310,7 +5309,7 @@ class DistributedTest: + BACKEND != "mpi" and BACKEND != "nccl" and BACKEND != "gloo", + "get_future is only supported on mpi, nccl and gloo", + ) +- @nccl_skip_if_lt_x_gpu(BACKEND, 2) ++ @require_n_gpus_for_nccl_backend(2, BACKEND) + def test_accumulate_gradients_no_sync_grad_is_view(self): + """ + Runs _test_accumulate_gradients_no_sync using default inputs +@@ -5321,7 +5320,7 @@ class DistributedTest: + BACKEND != "mpi" and BACKEND != "nccl" and BACKEND != "gloo", + "get_future is only supported on mpi, nccl and gloo", + ) +- @nccl_skip_if_lt_x_gpu(BACKEND, 2) ++ @require_n_gpus_for_nccl_backend(2, BACKEND) + def test_accumulate_gradients_no_sync_allreduce_hook(self): + """ + Runs multiple iterations on _test_accumulate_gradients_no_sync +@@ -5349,7 +5348,7 @@ class DistributedTest: + BACKEND != "mpi" and BACKEND != "nccl" and BACKEND != "gloo", + "get_future is only supported on mpi, nccl and gloo", + ) +- @nccl_skip_if_lt_x_gpu(BACKEND, 2) ++ @require_n_gpus_for_nccl_backend(2, BACKEND) + def test_accumulate_gradients_no_sync_allreduce_with_then_hook(self): + """ + Runs multiple iterations on _test_accumulate_gradients_no_sync using allreduce +@@ -5383,7 +5382,7 @@ class DistributedTest: + BACKEND != "mpi" and BACKEND != "nccl" and BACKEND != "gloo", + "get_future is 
only supported on mpi, nccl and gloo", + ) +- @nccl_skip_if_lt_x_gpu(BACKEND, 2) ++ @require_n_gpus_for_nccl_backend(2, BACKEND) + def test_get_future(self): + def mult(fut): + return [t * 3 for t in fut.wait()] diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.0_fix-test_linear_with_embedding.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.0_fix-test_linear_with_embedding.patch new file mode 100644 index 00000000000..d6e02ac25a6 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.0_fix-test_linear_with_embedding.patch @@ -0,0 +1,36 @@ +TestSelectAlgorithmCPU.test_linear_with_embedding fails when the CPU does not support BF16: +> torch._inductor.exc.InductorError: LoweringException: RuntimeError: self and mat2 must have the same dtype, but got Float and BFloat16 +See https://github.com/pytorch/pytorch/issues/147104 + +Convert the embedding layer to avoid it using "Float" and adapt the check for this change. + +Author: Alexander Grund (TU Dresden) +--- a/test/inductor/test_cpu_select_algorithm.py ++++ b/test/inductor/test_cpu_select_algorithm.py +@@ -932,6 +932,7 @@ class TestSelectAlgorithm(BaseTestSelectAlgorithm): + def test_linear_with_embedding( + self, batch_size, in_features, out_features, bias, dtype + ): ++ has_bf16 = torch.ops.mkldnn._is_mkldnn_bf16_supported() + class M(torch.nn.Module): + def __init__(self, bias): + super().__init__() +@@ -939,6 +940,9 @@ class TestSelectAlgorithm(BaseTestSelectAlgorithm): + dtype=dtype + ) + self.emb = torch.nn.Embedding(64, out_features) ++ if not has_bf16: ++ self.emb = self.emb.to(dtype=dtype) ++ + + def forward(self, idx, x): + return self.emb(idx) + self.linear(x) +@@ -953,7 +957,7 @@ class TestSelectAlgorithm(BaseTestSelectAlgorithm): + with verify(dtype) as (atol, rtol): + self.common(mod, (idx, x), atol=atol, rtol=rtol) + self.assertEqual(counters["inductor"]["cpp_templated_kernel_counter"], 1) +- self.assertEqual(counters["inductor"]["cpp_epilogue_fusion_counter"], 1) ++ 
self.assertEqual(counters["inductor"]["cpp_epilogue_fusion_counter"], 1 if has_bf16 else 0) + + @inductor_config.patch({"freezing": True}) + @patches diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.0_fix-test_linear_with_in_out_buffer-without-mkl.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.0_fix-test_linear_with_in_out_buffer-without-mkl.patch new file mode 100644 index 00000000000..6067d565e20 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.0_fix-test_linear_with_in_out_buffer-without-mkl.patch @@ -0,0 +1,17 @@ +This test fails when FlexiBLAS is used instead of MKL. +Adjust the expected count. +See https://github.com/pytorch/pytorch/pull/151548 + +Author: Alexander Grund (TU Dresden) +--- a/test/inductor/test_cpu_select_algorithm.py ++++ b/test/inductor/test_cpu_select_algorithm.py +@@ -1300,7 +1304,7 @@ class TestSelectAlgorithm(BaseTestSelectAlgorithm): + rtol=rtol, + ) + self.assertEqual(counters["inductor"]["cpp_templated_kernel_counter"], 2) +- self.assertEqual(counters["inductor"]["cpp_epilogue_fusion_counter"], 2) ++ self.assertEqual(counters["inductor"]["cpp_epilogue_fusion_counter"], 2 if TEST_MKL else 1) + + @inductor_config.patch({"freezing": True}) + @patches + diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.0_skip-test_init_from_local_shards.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.0_skip-test_init_from_local_shards.patch new file mode 100644 index 00000000000..942b2758b3b --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.0_skip-test_init_from_local_shards.patch @@ -0,0 +1,24 @@ +The test often times out and seems to be considered flaky by PyTorch: +https://github.com/pytorch/pytorch/issues/78068 + +Author: Alexander Grund (TU Dresden) +diff --git a/test/distributed/_shard/sharded_tensor/test_sharded_tensor.py b/test/distributed/_shard/sharded_tensor/test_sharded_tensor.py +index 730b2c2c0ac..5f9b9545700 100644 +--- a/test/distributed/_shard/sharded_tensor/test_sharded_tensor.py ++++ 
b/test/distributed/_shard/sharded_tensor/test_sharded_tensor.py +@@ -6,6 +6,7 @@ import itertools + import math + import pickle + import sys ++from unittest import skip + + import torch + import torch.distributed as dist +@@ -2432,6 +2432,7 @@ class TestShardedTensorFromLocalShards(ShardedTensorTestBase): + @with_comms + @skip_if_lt_x_gpu(4) + @requires_nccl() ++ @skip("Times out often") + def test_init_from_local_shards(self): + local_shard_metadata = ShardMetadata( + shard_offsets=[(self.rank // 2) * 5, (self.rank % 2) * 5], diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1-foss-2024a-CUDA-12.6.0.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1-foss-2024a-CUDA-12.6.0.eb new file mode 100644 index 00000000000..9e9de33ed13 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1-foss-2024a-CUDA-12.6.0.eb @@ -0,0 +1,291 @@ +name = 'PyTorch' +version = '2.7.1' +versionsuffix = '-CUDA-%(cudaver)s' + +homepage = 'https://pytorch.org/' +description = """Tensors and Dynamic neural networks in Python with strong GPU acceleration. 
+PyTorch is a deep learning framework that puts Python first.""" + +toolchain = {'name': 'foss', 'version': '2024a'} + +local_six_version = '1.11.0' +source_urls = [GITHUB_RELEASE] +sources = [ + '%(namelower)s-v%(version)s.tar.gz', + { + # Avoid downloading this during the build, see third_party/NNPACK/cmake/DownloadSix.cmake for the version + 'filename': f'six-{local_six_version}.tar.gz', + 'source_urls': + ['https://pypi.python.org/packages/16/d8/bc6316cf98419719bd59c91742194c111b6f2e85abac88e496adefaf7afe'], + } +] +patches = [ + 'PyTorch-1.12.1_add-hypothesis-suppression.patch', + 'PyTorch-1.13.1_skip-failing-singular-grad-test.patch', + 'PyTorch-1.7.0_disable-dev-shm-test.patch', + 'PyTorch-2.0.1_avoid-test_quantization-failures.patch', + 'PyTorch-2.0.1_skip-test_shuffle_reproducibility.patch', + 'PyTorch-2.0.1_skip-tests-skipped-in-subprocess.patch', + 'PyTorch-2.1.0_remove-test-requiring-online-access.patch', + 'PyTorch-2.1.0_skip-dynamo-test_predispatch.patch', + 'PyTorch-2.1.2_workaround_dynamo_failure_without_nnpack.patch', + 'PyTorch-2.3.0_disable_test_linear_package_if_no_half_types_are_available.patch', + 'PyTorch-2.3.0_skip_test_var_mean_differentiable.patch', + 'PyTorch-2.6.0_disable_DataType_dependent_test_if_tensorboard_is_not_available.patch', + 'PyTorch-2.6.0_disable_tests_which_need_network_download.patch', + 'PyTorch-2.6.0_disable-gcc12-warnings.patch', + 'PyTorch-2.6.0_fix-accuracy-issues-in-linalg_solve.patch', + 'PyTorch-2.6.0_fix-server-in-test_control_plane.patch', + 'PyTorch-2.6.0_fix-vsx-vector-shift-functions.patch', + 'PyTorch-2.6.0_increase-tolerance-test_aotdispatch-matmul.patch', + 'PyTorch-2.6.0_increase-tolerance-test_quick-baddbmm.patch', + 'PyTorch-2.6.0_increase-tolerance-test_vmap_autograd_grad.patch', + 'PyTorch-2.6.0_show-test-duration.patch', + 'PyTorch-2.6.0_skip-diff-test-on-ppc.patch', + 'PyTorch-2.6.0_skip-test_segfault.patch', + 'PyTorch-2.6.0_skip-test-requiring-MKL.patch', + 
'PyTorch-2.7.0_avoid_caffe2_test_cpp_jit.patch', + 'PyTorch-2.7.0_do-not-checkout-nccl.patch', + 'PyTorch-2.7.0_fix-distributed-tests-without-gpus.patch', + 'PyTorch-2.7.0_fix-skip-decorators.patch', + 'PyTorch-2.7.0_fix-test_linear_with_embedding.patch', + 'PyTorch-2.7.0_fix-test_linear_with_in_out_buffer-without-mkl.patch', + 'PyTorch-2.7.0_skip-test_init_from_local_shards.patch', + 'PyTorch-2.7.1_avoid-caffe2-sandcastle-test-lib.patch', + 'PyTorch-2.7.1_do-not-rpath-stubs-dir-in-jit-code.patch', + 'PyTorch-2.7.1_exit-test_c10d_ops_nccl-with-succes-when-no-gpu.patch', + 'PyTorch-2.7.1_fix-cuda-12.6-driver-api-usage.patch', + 'PyTorch-2.7.1_fix-CUDASymmetricMemory-group.patch', + 'PyTorch-2.7.1_fix-nccl-test-env.patch', + 'PyTorch-2.7.1_fix-test_ck_blas_library_cpu.patch', + 'PyTorch-2.7.1_fix-test_fsdp_ep.patch', + 'PyTorch-2.7.1_fix-test_ir_count.patch', + 'PyTorch-2.7.1_fix-test_torchinductor_dynamic-tests.patch', + 'PyTorch-2.7.1_increase-tolerance-for-sum-reduction-test.patch', + 'PyTorch-2.7.1_increase-tolerance-for-test_freeze_conv_relu_fusion.patch', + 'PyTorch-2.7.1_init-cutlass-include-dirs.patch', + 'PyTorch-2.7.1_keep-CMAKE_PREFIX_PATH-in-test.patch', + 'PyTorch-2.7.1_remove-faulty-close.patch', + 'PyTorch-2.7.1_remove-test_close_multi_pg_unordered.patch', + 'PyTorch-2.7.1_serialize-test_host_memory_stats.patch', + 'PyTorch-2.7.1_skip-failing-max_autotune-tests.patch', + 'PyTorch-2.7.1_skip-failing-schedule-test.patch', + 'PyTorch-2.7.1_skip-NCCL-tests-without-GPUs.patch', + 'PyTorch-2.7.1_skip-test_benchmark_on_non_zero_device.patch', + 'PyTorch-2.7.1_skip-test_data_parallel_rnn.patch', + 'PyTorch-2.7.1_skip-test_gds_fails_in_ci.patch', + 'PyTorch-2.7.1_skip-test_lowering_one_shot_all_reduce.patch', + 'PyTorch-2.7.1_skip-test_mixed_mm_exhaustive_dtypes.patch', + 'PyTorch-2.7.1_skip-test_outside_linear_module_free.patch', + 'PyTorch-2.7.1_skip-test_override-without-CUDA.patch', + 'PyTorch-2.7.1_skip-TestFP8Lowering.patch', + 
'PyTorch-2.7.1_skip-tests-requiring-cc89.patch', + 'PyTorch-2.7.1_skip-tests-requiring-SM90.patch', + 'PyTorch-2.7.1_suport-64bit-BARs.patch', + 'PyTorch-2.7.1_tolerance-test_partial_flat_weights.patch', +] +checksums = [ + {'pytorch-v2.7.1.tar.gz': '5befd2e540fd55ce4782d0ca7610ce5b572d756d7ea38090ef0f3c7c428fb20f'}, + {f"six-{local_six_version}.tar.gz": "70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9"}, + {'PyTorch-1.12.1_add-hypothesis-suppression.patch': + 'e71ffb94ebe69f580fa70e0de84017058325fdff944866d6bd03463626edc32c'}, + {'PyTorch-1.13.1_skip-failing-singular-grad-test.patch': + '72688a57b2bb617665ad1a1d5e362c5111ae912c10936bb38a089c0204729f48'}, + {'PyTorch-1.7.0_disable-dev-shm-test.patch': '622cb1eaeadc06e13128a862d9946bcc1f1edd3d02b259c56a9aecc4d5406b8a'}, + {'PyTorch-2.0.1_avoid-test_quantization-failures.patch': + '02e3f47e4ed1d7d6077e26f1ae50073dc2b20426269930b505f4aefe5d2f33cd'}, + {'PyTorch-2.0.1_skip-test_shuffle_reproducibility.patch': + '7047862abc1abaff62954da59700f36d4f39fcf83167a638183b1b7f8fec78ae'}, + {'PyTorch-2.0.1_skip-tests-skipped-in-subprocess.patch': + '166c134573a95230e39b9ea09ece3ad8072f39d370c9a88fb2a1e24f6aaac2b5'}, + {'PyTorch-2.1.0_remove-test-requiring-online-access.patch': + '35184b8c5a1b10f79e511cc25db3b8a5585a5d58b5d1aa25dd3d250200b14fd7'}, + {'PyTorch-2.1.0_skip-dynamo-test_predispatch.patch': + '6298daf9ddaa8542850eee9ea005f28594ab65b1f87af43d8aeca1579a8c4354'}, + {'PyTorch-2.1.2_workaround_dynamo_failure_without_nnpack.patch': + 'fb96eefabf394617bbb3fbd3a7a7c1aa5991b3836edc2e5d2a30e708bfe49ba1'}, + {'PyTorch-2.3.0_disable_test_linear_package_if_no_half_types_are_available.patch': + '23416f2d9d5226695ec3fbea0671e3650c655c19deefd3f0f8ddab5afa50f485'}, + {'PyTorch-2.3.0_skip_test_var_mean_differentiable.patch': + '9703fd0f1fca8916f6d79d83e9a7efe8e3f717362a5fdaa8f5d9da90d0c75018'}, + {'PyTorch-2.6.0_disable_DataType_dependent_test_if_tensorboard_is_not_available.patch': + 
'74db866787f1e666ed3b35db5204f05a0ba8d989fb23057a72dd07928388dc46'}, + {'PyTorch-2.6.0_disable_tests_which_need_network_download.patch': + 'fe76129811e4eb24d0e12c397335a4c7971b0c4e48ce9cdb9169f3ef9de7aac4'}, + {'PyTorch-2.6.0_disable-gcc12-warnings.patch': '892643650788b743106ebe4e70c68be42a756eba797f0f79e31708d6e008a620'}, + {'PyTorch-2.6.0_fix-accuracy-issues-in-linalg_solve.patch': + 'a6b1cfe8f03ad5b17437e04e6a0369a25fcc79eed939ce6912ceca1c0ab0f444'}, + {'PyTorch-2.6.0_fix-server-in-test_control_plane.patch': + '1337689ff28ecaa8d1d0edf60d322bcdd7846fec040925325d357b19eb6e4342'}, + {'PyTorch-2.6.0_fix-vsx-vector-shift-functions.patch': + '82ce0b48e3b7c3dfd3a2ba915f4675d5c3a6d149646e1e0d6a29eedbbaecc8bd'}, + {'PyTorch-2.6.0_increase-tolerance-test_aotdispatch-matmul.patch': + 'c1c6ea41504e4479d258225ecefc7e9c5726934601610904ae555501a11e9109'}, + {'PyTorch-2.6.0_increase-tolerance-test_quick-baddbmm.patch': + '9850facdfb5d98451249570788217ede07466cae9ba52cd03afd3ec803ba33c9'}, + {'PyTorch-2.6.0_increase-tolerance-test_vmap_autograd_grad.patch': + '8d5eb53bb0a1456af333ae646c860033d6dd037bd9152601a200ca5c10ebf3cb'}, + {'PyTorch-2.6.0_show-test-duration.patch': '5508f2f9619204d9f3c356dbd4000a00d58f452ab2d64ae920eb8bc8b5484d75'}, + {'PyTorch-2.6.0_skip-diff-test-on-ppc.patch': '6f2f87cad1b0ab8c5a0c7b3f7fbc14e4bdfbe61da26a3934ded9dda7fe368c74'}, + {'PyTorch-2.6.0_skip-test_segfault.patch': '26806bd62e6b61b56ebaa52d68ca44c415a28124f684bd2fb373557ada68ef52'}, + {'PyTorch-2.6.0_skip-test-requiring-MKL.patch': 'f1c9b1c77b09d59317fd52d390e7d948a147325b927ad6373c1fa1d1d6ea1ea8'}, + {'PyTorch-2.7.0_avoid_caffe2_test_cpp_jit.patch': + '2f3255e067f5c6f0d78b4fbce94784c41bddf3d01bab9673856b0d0bbc4e3fec'}, + {'PyTorch-2.7.0_do-not-checkout-nccl.patch': 'ad085a15dd36768ad33a934f53dc595da745e01697b44d431f8b70ae9d0eb567'}, + {'PyTorch-2.7.0_fix-distributed-tests-without-gpus.patch': + '99d92db44f856b2fb05c221f201e50c21e57a7f6f35824f8274a380875029f24'}, + 
{'PyTorch-2.7.0_fix-skip-decorators.patch': 'a5197594f8b076f9a2d03ae3aa725018d55889b737a12b74d6872b5c1bd1e809'}, + {'PyTorch-2.7.0_fix-test_linear_with_embedding.patch': + '276b100a4a405fae6a9517cec1ca166b6f8097668f08f7e20aacf3cb766f9a2a'}, + {'PyTorch-2.7.0_fix-test_linear_with_in_out_buffer-without-mkl.patch': + '507931ad00afab098ef9df99ac32c28c61c11ca0e0ac2c55570d9b9e7dc8ef38'}, + {'PyTorch-2.7.0_skip-test_init_from_local_shards.patch': + '655e57763c6ddc3d8b52ed67aaf0f59874441a69161d10c14ab8860f9be5c332'}, + {'PyTorch-2.7.1_avoid-caffe2-sandcastle-test-lib.patch': + 'aaf22cb431357dc78e4db895d64febf1c7ee187e8ad27bd13544d011127354d4'}, + {'PyTorch-2.7.1_do-not-rpath-stubs-dir-in-jit-code.patch': + '6ab92ce23618c74a4950a6dc652d8ea1ff03c101c4f93a9186da29e136b17b1a'}, + {'PyTorch-2.7.1_exit-test_c10d_ops_nccl-with-succes-when-no-gpu.patch': + 'ad46a9167ceeafe073618588b2ca13cdef431aae732713b5dc545a93eb9cd076'}, + {'PyTorch-2.7.1_fix-cuda-12.6-driver-api-usage.patch': + '51b2e51ff8419f263f0b9f4352fb503f6f48f2950076f9596b299ff2a0121747'}, + {'PyTorch-2.7.1_fix-CUDASymmetricMemory-group.patch': + '9184b48af7b7caa77f038c911c43cd85f0daa6992f1197adb0ad27b80f5fc40a'}, + {'PyTorch-2.7.1_fix-nccl-test-env.patch': 'ddb052d217c9811aa2c96e71d52149d2e531b9dfb3b14ca4c7d87d33f54d30cd'}, + {'PyTorch-2.7.1_fix-test_ck_blas_library_cpu.patch': + '9df61b4ed2bd7f4a30df463cd2c5d4cb84d57932909b34dfa360d214425a5fee'}, + {'PyTorch-2.7.1_fix-test_fsdp_ep.patch': '9cd2da8027e440dd3069fcbd5692703dbfbb9fa9046ebcc5092669a10408b6ef'}, + {'PyTorch-2.7.1_fix-test_ir_count.patch': 'ba3dc48ee356d48ced89e2d6fceb8c8e91caccd0bda600e4ec4c3540cf434cad'}, + {'PyTorch-2.7.1_fix-test_torchinductor_dynamic-tests.patch': + '1343babbe9fb8a2cc8a12481647340e50f63beffbfa1e92ed4e5a6203f857af4'}, + {'PyTorch-2.7.1_increase-tolerance-for-sum-reduction-test.patch': + '1280259d12a4cf9fcfc22f4b92796072f9ee37b9734c77ebdcbf43d42235d15a'}, + {'PyTorch-2.7.1_increase-tolerance-for-test_freeze_conv_relu_fusion.patch': + 
'e24f7fa6f43c5ea0fab2c2b876644649948aabae0b2d239e845e20dfd607b7e6'}, + {'PyTorch-2.7.1_init-cutlass-include-dirs.patch': + '682a295519c81afb692caba66eb5e64570f938525e7ded803627884c382d509a'}, + {'PyTorch-2.7.1_keep-CMAKE_PREFIX_PATH-in-test.patch': + '516d0a9a7490999f979eb9e53ae1efd6fdea6ed5f94c9dbd659fb1e5d1fd022b'}, + {'PyTorch-2.7.1_remove-faulty-close.patch': '315fca3c582534f20da62078156c91b38637f1358cd166b4d33ba964c7b07f95'}, + {'PyTorch-2.7.1_remove-test_close_multi_pg_unordered.patch': + '65a6d430ec359b9fee5f389e05b4c4c592db1ddc12fdab550445b52f3f2a7bfe'}, + {'PyTorch-2.7.1_serialize-test_host_memory_stats.patch': + 'ed17602b0458c9d954cfe0c0d7373a2beee13f1ee8eccf3d5f8131980e319ef0'}, + {'PyTorch-2.7.1_skip-failing-max_autotune-tests.patch': + '8611605060088b0178834d34621d407c6ba03803d65e433971f458c05adf0c10'}, + {'PyTorch-2.7.1_skip-failing-schedule-test.patch': + '50151c6792d64c1e01533218b70f0ed974b934334b008e6df558ae8a9b999910'}, + {'PyTorch-2.7.1_skip-NCCL-tests-without-GPUs.patch': + '550c6976b9e3305ceb25cf2de5d135ca771c49acccd2d331c724ade8ccaecde2'}, + {'PyTorch-2.7.1_skip-test_benchmark_on_non_zero_device.patch': + '1c35b207b6fcc24fbfcdc7552fb4f0c9b77233f8a9032a7caa2b8f94d33491d0'}, + {'PyTorch-2.7.1_skip-test_data_parallel_rnn.patch': + 'aa85b678e89db4bb41d2c5f4990f0d05959be92e61918291cb5609685b7f1841'}, + {'PyTorch-2.7.1_skip-test_gds_fails_in_ci.patch': + '503030c3591196510a3c2d95db30b28a0b396adb8b50ff0d221f6bdb1f939935'}, + {'PyTorch-2.7.1_skip-test_lowering_one_shot_all_reduce.patch': + 'c5235fab6cac29adfa61238ddfa71bee18c470e7b3b58f18cc585a1dc3fbeb65'}, + {'PyTorch-2.7.1_skip-test_mixed_mm_exhaustive_dtypes.patch': + '709288abc802c9eb687c15f2677ebaf408d8325a4cb470d23cb72447ee0b8e13'}, + {'PyTorch-2.7.1_skip-test_outside_linear_module_free.patch': + '4916a256b2b9914e4fdb930681b80df93ea561ddee2fc9978c4973a5650be5e9'}, + {'PyTorch-2.7.1_skip-test_override-without-CUDA.patch': + 'a94654b9ba492be1cef0c8f266d1e16e0c5efb35816164f8c2bfdfda2dfa65f5'}, + 
{'PyTorch-2.7.1_skip-TestFP8Lowering.patch': 'a1b5d15795d1c776fa7dca9e3eb8c5335d940e6961cb7d9980d1bfe49b847391'}, + {'PyTorch-2.7.1_skip-tests-requiring-cc89.patch': + 'ac39e77339196d734837792791baa058732fb2e87180f6007ae5512028b68659'}, + {'PyTorch-2.7.1_skip-tests-requiring-SM90.patch': + '7b5891a96b58d1d404c130233ec5ddbb0ad52afdb9c334bbe4e1f27f6c78ffd8'}, + {'PyTorch-2.7.1_suport-64bit-BARs.patch': '317c3d220aa87426d86e137a6c1a8f910adf9580ca0848371e0f6800c05dbde1'}, + {'PyTorch-2.7.1_tolerance-test_partial_flat_weights.patch': + 'f304440a57e00b8052a5ffbf285adad8d0fdc5a812a659420b59a20deb5a9942'}, +] + +osdependencies = [OS_PKG_IBVERBS_DEV] + +builddependencies = [ + ('CMake', '3.29.3'), + ('hypothesis', '6.103.1'), + ('pybind11', '2.12.0'), + # For tests + ('parameterized', '0.9.0'), + ('pytest-flakefinder', '1.1.0'), + ('pytest-rerunfailures', '15.0'), + ('pytest-shard', '0.1.2'), + ('pytest-subtests', '0.13.1'), + ('tlparse', '0.3.37'), + ('optree', '0.14.1'), + ('unittest-xml-reporting', '3.1.0'), +] + +dependencies = [ + ('CUDA', '12.6.0', '', SYSTEM), + # PyTorch is very sensitive to the NCCL & cuDNN versions. 
(Maybe the same for cuSPARSELt) + # Prefer those (listed per CUDA version) in + # https://github.com/pytorch/pytorch/blob/main/.github/scripts/generate_binary_build_matrix.py + # or https://github.com/pytorch/pytorch/blob/main/.ci/docker/common/install_cuda.sh + ('NCCL', '2.26.2', versionsuffix), + ('cuDNN', '9.5.1.17', versionsuffix, SYSTEM), + ('magma', '2.9.0', versionsuffix), + ('cuSPARSELt', '0.6.3.2', versionsuffix, SYSTEM), + ('nvidia-cutlass', '3.8.0.0', versionsuffix), + # Version from .ci/docker/triton_version.txt + ('Triton', '3.3.1', versionsuffix), + ('Ninja', '1.12.1'), # Required for JIT compilation of C++ extensions + ('Python', '3.12.3'), + ('Python-bundle-PyPI', '2024.06'), + ('expecttest', '0.2.1'), + ('GMP', '6.3.0'), + ('MPFR', '4.2.1'), + ('networkx', '3.4.2'), + ('numactl', '2.0.18'), + ('Pillow', '10.4.0'), + ('protobuf-python', '5.28.0'), + ('protobuf', '28.0'), + ('PuLP', '2.8.0'), + ('PyYAML', '6.0.2'), + ('pyzstd', '0.16.2'), + ('SciPy-bundle', '2024.05'), + ('sympy', '1.13.3'), + ('Z3', '4.13.0',), +] + +buildcmd = '%(python)s setup.py build' # Run the (long) build in the build step + +excluded_tests = { + '': [ + # This test seems to take too long on NVIDIA Ampere at least. + 'distributed/test_distributed_spawn', + # no xdoctest + 'doctests', + # intermittent failures on various systems + # See https://github.com/easybuilders/easybuild-easyconfigs/issues/17712 + 'distributed/rpc/test_tensorpipe_agent', + # This test is expected to fail when run in their CI, but won't in our case. 
+ # It just checks for a "CI" env variable + 'test_ci_sanity_check_fail', + # Requires pwlf Python package + 'distributed/_tools/test_sac_ilp', 'distributed/_tools/test_sac_estimator', + # 9 failures in H100, 7 are present in PYPI package, 2 are related to GC in Python < 3.12.4 + 'dynamo/test_dynamic_shapes', + # Broken test: https://github.com/pytorch/pytorch/issues/162179 + 'distributed/_composable/fsdp/test_fully_shard_logging', + # Broken: https://github.com/pytorch/pytorch/issues/137027 + 'inductor/test_extension_backend', + # Requires optional Python packages + 'test_public_bindings', + # 1 Failure and not important + 'dynamo/test_utils', + ] +} + +local_test_opts = '--continue-through-error --pipe-logs --verbose %(excluded_tests)s' +runtest = 'cd test && PYTEST_ADDOPTS=--full-trace PYTHONUNBUFFERED=1 %(python)s run_test.py ' + local_test_opts + +# ATTENTION: NVIDIA Volta not (fully) supported anymore. +# Allow some more tests to fail, especially due to that +max_failed_tests = 60 + +tests = ['PyTorch-check-cpp-extension.py'] + +moduleclass = 'ai' diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_avoid-caffe2-sandcastle-test-lib.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_avoid-caffe2-sandcastle-test-lib.patch new file mode 100644 index 00000000000..bb3103160a7 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_avoid-caffe2-sandcastle-test-lib.patch @@ -0,0 +1,18 @@ +"//caffe2/test/inductor:custom_ops" is a FB-specific "library" which we pull in by setting IS_SANDCASTLE causing +> OSError: /caffe2/test/inductor:custom_ops: cannot open shared object file: No such file or directory +in inductor/test_aot_inductor_custom_ops.py + +Author: Alexander Grund (TU Dresden) +diff --git a/test/inductor/test_aot_inductor_custom_ops.py b/test/inductor/test_aot_inductor_custom_ops.py +index ce2ef3739d3..7b9dc4792fd 100644 +--- a/test/inductor/test_aot_inductor_custom_ops.py ++++ b/test/inductor/test_aot_inductor_custom_ops.py +@@ -380,7 
+380,7 @@ common_utils.instantiate_parametrized_tests(AOTInductorTestsTemplate) + + class AOTICustomOpTestCase(TestCase): + def setUp(self): +- if IS_SANDCASTLE or IS_FBCODE: ++ if False: + torch.ops.load_library("//caffe2/test/inductor:custom_ops") + elif IS_MACOS: + raise unittest.SkipTest("non-portable load_library call used in test") diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_do-not-rpath-stubs-dir-in-jit-code.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_do-not-rpath-stubs-dir-in-jit-code.patch new file mode 100644 index 00000000000..f03c76c397a --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_do-not-rpath-stubs-dir-in-jit-code.patch @@ -0,0 +1,30 @@ +JIT generated code fails at runtime to call any CUDA function because it rpathes our stubs library. +See https://github.com/pytorch/pytorch/pull/160179 + +Errors look like +> cutlass_library/source/tools/util/include/cutlass/util/device_memory.h:67 cutlass::device_memory::allocate: cudaMalloc failed: bytes=4096 +> terminate called after throwing an instance of 'cutlass::cuda_exception' +> what(): std::exception + + +Author: Alexander Grund (TU Dresden) + +diff --git a/torch/_inductor/codecache.py b/torch/_inductor/codecache.py +index dc9f5c25365..432b445b4ac 100644 +--- a/torch/_inductor/codecache.py ++++ b/torch/_inductor/codecache.py +@@ -2848,9 +2852,13 @@ def _cuda_lib_options() -> list[str]: + if is_linux(): + _transform_cuda_paths(lpaths) + for path in lpaths: ++ extra_ldflags.append(f"-L{path}") + # -rpath ensures the DLL can find its dependencies when loaded, even + # if the library path is non-standard. 
+- extra_ldflags.extend([f"-L{path}", "-Xlinker", f"-rpath={path}"]) ++ # But do not add the stubs folder to rpath as the driver is expected to be found at runtime ++ if os.path.basename(path) != "stubs": ++ extra_ldflags.extend(["-Xlinker", f"-rpath={path}"]) ++ + extra_ldflags.append("-lcuda") + extra_ldflags.append("-lcudart") + else: diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_exit-test_c10d_ops_nccl-with-succes-when-no-gpu.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_exit-test_c10d_ops_nccl-with-succes-when-no-gpu.patch new file mode 100644 index 00000000000..97b18fadd80 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_exit-test_c10d_ops_nccl-with-succes-when-no-gpu.patch @@ -0,0 +1,20 @@ +When the test file is run without GPUs it will exit with a non-zero code +which causes it to be considered a failure. +Exit with code 0 instead when no GPUs are available. + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/distributed/test_c10d_ops_nccl.py b/test/distributed/test_c10d_ops_nccl.py +index 73bad39956c..0d8d3ba9628 100644 +--- a/test/distributed/test_c10d_ops_nccl.py ++++ b/test/distributed/test_c10d_ops_nccl.py +@@ -982,7 +982,8 @@ class ProcessGroupNCCLOpTest(MultiProcContinousTest): + + if __name__ == "__main__": + if not torch.cuda.is_available(): +- sys.exit(TEST_SKIPS["no_cuda"].exit_code) ++ print(TEST_SKIPS["no_cuda"].message) ++ sys.exit(0) + + rank = int(os.getenv("RANK", -1)) + world_size = int(os.getenv("WORLD_SIZE", -1)) diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_fix-CUDASymmetricMemory-group.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_fix-CUDASymmetricMemory-group.patch new file mode 100644 index 00000000000..eb788b47bb2 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_fix-CUDASymmetricMemory-group.patch @@ -0,0 +1,32 @@ +Fix test test/distributed/test_c10d_nccl.py CommTest.test_intra_node_comm_all_reduce failing with +> RuntimeError: get_group_info: no 
group info associated with the group name + +From 9108d153ce49fc31c1e8d71640e19b0dcd159dcc Mon Sep 17 00:00:00 2001 +From: eqy +Date: Wed, 26 Mar 2025 03:59:43 +0000 +Subject: [PATCH] [CUDA]][SymmetricMemory] Interpret empty string as + `std::nullopt` in `rendezvous` (#149793) + +this is a "temporary" fix as current internal API requires strings at some interfaces instead of `std::optional` and empty strings are presumably used in-lieu of `nullopt`. +e.g., +https://github.com/pytorch/pytorch/blob/9d02b3993f7dae7fa3379d5190ac88291ecd4dce/torch/csrc/distributed/c10d/intra_node_comm.cu#L49 + +this currently breaks `test_intra_node_comm_all_reduce` + +Pull Request resolved: https://github.com/pytorch/pytorch/pull/149793 + +diff --git a/torch/csrc/distributed/c10d/CUDASymmetricMemory.cu b/torch/csrc/distributed/c10d/CUDASymmetricMemory.cu +index 172304479e9e..721d2c815875 100644 +--- a/torch/csrc/distributed/c10d/CUDASymmetricMemory.cu ++++ b/torch/csrc/distributed/c10d/CUDASymmetricMemory.cu +@@ -784,7 +784,9 @@ c10::intrusive_ptr CUDASymmetricMemoryAllocator::rendezvous( + // The group_name passed to rendezvous() takes precedence over + // the default group_name specified during allocation. + std::string group_name_; +- if (group_name.has_value()) { ++ // Treat empty string and std::nullopt the same as empty string seems to be ++ // implicitly used that way ++ if (group_name.has_value() && group_name != "") { + group_name_ = *group_name; + } else { + if (!block->default_group_name.has_value()) { diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_fix-cuda-12.6-driver-api-usage.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_fix-cuda-12.6-driver-api-usage.patch new file mode 100644 index 00000000000..7b99e0cce32 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_fix-cuda-12.6-driver-api-usage.patch @@ -0,0 +1,133 @@ +Avoid "RuntimeError: CUDA driver error: operation not supported" due to use of CUDA 12.6 in e.g. 
+> python distributed/test_symmetric_memory.py SymmMemSingleProcTest.test_stream_write_value32 + +Backport of https://github.com/pytorch/pytorch/commit/cf90c9f8d1632777ec5f4b6ccaa14bc5bf259e9c + +Author: Alexander Grund (TU Dresden) + +diff --git a/c10/cuda/driver_api.cpp b/c10/cuda/driver_api.cpp +index bb201b5c039..56e3ffb02ab 100644 +--- a/c10/cuda/driver_api.cpp ++++ b/c10/cuda/driver_api.cpp +@@ -2,6 +2,7 @@ + #include + #include + #include ++#include + #include + + namespace c10::cuda { +@@ -9,20 +10,13 @@ namespace c10::cuda { + namespace { + + DriverAPI create_driver_api() { +- void* handle_0 = dlopen("libcuda.so.1", RTLD_LAZY | RTLD_NOLOAD); +- TORCH_CHECK(handle_0, "Can't open libcuda.so.1: ", dlerror()); + void* handle_1 = DriverAPI::get_nvml_handle(); + DriverAPI r{}; + +-#define LOOKUP_LIBCUDA_ENTRY(name) \ +- r.name##_ = ((decltype(&name))dlsym(handle_0, #name)); \ +- TORCH_INTERNAL_ASSERT(r.name##_, "Can't find ", #name, ": ", dlerror()) ++#define LOOKUP_LIBCUDA_ENTRY(name) \ ++ r.name##_ = reinterpret_cast(get_symbol(#name)); \ ++ TORCH_INTERNAL_ASSERT(r.name##_, "Can't find ", #name) + C10_LIBCUDA_DRIVER_API(LOOKUP_LIBCUDA_ENTRY) +-#undef LOOKUP_LIBCUDA_ENTRY +- +-#define LOOKUP_LIBCUDA_ENTRY(name) \ +- r.name##_ = ((decltype(&name))dlsym(handle_0, #name)); \ +- dlerror(); + C10_LIBCUDA_DRIVER_API_12030(LOOKUP_LIBCUDA_ENTRY) + #undef LOOKUP_LIBCUDA_ENTRY + +@@ -47,6 +41,52 @@ C10_EXPORT DriverAPI* DriverAPI::get() { + return &singleton; + } + ++typedef cudaError_t (*VersionedGetEntryPoint)( ++ const char*, ++ void**, ++ unsigned int, ++ unsigned long long, // NOLINT(*) ++ cudaDriverEntryPointQueryResult*); ++typedef cudaError_t (*GetEntryPoint)( ++ const char*, ++ void**, ++ unsigned long long, // NOLINT(*) ++ cudaDriverEntryPointQueryResult*); ++ ++void* get_symbol(const char* symbol, int cuda_version) { ++ // We link to the libcudart.so already, so can search for it in the current ++ // context ++ static GetEntryPoint driver_entrypoint_fun = 
reinterpret_cast( ++ dlsym(RTLD_DEFAULT, "cudaGetDriverEntryPoint")); ++ static VersionedGetEntryPoint driver_entrypoint_versioned_fun = ++ reinterpret_cast( ++ dlsym(RTLD_DEFAULT, "cudaGetDriverEntryPointByVersion")); ++ ++ cudaDriverEntryPointQueryResult driver_result{}; ++ void* entry_point = nullptr; ++ if (driver_entrypoint_versioned_fun != nullptr) { ++ // Found versioned entrypoint function ++ cudaError_t result = driver_entrypoint_versioned_fun( ++ symbol, &entry_point, cuda_version, cudaEnableDefault, &driver_result); ++ TORCH_CHECK( ++ result == cudaSuccess, ++ "Error calling cudaGetDriverEntryPointByVersion"); ++ } else { ++ TORCH_CHECK( ++ driver_entrypoint_fun != nullptr, ++ "Error finding the CUDA Runtime-Driver interop."); ++ // Versioned entrypoint function not found ++ cudaError_t result = driver_entrypoint_fun( ++ symbol, &entry_point, cudaEnableDefault, &driver_result); ++ TORCH_CHECK(result == cudaSuccess, "Error calling cudaGetDriverEntryPoint"); ++ } ++ TORCH_CHECK( ++ driver_result == cudaDriverEntryPointSuccess, ++ "Could not find CUDA driver entry point for ", ++ symbol); ++ return entry_point; ++} ++ + } // namespace c10::cuda + + #endif +diff --git a/c10/cuda/driver_api.h b/c10/cuda/driver_api.h +index 65cbdfe878d..1a1f0108e69 100644 +--- a/c10/cuda/driver_api.h ++++ b/c10/cuda/driver_api.h +@@ -3,6 +3,12 @@ + #define NVML_NO_UNVERSIONED_FUNC_DEFS + #include + ++#if defined(CUDA_VERSION) ++#define DEFAULT_CUDA_VERSION CUDA_VERSION ++#else ++#define DEFAULT_CUDA_VERSION 11080 ++#endif ++ + #define C10_CUDA_DRIVER_CHECK(EXPR) \ + do { \ + CUresult __err = EXPR; \ +@@ -62,4 +68,7 @@ struct DriverAPI { + static void* get_nvml_handle(); + }; + ++/*! 
\brief Get pointer corresponding to symbol in CUDA driver library */ ++void* get_symbol(const char* symbol, int cuda_version = DEFAULT_CUDA_VERSION); ++ + } // namespace c10::cuda +diff --git a/torch/csrc/distributed/c10d/CUDASymmetricMemoryOps.cu b/torch/csrc/distributed/c10d/CUDASymmetricMemoryOps.cu +index 438624f4bc0..992e415db1b 100644 +--- a/torch/csrc/distributed/c10d/CUDASymmetricMemoryOps.cu ++++ b/torch/csrc/distributed/c10d/CUDASymmetricMemoryOps.cu +@@ -6,6 +6,7 @@ + + #if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED) + #include ++#include + #endif + + #if defined(CUDART_VERSION) && CUDART_VERSION >= 12030 diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_fix-nccl-test-env.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_fix-nccl-test-env.patch new file mode 100644 index 00000000000..92f5af4fecb --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_fix-nccl-test-env.patch @@ -0,0 +1,44 @@ +Avoid failures like +> Error : no algorithm/protocol available for function Broadcast with datatype ncclInt8. NCCL_ALGO was set to NVLS. + +See https://github.com/pytorch/pytorch/pull/163063 + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py +index f5a35c98d13..734afc17c69 100644 +--- a/test/distributed/test_c10d_nccl.py ++++ b/test/distributed/test_c10d_nccl.py +@@ -3163,19 +3163,24 @@ class NcclErrorHandlingTest(MultiProcessTestCase): + class NcclUserBufferRegistrationTest(MultiProcessTestCase): + def setUp(self): + super().setUp() +- # TORCH_NCCL_BLOCKING_WAIT overrides TORCH_NCCL_ASYNC_ERROR_HANDLING hence tests +- # that use TORCH_NCCL_BLOCKING_WAIT will test it as expected. 
+- os.environ["TORCH_NCCL_ASYNC_ERROR_HANDLING"] = "1" + nccl_debug_file = tempfile.NamedTemporaryFile() +- os.environ["NCCL_ALGO"] = "NVLS" +- os.environ["NCCL_DEBUG"] = "INFO" +- os.environ["NCCL_DEBUG_SUBSYS"] = "NVLS" ++ nccl_env = { ++ # TORCH_NCCL_BLOCKING_WAIT overrides TORCH_NCCL_ASYNC_ERROR_HANDLING hence tests ++ # that use TORCH_NCCL_BLOCKING_WAIT will test it as expected. ++ "TORCH_NCCL_ASYNC_ERROR_HANDLING": "1", ++ "NCCL_ALGO": "NVLS", ++ "NCCL_DEBUG": "INFO", ++ "NCCL_DEBUG_SUBSYS": "NVLS", ++ "NCCL_DEBUG_FILE": nccl_debug_file.name, ++ } + if torch.cuda.nccl.version() >= (2, 24, 3): +- os.environ["NCCL_DEBUG_SUBSYS"] = "REG" +- os.environ["NCCL_DEBUG_FILE"] = nccl_debug_file.name ++ nccl_env["NCCL_DEBUG_SUBSYS"] = "REG" ++ self.env_patcher = mock.patch.dict(os.environ, nccl_env) ++ self.env_patcher.start() + self._spawn_processes() + + def tearDown(self): ++ self.env_patcher.stop() + super().tearDown() + try: + os.remove(self.file_name) diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_fix-test_ck_blas_library_cpu.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_fix-test_ck_blas_library_cpu.patch new file mode 100644 index 00000000000..96a8cceaaae --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_fix-test_ck_blas_library_cpu.patch @@ -0,0 +1,58 @@ +From 8cdb9adc05d77bb1d65fc233b780860b893c8d17 Mon Sep 17 00:00:00 2001 +From: Yichen Yan +Date: Mon, 17 Mar 2025 17:45:45 +0000 +Subject: [PATCH] do not run `test_ck_blas_library` on cpu (#148316) + +Fix on non-rocm: + +``` +root@e01-tw-ue5g2g3sap6:~/pytorch/test# python test_linalg.py TestLinalgCPU.test_ck_blas_library_cpu +E +====================================================================== +ERROR: test_ck_blas_library_cpu (__main__.TestLinalgCPU) +---------------------------------------------------------------------- +Traceback (most recent call last): + File "/root/pytorch/torch/testing/_internal/common_utils.py", line 3108, in wrapper + method(*args, **kwargs) + 
File "/root/pytorch/torch/testing/_internal/common_device_type.py", line 480, in instantiated_test + raise rte + File "/root/pytorch/torch/testing/_internal/common_device_type.py", line 460, in instantiated_test + result = test(self, **param_kwargs) + File "/root/pytorch/torch/testing/_internal/common_device_type.py", line 1242, in dep_fn + return fn(slf, *args, **kwargs) + File "/root/pytorch/torch/testing/_internal/common_utils.py", line 1981, in _fn + fn(*args, **kwargs) + File "/root/pytorch/test/test_linalg.py", line 8621, in test_ck_blas_library + torch.backends.cuda.preferred_blas_library('ck') + File "/root/pytorch/torch/backends/cuda/__init__.py", line 258, in preferred_blas_library + torch._C._set_blas_preferred_backend(_BlasBackends[backend]) +RuntimeError: Cannot set preferred backend to Ck if PyTorch has not been compiled for ROCm. + +To execute this test, run the following from the base repo dir: + python test/test_linalg.py TestLinalgCPU.test_ck_blas_library_cpu + +This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 + +---------------------------------------------------------------------- +Ran 1 test in 0.346s + +FAILED (errors=1) +``` +Pull Request resolved: https://github.com/pytorch/pytorch/pull/148316 +Approved by: https://github.com/jeffdaily +--- + test/test_linalg.py | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/test/test_linalg.py b/test/test_linalg.py +index caa25cb13dde..0cece1a1e1a6 100644 +--- a/test/test_linalg.py ++++ b/test/test_linalg.py +@@ -9127,6 +9127,7 @@ def test_preferred_blas_library(self): + self.assertEqual(out1, out2) + self.assertEqual(out_ref, out2.cpu()) + ++ @onlyCUDA + @skipCUDAIfNotRocm + @unittest.skipIf(not blaslt_supported_device(), "blasLt not supported on current device") + @setBlasBackendsToDefaultFinally diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_fix-test_fsdp_ep.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_fix-test_fsdp_ep.patch new file mode 100644 
index 00000000000..4f3461fd289 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_fix-test_fsdp_ep.patch @@ -0,0 +1,38 @@ +From 9e07673deb212c87b1c6fea23799a97474c476ed Mon Sep 17 00:00:00 2001 +From: Kanya-Mo <167922169+Kanya-Mo@users.noreply.github.com> +Date: Fri, 8 Aug 2025 22:36:42 +0000 +Subject: [PATCH] Fix test_fsdp_ep.py due to _MeshEnv API change (#158695) + +#132339 changed parent/child mesh related APIs from _MeshEnv. UT TestFSDPWithEP.test_e2e still uses old APIs and will fail: +``` +File "/home/kanya/pytorch/test/distributed/checkpoint/e2e/test_fsdp_ep.py", line 77, in test_e2e + mesh_fsdp_ep = _mesh_resources.create_child_mesh(mesh_fsdp_tp, ("dp",)) +AttributeError: '_MeshEnv' object has no attribute 'create_child_mesh' + +To execute this test, run the following from the base repo dir: + python test/distributed/checkpoint/e2e/test_fsdp_ep.py TestFSDPWithEP.test_e2e + +This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0. Did you mean: 'create_sub_mesh'? +``` + +Pull Request resolved: https://github.com/pytorch/pytorch/pull/158695 +Approved by: https://github.com/Skylion007, https://github.com/nWEIdia +--- + test/distributed/checkpoint/e2e/test_fsdp_ep.py | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/test/distributed/checkpoint/e2e/test_fsdp_ep.py b/test/distributed/checkpoint/e2e/test_fsdp_ep.py +index 7489317035b9..51d4b3e99537 100644 +--- a/test/distributed/checkpoint/e2e/test_fsdp_ep.py ++++ b/test/distributed/checkpoint/e2e/test_fsdp_ep.py +@@ -73,8 +73,8 @@ def test_e2e(self): + self.device_type, (2, 4), mesh_dim_names=("dp", "tp") + ) + # TODO: we are using an internal API atm. Change to a publich API once it is ready. 
+- mesh_fsdp_ep = _mesh_resources.create_child_mesh(mesh_fsdp_tp, ("dp",)) +- del _mesh_resources.child_to_parent_mapping[mesh_fsdp_ep] ++ mesh_fsdp_ep = _mesh_resources.create_sub_mesh(mesh_fsdp_tp, ("dp",), [(0,)]) ++ del _mesh_resources.child_to_root_mapping[mesh_fsdp_ep] + + mesh_fsdp = init_device_mesh(self.device_type, (8,)) + for i, l in enumerate(model.second.ep_layers): diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_fix-test_ir_count.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_fix-test_ir_count.patch new file mode 100644 index 00000000000..b595cc05902 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_fix-test_ir_count.patch @@ -0,0 +1,26 @@ +From acd0873d3b3378420fd81dbf68b31f503219e524 Mon Sep 17 00:00:00 2001 +From: Nikita Shulga +Date: Fri, 23 May 2025 13:04:47 -0700 +Subject: [PATCH] [CI] Fix `TestDynamoTimed.test_ir_count` for 3.12 (#154268) + +Python-3.12 emits the same bytecode as 3.13 for code in question +Pull Request resolved: https://github.com/pytorch/pytorch/pull/154268 +Approved by: https://github.com/clee2000, https://github.com/atalman +ghstack dependencies: #154237 +--- + test/dynamo/test_utils.py | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/test/dynamo/test_utils.py b/test/dynamo/test_utils.py +index 595e9dc02fd3..b20713d9ecf4 100644 +--- a/test/dynamo/test_utils.py ++++ b/test/dynamo/test_utils.py +@@ -481,7 +481,7 @@ def test_ir_count(self): + (3, 9): (10, 6), + (3, 10): (10, 6), + (3, 11): (10, 6), +- (3, 12): (10, 6), ++ (3, 12): (11, 7), + (3, 13): (11, 7), + }[version] + diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_fix-test_torchinductor_dynamic-tests.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_fix-test_torchinductor_dynamic-tests.patch new file mode 100644 index 00000000000..2733bad2729 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_fix-test_torchinductor_dynamic-tests.patch @@ -0,0 +1,93 @@ +Fix failures in 
test_torchinductor_dynamic_shapes which disappear when running the test individually. +> RuntimeError: Tried to register an operator (test::foo(Tensor x) -> (Tensor, Tensor)) with the same name and overload name multiple times. Each overload's schema should only be registered with a single call to def(). Duplicate registration: registered at /dev/null:203. Original registration: registered at /dev/null:488 + +Last one doesn't only fail on ROCM with: +> AssertionError: expected to fail, but actually passed + + +See https://github.com/pytorch/pytorch/issues/154216 + +Author: Alexander Grund (TU Dresden) + +--- a/test/inductor/test_torchinductor_dynamic_shapes.py ++++ b/test/inductor/test_torchinductor_dynamic_shapes.py +@@ -367,7 +367,9 @@ class TestInductorDynamic(TestCase): + @torch._dynamo.config.patch(capture_scalar_outputs=True) + @torch._inductor.config.patch(implicit_fallbacks=True) + def test_item_to_inputs_kernel_nobreak(self, device): +- @torch.library.custom_op("test::foo", mutates_args=()) ++ @torch.library.custom_op( ++ "test_item_to_inputs_kernel_nobreak::foo", mutates_args=() ++ ) + def foo(x: torch.Tensor, y: int) -> torch.Tensor: + return x.clone() + +@@ -378,7 +380,7 @@ class TestInductorDynamic(TestCase): + @torch.compile(fullgraph=True) + def f(x, r): + y = x.item() +- return torch.ops.test.foo(r, y) ++ return torch.ops.test_item_to_inputs_kernel_nobreak.foo(r, y) + + f(torch.tensor([3], device=device), torch.randn(10, device=device)) + +@@ -440,11 +442,13 @@ class TestInductorDynamic(TestCase): + ) + @torch._inductor.config.patch(implicit_fallbacks=True) + def test_unbacked_save_for_backwards(self, device) -> None: +- @torch.library.custom_op("_test::_cat", mutates_args=()) ++ @torch.library.custom_op( ++ "test_unbacked_save_for_backwards::_cat", mutates_args=() ++ ) + def _cat(t: torch.Tensor, ds: list[int]) -> torch.Tensor: + return t * t.new_ones([sum(ds)]) + +- @torch.library.register_fake("_test::_cat") ++ 
@torch.library.register_fake("test_unbacked_save_for_backwards::_cat") + def _cat_fake(t: torch.Tensor, ds: list[int]) -> torch.Tensor: + [torch._check_is_size(d) for d in ds] + return t.new_empty([sum(ds)]) +@@ -456,13 +460,13 @@ class TestInductorDynamic(TestCase): + return grad.sum(), None + + torch.library.register_autograd( +- "_test::_cat", ++ "test_unbacked_save_for_backwards::_cat", + _cat_backward, + setup_context=_cat_setup_context, + ) + + def fn(t, sizes): +- r = torch.ops._test._cat(t, sizes.tolist()) ++ r = torch.ops.test_unbacked_save_for_backwards._cat(t, sizes.tolist()) + return r * t + + t = torch.randn((), requires_grad=True, device=device) +@@ -476,6 +476,7 @@ class TestInductorDynamic(TestCase): + ).sum().backward() + self.assertEqual(t.grad, expect) + ++ @unittest.skip("Fails on CPU") + @torch._dynamo.config.patch(capture_scalar_outputs=True) + def test_unbacked_reduction(self, device): + expect_fail = ( +@@ -591,7 +595,9 @@ class TestInductorDynamic(TestCase): + ) + @torch._inductor.config.patch(implicit_fallbacks=True) + def test_multi_output_unbacked_custom_op(self, device): +- @torch.library.custom_op("test::foo", mutates_args=()) ++ @torch.library.custom_op( ++ "test_multi_output_unbacked_custom_op::foo", mutates_args=() ++ ) + def foo(x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + return torch.empty(2, device=x.device), torch.empty(3, device=x.device) + +@@ -603,7 +609,7 @@ class TestInductorDynamic(TestCase): + + @torch.compile(fullgraph=True) + def f(x): +- a, b = torch.ops.test.foo(x) ++ a, b = torch.ops.test_multi_output_unbacked_custom_op.foo(x) + return a.sum() + b.sum() + + f(torch.tensor([3], device=device)) diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_increase-tolerance-for-sum-reduction-test.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_increase-tolerance-for-sum-reduction-test.patch new file mode 100644 index 00000000000..431c34aadb7 --- /dev/null +++ 
b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_increase-tolerance-for-sum-reduction-test.patch @@ -0,0 +1,53 @@ +Fix accuracy issues with at least A100 GPUs for sum reduction tests +See https://github.com/pytorch/pytorch/issues/164249 + +> FAIL [1.381s]: test_reduction_fns_name_sum_float16 (__main__.CooperativeReductionTests.test_reduction_fns_name_sum_float16) +> Greatest absolute difference: 0.125 at index (0,) (up to 1e-05 allowed) +> Greatest relative difference: 0.0017375946044921875 at index (0,) (up to 0.001 allowed) + +FAIL [1.290s]: test_reduction_fns_name_sum_float32 (__main__.CooperativeReductionTests.test_reduction_fns_name_sum_float32) +> Greatest absolute difference: 0.000213623046875 at index (0,) (up to 1e-05 allowed) +> Greatest relative difference: 2.9593741146527464e-06 at index (0,) (up to 1.3e-06 allowed) + + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/inductor/test_cooperative_reductions.py b/test/inductor/test_cooperative_reductions.py +index 469ceec2e1b..07adc0e7e7e 100644 +--- a/test/inductor/test_cooperative_reductions.py ++++ b/test/inductor/test_cooperative_reductions.py +@@ -57,11 +57,11 @@ class CooperativeReductionTests(TestCase): + torch._inductor.metrics.generated_kernel_count = 0 + torch._dynamo.reset() + +- def run_and_check(self, fn, args, *, expect_kernel_count=1): ++ def run_and_check(self, fn, args, *, expect_kernel_count=1, atol=None, rtol=None): + expected = fn(*args) + fn = torch.compile(fn, fullgraph=True) + result, (source_code,) = run_and_get_code(fn, *args) +- self.assertEqual(result, expected) ++ self.assertEqual(result, expected, atol=atol, rtol=rtol) + if "@triton_heuristics.fixed_config" in source_code: + self.assertIn("cooperative_reduction_grid", source_code) + else: +@@ -91,13 +91,19 @@ class CooperativeReductionTests(TestCase): + def test_reduction_fns(self, name, dtype): + if IS_SM89 and dtype == torch.float64 and name in ["std", "var_mean"]: + raise unittest.SkipTest("Timeouts on SM89") ++ if 
name == "sum" and dtype == torch.float16: ++ tol_args = {"atol": 0.125, "rtol": 1.8e-3} ++ elif name == "sum" and dtype == torch.float32: ++ tol_args = {"atol": 2.2e-4, "rtol": 3e-6} ++ else: ++ tol_args = {} + + def fn(x, y): + return reduction_fn(x + y, dim=-1) + + reduction_fn = getattr(torch, name) + args = [torch.randn(1, 1024**2, device="cuda", dtype=dtype) for _ in range(2)] +- self.run_and_check(fn, args) ++ self.run_and_check(fn, args, **tol_args) + + def test_bool_reduction_fns(self): + def fn(x, y): diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_increase-tolerance-for-test_freeze_conv_relu_fusion.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_increase-tolerance-for-test_freeze_conv_relu_fusion.patch new file mode 100644 index 00000000000..aee2499875f --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_increase-tolerance-for-test_freeze_conv_relu_fusion.patch @@ -0,0 +1,23 @@ +Avoid failures in test_jit, test_jit_legacy, test_jit_profiling + +> Mismatched elements: 7 / 30 (23.3%) +> Greatest absolute difference: 3.053247928619385e-05 at index (1, 1, 0, 0, 0) (up to 1e-05 allowed) +> Greatest relative difference: 0.0004548609140329063 at index (3, 1, 0, 0, 0) (up to 1.3e-06 allowed) + +See https://github.com/pytorch/pytorch/issues/164249 + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/jit/test_freezing.py b/test/jit/test_freezing.py +index 7da41f0cc71..e1b03524743 100644 +--- a/test/jit/test_freezing.py ++++ b/test/jit/test_freezing.py +@@ -3032,7 +3032,7 @@ class TestFrozenOptimizations(JitTestCase): + frozen_mod.graph + ) + +- self.assertEqual(mod_eager(inp), frozen_mod(inp)) ++ self.assertEqual(mod_eager(inp), frozen_mod(inp), atol=3.1e-5, rtol=5e-4) + + @unittest.skipIf(not (TEST_CUDNN or TEST_WITH_ROCM), "requires CUDNN") + def test_freeze_conv_relu_fusion_not_forward(self): diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_init-cutlass-include-dirs.patch 
b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_init-cutlass-include-dirs.patch new file mode 100644 index 00000000000..09482734247 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_init-cutlass-include-dirs.patch @@ -0,0 +1,24 @@ +Init cutlass when getting include directories. +Otherwise config.cuda.cutlass_dir will be empty. + +Author: Alexander Grund (TU Dresden) + +diff --git a/torch/_inductor/codecache.py b/torch/_inductor/codecache.py +index dc9f5c25365..17895251eb5 100644 +--- a/torch/_inductor/codecache.py ++++ b/torch/_inductor/codecache.py +@@ -2824,10 +2824,14 @@ def _cuda_compiler() -> Optional[str]: + + + def _cutlass_include_paths() -> list[str]: ++ from torch._inductor.codegen.cuda.cutlass_utils import try_import_cutlass + if config.is_fbcode(): + from libfb.py import parutil + + cutlass_path = parutil.get_dir_path("cutlass-3-headers") ++ elif not try_import_cutlass(): ++ log.warning("CUTLASS not available, not adding include paths") ++ return [] + else: + cutlass_path = config.cuda.cutlass_dir + return [ diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_keep-CMAKE_PREFIX_PATH-in-test.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_keep-CMAKE_PREFIX_PATH-in-test.patch new file mode 100644 index 00000000000..2234c5d9e76 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_keep-CMAKE_PREFIX_PATH-in-test.patch @@ -0,0 +1,27 @@ +> pytorch-v2.7.1/test/inductor/test_aot_inductor_package.py", line 242, in test_compile_after_package +> self.assertTrue(so_path.exists()) +> AssertionError: False is not true + +Caused by: +``` +/software/binutils/2.42-GCCcore-13.3.0/bin/ld: cannot find -labsl::utility: No such file or directory +/software/binutils/2.42-GCCcore-13.3.0/bin/ld: cannot find -labsl::variant: No such file or directory +collect2: error: ld returned 1 exit status +``` +See https://github.com/pytorch/pytorch/pull/161907 + +Author: Alexander Grund (TU Dresden) + +diff --git 
a/test/inductor/test_aot_inductor_package.py b/test/inductor/test_aot_inductor_package.py +index 28e01a40e9d..f281f2de938 100644 +--- a/test/inductor/test_aot_inductor_package.py ++++ b/test/inductor/test_aot_inductor_package.py +@@ -229,7 +229,7 @@ class TestAOTInductorPackage(TestCase): + # Create a build directory to run cmake + build_path.mkdir() + custom_env = os.environ.copy() +- custom_env["CMAKE_PREFIX_PATH"] = str(Path(torch.__file__).parent) ++ custom_env["CMAKE_PREFIX_PATH"] = ":".join([str(Path(torch.__file__).parent)] + os.environ.get("CMAKE_PREFIX_PATH", "").split(":")) + subprocess.run( + ["cmake", ".."], + cwd=build_path, diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_remove-faulty-close.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_remove-faulty-close.patch new file mode 100644 index 00000000000..c0a89a46d9a --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_remove-faulty-close.patch @@ -0,0 +1,29 @@ +Avoid closing random file handles in Inductor + +The `close` call closes random file handles. +In some tests this seems to close "fd=1", i.e. stdout. +Sebsequent writes/print then fails with +> OSError: [Errno 9] Bad file descriptor + +See https://github.com/pytorch/pytorch/pull/169065 + +Author: Alexander Grund (TU Dresden) + +diff --git a/torch/_inductor/autotune_process.py b/torch/_inductor/autotune_process.py +--- a/torch/_inductor/autotune_process.py ++++ b/torch/_inductor/autotune_process.py +@@ -926,14 +926,6 @@ class CppBenchmarkRequest(CPUDeviceBenchmarkMixin, BenchmarkRequest): + *self.extra_args, + ) + +- def cleanup_run_fn(self) -> None: +- if self.DLL is not None: +- """ +- Check close attr due to it crash on Windows. 
+- """ +- if hasattr(self.DLL, "close"): +- self.DLL.close() +- + def __str__(self) -> str: + return f"{self.kernel_name=}" + diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_remove-test_close_multi_pg_unordered.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_remove-test_close_multi_pg_unordered.patch new file mode 100644 index 00000000000..a664ff7d4ce --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_remove-test_close_multi_pg_unordered.patch @@ -0,0 +1,47 @@ +Remove a test that fails upstream too. + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py +index f5a35c98d13..c8db144b234 100644 +--- a/test/distributed/test_c10d_nccl.py ++++ b/test/distributed/test_c10d_nccl.py +@@ -758,38 +758,6 @@ class ProcessGroupNCCLGroupTest(MultiProcessTestCase): + with self.assertRaises(dist.DistBackendError): + pg.allreduce([t]) + +- @requires_nccl() +- @skip_but_pass_in_sandcastle_if( +- torch.cuda.device_count() < 2, "NCCL test requires 2+ GPUs" +- ) +- def test_close_multi_pg_unordered(self): +- store = c10d.FileStore(self.file_name, self.world_size) +- pg = self._create_process_group_nccl(store, self.opts()) +- device = self.rank_to_GPU[self.rank][0] +- t = torch.rand(10, 10, device=device) +- # First allreduce to initialize default PG's communicator. 
+- pg.allreduce(t).wait() +- new_pg1 = c10d.new_group([0, 1]) +- new_pg2 = c10d.new_group([0, 1]) +- if self.rank == 0 or self.rank == 1: +- t1 = torch.rand(10, 10, device=device) +- t2 = torch.rand(10, 10, device=device) +- new_pg1.allreduce(t1).wait() +- new_pg2.allreduce(t2).wait() +- if self.rank == 0: +- dist.destroy_process_group(new_pg2) +- # force destruction of pg2 first +- del new_pg2 +- dist.destroy_process_group(new_pg1) +- del new_pg1 +- if self.rank == 1: +- c10d.destroy_process_group(new_pg1) +- # force destruction of pg1 first +- del new_pg1 +- dist.destroy_process_group(new_pg2) +- del new_pg2 +- dist.destroy_process_group() +- + @requires_nccl() + @skip_but_pass_in_sandcastle_if( + torch.cuda.device_count() < 2, "NCCL test requires 2+ GPUs" diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_serialize-test_host_memory_stats.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_serialize-test_host_memory_stats.patch new file mode 100644 index 00000000000..259033e0d64 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_serialize-test_host_memory_stats.patch @@ -0,0 +1,30 @@ +Fix test_cuda.py TestCuda.test_host_memory_stats +> AssertionError: Scalars are not equal! +> Expected 50333384 but got 0. 
+ + +From 7abca8cebac9e399151af771233ee2f5d202c5e6 Mon Sep 17 00:00:00 2001 +From: eqy +Date: Thu, 1 May 2025 00:53:15 +0000 +Subject: [PATCH] Decorate `test_host_memory_stats` with `@serialTest` + (#152454) + +Seems to need it as it is expecting only its allocation behavior to be visible, to address #152422 +Pull Request resolved: https://github.com/pytorch/pytorch/pull/152454 +Approved by: https://github.com/Skylion007 +--- + test/test_cuda.py | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/test/test_cuda.py b/test/test_cuda.py +index 93a10072d832..c74f099358f3 100644 +--- a/test/test_cuda.py ++++ b/test/test_cuda.py +@@ -165,6 +165,7 @@ def test_pinned_memory_with_cudaregister_multithread(self): + for thread in threads: + thread.join() + ++ @serialTest + def test_host_memory_stats(self): + # Helper functions + def empty_stats(): diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-NCCL-tests-without-GPUs.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-NCCL-tests-without-GPUs.patch new file mode 100644 index 00000000000..f658ecc7b89 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-NCCL-tests-without-GPUs.patch @@ -0,0 +1,34 @@ +Some tests that require NCCL also use GPUs. Skip those tests when none are available. 
+ +Author: Alexander Grund (TU Dresden) + +diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py +index 2a8fc04265c..f62678656d0 100644 +--- a/torch/testing/_internal/common_distributed.py ++++ b/torch/testing/_internal/common_distributed.py +@@ -43,6 +43,7 @@ from torch.testing._internal.common_utils import ( + TEST_WITH_TSAN, + TestCase, + run_tests, ++ TEST_CUDA, + TEST_HPU, + TEST_XPU, + ) +@@ -327,6 +328,8 @@ def requires_gloo(): + + + def requires_nccl_version(version, msg): ++ if not TEST_CUDA: ++ return skip_but_pass_in_sandcastle(TEST_SKIPS["no_cuda"].message) + if not c10d.is_nccl_available(): + return skip_but_pass_in_sandcastle( + "c10d was not compiled with the NCCL backend", +@@ -339,6 +342,8 @@ def requires_nccl_version(version, msg): + + + def requires_nccl(): ++ if not TEST_CUDA: ++ return skip_but_pass_in_sandcastle(TEST_SKIPS["no_cuda"].message) + return skip_but_pass_in_sandcastle_if( + not c10d.is_nccl_available(), + "c10d was not compiled with the NCCL backend", diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-TestFP8Lowering.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-TestFP8Lowering.patch new file mode 100644 index 00000000000..b13c657caf4 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-TestFP8Lowering.patch @@ -0,0 +1,20 @@ +The test fails also with the official PyPI package: +> torch/_inductor/select_algorithm.py:1869] [0/0] AssertionError: Input shapes should have M >= 16, N >= 16 and K >= 32 +> ... 
+> torch._inductor.exc.InductorError: CompilationError: at 56:18: + + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/inductor/test_fp8.py b/test/inductor/test_fp8.py +--- a/test/inductor/test_fp8.py ++++ b/test/inductor/test_fp8.py +@@ -411,6 +411,7 @@ class TestFP8Types(TestCase): + ) + + ++@unittest.skip("Fails on H100s") + @instantiate_parametrized_tests + class TestFP8Lowering(TestCase): + @unittest.skipIf(TEST_WITH_ROCM, "FP8 is not supported on ROCM") +a \ No newline at end of file diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-failing-max_autotune-tests.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-failing-max_autotune-tests.patch new file mode 100644 index 00000000000..d17ceaa6c71 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-failing-max_autotune-tests.patch @@ -0,0 +1,60 @@ +Skip ~17 testcases in inductor/test_max_autotune.py which fail on H100 GPUs. +See https://github.com/pytorch/pytorch/issues/160305 + +> torch._inductor.exc.InductorError: TypeError: only integer tensors of a single element can be converted to an index +OR +> Mismatched elements: 41585 / 41664 (99.8%) +> Greatest absolute difference: 155.375 at index (9, 206) (up to 0.01 allowed) +> Greatest relative difference: 1913.0 at index (42, 58) (up to 0.01 allowed) + +Tests are generated and names look like +> test_max_autotune_addmm_persistent_tma_a_transposed_False_b_transposed_False_dynamic_True +> test_max_autotune_addmm_persistent_tma_a_transposed_False_b_transposed_True_dynamic_True + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/inductor/test_max_autotune.py b/test/inductor/test_max_autotune.py +index 741353fdbf5..49656bd2062 100644 +--- a/test/inductor/test_max_autotune.py ++++ b/test/inductor/test_max_autotune.py +@@ -26,7 +26,7 @@ from torch._inductor.select_algorithm import ( + AlgorithmSelectorCache, + TritonTemplateCaller, + ) +-from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_FP8 
++from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_FP8, SM90OrLater + from torch.testing._internal.common_utils import ( + instantiate_parametrized_tests, + IS_WINDOWS, +@@ -221,6 +221,7 @@ class TestMaxAutotune(TestCase): + with config.patch({"max_autotune": True, "autotune_in_subproc": True}): + torch.compile(mm, dynamic=dynamic)(a, b) + ++ @unittest.skipIf(SM90OrLater, "Fails on H100+") + @unittest.skipIf( + not has_triton_tma_device(), "Need device-side TMA support in Triton" + ) +@@ -394,6 +395,7 @@ class TestMaxAutotune(TestCase): + Y = addmm(x, a, b) + torch.testing.assert_close(Y_compiled, Y, atol=1e-2, rtol=1e-2) + ++ @unittest.skipIf(SM90OrLater, "Fails on H100+") + @unittest.skipIf( + not has_triton_tma_device(), "Need device-side TMA support in Triton" + ) +@@ -999,6 +1001,7 @@ class TestMaxAutotune(TestCase): + act = f(x, y) + torch.testing.assert_close(act, ref, atol=2e-2, rtol=1e-2) + ++ @unittest.skipIf(SM90OrLater, "Fails on H100+") + def test_non_contiguous_input_addmm(self): + b = torch.randn((768), dtype=torch.bfloat16, device=GPU_TYPE) + x = rand_strided( +@@ -1372,6 +1375,7 @@ class TestPrologueFusion(TestCase): + .run(code[0]) + ) + ++ @unittest.skip("Fails in various setups, see issue 154228") + @unittest.skipIf(TEST_WITH_ROCM, "FP8 is not supported on ROCM") + @unittest.skipIf( + not PLATFORM_SUPPORTS_FP8, diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-failing-schedule-test.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-failing-schedule-test.patch new file mode 100644 index 00000000000..be51dea09cc --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-failing-schedule-test.patch @@ -0,0 +1,21 @@ + +test_schedule_with_native_zero_bubble_ScheduleClass0 fails upstream: +https://github.com/pytorch/pytorch/issues/156088 + +Disable just this single test + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/distributed/pipelining/test_schedule_multiproc.py 
b/test/distributed/pipelining/test_schedule_multiproc.py +index 8491881f7fe..cbe63df2a77 100644 +--- a/test/distributed/pipelining/test_schedule_multiproc.py ++++ b/test/distributed/pipelining/test_schedule_multiproc.py +@@ -519,7 +519,7 @@ class ScheduleTest(MultiProcContinousTest): + + @requires_nccl() + @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs") +- @parametrize("ScheduleClass", [ScheduleWithW, ScheduleInterleavedZeroBubble]) ++ @parametrize("ScheduleClass", [ScheduleInterleavedZeroBubble]) + def test_schedule_with_native_zero_bubble(self, ScheduleClass): + print(ScheduleClass) + if ScheduleClass is ScheduleInterleavedZeroBubble: diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_benchmark_on_non_zero_device.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_benchmark_on_non_zero_device.patch new file mode 100644 index 00000000000..74029fdabc1 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_benchmark_on_non_zero_device.patch @@ -0,0 +1,21 @@ +inductor/test_benchmark_fusion.py BenchmarkingTest.test_benchmark_on_non_zero_device fails with +> self.assertTrue(hit_count > 0) +> AssertionError: False is not true + +Related: https://github.com/pytorch/pytorch/issues/160514 + +Author: Alexander Grund (TU Dresden) + +--- a/test/inductor/test_benchmark_fusion.py ++++ b/test/inductor/test_benchmark_fusion.py +@@ -190,9 +190,7 @@ if HAS_CUDA: + copy_tests(BenchmarkFusionTestTemplate, BenchmarkFusionCudaTest, "cuda") + + class BenchmarkingTest(TestCase): +- @unittest.skipIf( +- torch.cuda.device_count() < 2, "The test need at least 2 devices" +- ) ++ @unittest.skip("Mocking fails") + def test_benchmark_on_non_zero_device(self): + hit_count = 0 + with torch.cuda.device("cuda:0"): diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_data_parallel_rnn.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_data_parallel_rnn.patch new file mode 100644 index 
00000000000..5b81095e931 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_data_parallel_rnn.patch @@ -0,0 +1,28 @@ +Failing upstream too: https://github.com/pytorch/pytorch/issues/162745 +> /PyTorch/2.7.1/foss-2024a-CUDA-12.6.0/pytorch-v2.7.1/test/distributed/test_data_parallel.py", line 99, in test_data_parallel_rnn +> self.assertTrue(p1.allclose(p2)) +> AssertionError: False is not true + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/distributed/test_data_parallel.py b/test/distributed/test_data_parallel.py +index 26f64df90d9..c25cc6673c3 100644 +--- a/test/distributed/test_data_parallel.py ++++ b/test/distributed/test_data_parallel.py +@@ -6,6 +6,7 @@ import io + from collections import OrderedDict + from copy import deepcopy + from itertools import product ++import unittest + + import torch + import torch.nn.functional as F +@@ -63,7 +64,7 @@ class TestDataParallel(TestCase): + + gradcheck(fn, (m.t_rg,)) + +- @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "multi-GPU not supported") ++ @unittest.skip("Fails") + def test_data_parallel_rnn(self): + class TestModule(torch.nn.Module): + def __init__(self) -> None: diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_gds_fails_in_ci.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_gds_fails_in_ci.patch new file mode 100644 index 00000000000..bb10b104456 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_gds_fails_in_ci.patch @@ -0,0 +1,16 @@ +Skip a test meant for CI only. 
+ +Author: Alexander Grund (TU Dresden) + +diff --git a/test/test_cuda.py b/test/test_cuda.py +index 3726c377970..78b5e8c8af9 100644 +--- a/test/test_cuda.py ++++ b/test/test_cuda.py +@@ -3633,6 +3633,7 @@ print(f"{{r1}}, {{r2}}") + x = torch.cuda.device_count() + self.assertEqual(f"{x}, 1", r) + ++ @unittest.skip("Not applicable") + def test_gds_fails_in_ci(self): + if IS_WINDOWS or TEST_WITH_ROCM: + error_msg = "is not supported on this platform" diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_lowering_one_shot_all_reduce.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_lowering_one_shot_all_reduce.patch new file mode 100644 index 00000000000..214615bb589 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_lowering_one_shot_all_reduce.patch @@ -0,0 +1,26 @@ +Test fails also with PYPI version +> AssertionError: 'one_shot_all_reduce' not found in '# AOT ID[...] + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/distributed/test_symmetric_memory.py b/test/distributed/test_symmetric_memory.py +index 34b8ed5a7b1..bf0fbe0e7f2 100644 +--- a/test/distributed/test_symmetric_memory.py ++++ b/test/distributed/test_symmetric_memory.py +@@ -1,7 +1,7 @@ + # Owner(s): ["module: c10d"] + + import os +-from unittest import skipIf ++from unittest import skipIf, skip + + import torch + import torch.distributed as dist +@@ -981,6 +981,7 @@ class LoweringTest(MultiProcessTestCase): + + torch._inductor.config._collective.auto_select = True + ++ @skip("Fails with PyPI too") + @skipIfRocm # requires registered-buffer support + @skip_if_lt_x_gpu(2) + @fresh_inductor_cache() diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_mixed_mm_exhaustive_dtypes.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_mixed_mm_exhaustive_dtypes.patch new file mode 100644 index 00000000000..e745a728208 --- /dev/null +++ 
b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_mixed_mm_exhaustive_dtypes.patch @@ -0,0 +1,14 @@ +Test fails upstream too, see https://github.com/pytorch/pytorch/issues/147853 +> RuntimeError: Expected to find ".to(" but did not find it + +Author: Alexander Grund (TU Dresden) +--- a/test/inductor/test_pattern_matcher.py ++++ b/test/inductor/test_pattern_matcher.py +@@ -389,6 +389,7 @@ class TestPatternMatcher(TestCase): + } + ) + @unittest.skipIf(not IS_BIG_GPU, "templates require big gpu") ++ @unittest.skip("Fails") + def test_mixed_mm_exhaustive_dtypes(self): + def fn(a, b): + return torch.mm(a, b.to(a.dtype)) diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_outside_linear_module_free.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_outside_linear_module_free.patch new file mode 100644 index 00000000000..79bdea43a4d --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_outside_linear_module_free.patch @@ -0,0 +1,26 @@ +Test failing with PYPI package too: +> self.assertTrue(cleared) +> AssertionError: False is not true + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/dynamo/test_misc.py b/test/dynamo/test_misc.py +index 7541bd3b9d8..d0cb310bec6 100644 +--- a/test/dynamo/test_misc.py ++++ b/test/dynamo/test_misc.py +@@ -10992,6 +10992,7 @@ fn + lambda mod: mod, + ) + ++ @unittest.skip("Unreliable") + def test_outside_linear_module_free(self): + # Compared to test_linear_module_free, the linear + # layer is not the code object that is directly compiled. 
+@@ -11026,6 +11026,7 @@ fn + gc.collect() + self.assertTrue(cleared) + ++ @unittest.skip("Unreliable") + def test_parameter_free(self): + def model_inp_ctr(): + param = torch.nn.Parameter(torch.randn(100, 100)) diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_override-without-CUDA.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_override-without-CUDA.patch new file mode 100644 index 00000000000..4ec8ac594b7 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-test_override-without-CUDA.patch @@ -0,0 +1,33 @@ +This test fails during creation of the tests at startup: +> File "/var/lib/jenkins/workspace/test/test_overrides.py", line 683, in _simple_type_parser +> return torch.Stream() +> RuntimeError: CUDA error: CUDA driver version is insufficient for CUDA runtime version + +See https://github.com/pytorch/pytorch/pull/166625 + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/test_overrides.py b/test/test_overrides.py +--- a/test/test_overrides.py ++++ b/test/test_overrides.py +@@ -10,9 +10,8 @@ import pickle + import collections + import unittest + import contextlib +-import os + +-from torch.testing._internal.common_utils import TestCase, run_tests, TEST_WITH_CROSSREF, TEST_WITH_TORCHDYNAMO ++from torch.testing._internal.common_utils import TestCase, run_tests, TEST_CUDA, TEST_WITH_CROSSREF, TEST_WITH_TORCHDYNAMO + from torch.overrides import ( + handle_torch_function, + has_torch_function, +@@ -31,8 +30,7 @@ from torch.utils._pytree import tree_map + + Tensor = torch.Tensor + +-if os.getenv("ATEN_CPU_CAPABILITY") in ("default", "avx2"): +- # This test is not supported on ARM ++if not TEST_CUDA: + print( + "Skipping due to failing when cuda build runs on non cuda machine, " + + "see https://github.com/pytorch/pytorch/pull/150059 for example" diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-tests-requiring-SM90.patch 
b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-tests-requiring-SM90.patch new file mode 100644 index 00000000000..ee60c76ddbc --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-tests-requiring-SM90.patch @@ -0,0 +1,34 @@ +Avoid it failing on e.g. A100: + +> [rank1]:E1022 09:55:08.823000 3580472 torch/testing/_internal/common_distributed.py:721] RuntimeError: CUDA error: device-side assert triggered... +> [rank1]:E1022 09:55:08.823000 3580472 torch/testing/_internal/common_distributed.py:721] exiting process 1 with exit code: 10 +> ... +> :318: st_vec: block: [0,0,0], thread: [87,0,0] Assertion `false` failed. +> /pytorch-v2.7.1/torch/csrc/distributed/c10d/CUDASymmetricMemory-inl.h:318: st_vec: block: [0,0,0], thread: [88,0,0] Assertion `false` failed. + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py +index 7410255d27a..603ea0b375b 100644 +--- a/test/distributed/test_c10d_nccl.py ++++ b/test/distributed/test_c10d_nccl.py +@@ -3367,7 +3367,7 @@ class CommTest(test_c10d_common.AbstractCommTest, MultiProcessTestCase): + @skip_if_rocm_multiprocess + def test_intra_node_comm_all_reduce(self): + from torch._C._distributed_c10d import _get_intra_node_comm_usage_counter +- from torch.testing._internal.common_cuda import SM80OrLater ++ from torch.testing._internal.common_cuda import SM90OrLater + + for peer in range(self.world_size): + if peer == self.rank: +@@ -3375,8 +3375,8 @@ class CommTest(test_c10d_common.AbstractCommTest, MultiProcessTestCase): + if not torch._C._cuda_canDeviceAccessPeer(self.rank, peer): + raise SkipTest("Test requires p2p access") + +- if not SM80OrLater: +- raise SkipTest("Test requires sm>=80") ++ if not SM90OrLater: ++ raise SkipTest("Test requires sm>=90") + + store = c10d.FileStore(self.file_name, self.world_size) + os.environ["ENABLE_INTRA_NODE_COMM"] = "1" diff --git 
a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-tests-requiring-cc89.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-tests-requiring-cc89.patch new file mode 100644 index 00000000000..de560c007da --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_skip-tests-requiring-cc89.patch @@ -0,0 +1,43 @@ +Avoid this error in a function called by those tests: +> RuntimeError: torch._scaled_mm is only supported on CUDA devices with compute capability >= 9.0 or 8.9, or ROCm MI300+ + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/distributed/test_symmetric_memory.py b/test/distributed/test_symmetric_memory.py +index 34b8ed5a7b1..04d3bb7a959 100644 +--- a/test/distributed/test_symmetric_memory.py ++++ b/test/distributed/test_symmetric_memory.py +@@ -1,7 +1,7 @@ + # Owner(s): ["module: c10d"] + + import os +-from unittest import skipIf, skip ++from unittest import skipIf, skip, skipUnless + + import torch + import torch.distributed as dist +@@ -19,7 +19,7 @@ from torch.distributed._symmetric_memory import ( + restride_A_for_fused_matmul_reduce_scatter, + restride_A_shard_for_fused_all_gather_matmul, + ) +-from torch.testing._internal.common_cuda import _get_torch_cuda_version, SM90OrLater ++from torch.testing._internal.common_cuda import _get_torch_cuda_version, SM89OrLater, SM90OrLater + from torch.testing._internal.common_distributed import ( + MultiProcessTestCase, + requires_multicast_support, +@@ -458,6 +458,7 @@ class SymmetricMemoryTest(MultiProcessTestCase): + + @skipIfRocm + @skip_if_lt_x_gpu(2) ++ @skipUnless(SM89OrLater, "compute capability >= 8.9") + @parametrize("gather_dim", [0, 1]) + @parametrize( + "scale_mode", ["tensor-wise", "row-wise-replicated", "row-wise-sharded"] +@@ -576,6 +577,7 @@ class SymmetricMemoryTest(MultiProcessTestCase): + + @skipIfRocm + @skip_if_lt_x_gpu(2) ++ @skipUnless(SM89OrLater, "compute capability >= 8.9") + @parametrize("scatter_dim", [0, 1]) + @parametrize("rowwise", [True, False]) + def 
test_fused_scaled_matmul_reduce_scatter( diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_suport-64bit-BARs.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_suport-64bit-BARs.patch new file mode 100644 index 00000000000..6e8cdfb2d36 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_suport-64bit-BARs.patch @@ -0,0 +1,27 @@ +When the GPUs use 64bit BARs the RPC module fails during the initialization with: +> E RuntimeError: In getBar1SizeOfGpu at tensorpipe/channel/cuda_gdr/context_impl.cc:242 "": No such file or directory + +This causes KeyboardInterrupt errors in distributed/rpc/test_share_memory + +See https://github.com/pytorch/pytorch/issues/159354 + +Author: Alexander Grund (TU Dresden) + +diff --git a/third_party/tensorpipe/tensorpipe/channel/cuda_gdr/context_impl.cc b/third_party/tensorpipe/tensorpipe/channel/cuda_gdr/context_impl.cc +index 182a04a..b26751e 100644 +--- a/third_party/tensorpipe/tensorpipe/channel/cuda_gdr/context_impl.cc ++++ b/third_party/tensorpipe/tensorpipe/channel/cuda_gdr/context_impl.cc +@@ -239,6 +239,13 @@ size_t getBar1SizeOfGpu(int gpuIdx) { + + struct stat bar1Stats; + int rv = ::stat(pciPath.c_str(), &bar1Stats); ++ if (rv < 0 && errno == ENOENT) { ++ // Some GPUs use 64 bit BARs using 2 slots each, ++ // so the BAR 0 spans slots 0 & 1 and BAR 1 is at slots 2 & 3 ++ TP_VLOG(5) << "GPU #" << gpuIdx << " might has 64 bit BARs"; ++ pciPath[pciPath.size() - 1] = '2'; ++ rv = ::stat(pciPath.c_str(), &bar1Stats); ++ } + TP_THROW_SYSTEM_IF(rv < 0, errno); + + return bar1Stats.st_size; diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_tolerance-test_partial_flat_weights.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_tolerance-test_partial_flat_weights.patch new file mode 100644 index 00000000000..c58d35aacaf --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_tolerance-test_partial_flat_weights.patch @@ -0,0 +1,23 @@ +Avoid failures in test_nn.py test_partial_flat_weights + +> 
Mismatched elements: 9 / 36 (25.0%) +> Greatest absolute difference: 3.013014793395996e-05 at index (2, 0, 4) (up to 1e-05 allowed) +> Greatest relative difference: 0.0030790010932832956 at index (2, 0, 4) (up to 1.3e-06 allowed) + +See https://github.com/pytorch/pytorch/issues/163072 + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/test_nn.py b/test/test_nn.py +index 30609247cb1..02a2d3a7f3a 100644 +--- a/test/test_nn.py ++++ b/test/test_nn.py +@@ -4299,7 +4299,7 @@ tensor(..., device='meta', size=(1,), requires_grad=True)""") + inp = inp.cuda() + # otherwise, subsequent warnings will be hidden, and further tests rely on them + warnings.simplefilter("always") +- self.assertEqual(m(inp)[0].cpu(), out_expected[0]) ++ self.assertEqual(m(inp)[0].cpu(), out_expected[0], atol=3.1e-5, rtol=3.1e-3) + + @unittest.skipIf(not TEST_CUDNN, "needs cudnn") + @set_default_dtype(torch.double) diff --git a/easybuild/easyconfigs/p/pyzstd/pyzstd-0.16.2-GCCcore-13.3.0.eb b/easybuild/easyconfigs/p/pyzstd/pyzstd-0.16.2-GCCcore-13.3.0.eb new file mode 100644 index 00000000000..bee4110b7b5 --- /dev/null +++ b/easybuild/easyconfigs/p/pyzstd/pyzstd-0.16.2-GCCcore-13.3.0.eb @@ -0,0 +1,29 @@ +easyblock = 'PythonPackage' + +name = 'pyzstd' +version = '0.16.2' + +homepage = 'https://github.com/Rogdham/pyzstd' +description = """Pyzstd module provides classes and functions for compressing and decompressing data, +using Facebook's Zstandard (or zstd as short name) algorithm. + +The API style is similar to Python's bz2/lzma/zlib modules.""" + +toolchain = {'name': 'GCCcore', 'version': '13.3.0'} + +sources = [SOURCE_TAR_GZ] +checksums = ['179c1a2ea1565abf09c5f2fd72f9ce7c54b2764cf7369e05c0bfd8f1f67f63d2'] + +builddependencies = [ + ('binutils', '2.42'), +] + +dependencies = [ + ('Python', '3.12.3'), + ('zstd', '1.5.6'), +] + +# Use preinstalled zstd library +buildopts = installopts = '--config-settings="--build-option=--dynamic-link-zstd"' + +moduleclass = 'tools'