diff --git a/easybuild/easyconfigs/c/cuDNN/cuDNN-9.0.0.312-CUDA-12.4.0.eb b/easybuild/easyconfigs/c/cuDNN/cuDNN-9.0.0.312-CUDA-12.4.0.eb new file mode 100644 index 000000000000..c1f3b9f096de --- /dev/null +++ b/easybuild/easyconfigs/c/cuDNN/cuDNN-9.0.0.312-CUDA-12.4.0.eb @@ -0,0 +1,46 @@ +name = 'cuDNN' +version = '9.0.0.312' +versionsuffix = '-CUDA-%(cudaver)s' +homepage = 'https://developer.nvidia.com/cudnn' +description = """The NVIDIA CUDA Deep Neural Network library (cuDNN) is +a GPU-accelerated library of primitives for deep neural networks.""" + +toolchain = SYSTEM + +# note: cuDNN is tied to specific CUDA versions, +# see also https://docs.nvidia.com/deeplearning/cudnn/support-matrix/index.html#cudnn-cuda-hardware-versions +local_cuda_major = '12' + +source_urls = [ + 'https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-%(cudnnarch)s/' +] +sources = ['%%(namelower)s-linux-%%(cudnnarch)s-%%(version)s_cuda%s-archive.tar.xz' % local_cuda_major] +checksums = [{ + '%%(namelower)s-linux-ppc64le-%%(version)s_cuda%s-archive.tar.xz' % local_cuda_major: + 'b8ef6f249128e1985893a8787a21de35cb83ec47c6dc6fd1809061dd9a3ffb20', + '%%(namelower)s-linux-sbsa-%%(version)s_cuda%s-archive.tar.xz' % local_cuda_major: + '430fbf5b513c69e989b3a3a5a572369778ce0c214ce1259af6b935f9cab7dd54', + '%%(namelower)s-linux-x86_64-%%(version)s_cuda%s-archive.tar.xz' % local_cuda_major: + 'd3890e609d6530ee5b88ff95b60c8e6b1c1ec7fa966ec533925f20f896fcc630', +}] + +dependencies = [('CUDA', '12.4.0')] + +local_static_libs = [ + 'libcudnn_adv_static_v9.a', + 'libcudnn_cnn_static_v9.a', + 'libcudnn_engines_precompiled_static_v9.a', + 'libcudnn_engines_runtime_compiled_static_v9.a', + 'libcudnn_graph_static_v9.a', + 'libcudnn_heuristic_static_v9.a', + 'libcudnn_ops_static_v9.a', +] +sanity_check_paths = { + 'files': [ + 'include/cudnn.h', + 'lib64/libcudnn.%s' % SHLIB_EXT + ] + ['lib64/' + i for i in local_static_libs], + 'dirs': ['include', 'lib64'], +} + +moduleclass = 'numlib'
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.3.0-foss-2023b-CUDA-12.4.0.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.3.0-foss-2023b-CUDA-12.4.0.eb new file mode 100644 index 000000000000..bb1f766f4872 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.3.0-foss-2023b-CUDA-12.4.0.eb @@ -0,0 +1,225 @@ +name = 'PyTorch' +version = '2.3.0' +versionsuffix = '-CUDA-%(cudaver)s' + +homepage = 'https://pytorch.org/' +description = """Tensors and Dynamic neural networks in Python with strong GPU acceleration. +PyTorch is a deep learning framework that puts Python first.""" + +toolchain = {'name': 'foss', 'version': '2023b'} + +source_urls = [GITHUB_RELEASE] +sources = ['%(namelower)s-v%(version)s.tar.gz'] +patches = [ + 'PyTorch-1.7.0_disable-dev-shm-test.patch', + 'PyTorch-1.12.1_add-hypothesis-suppression.patch', + 'PyTorch-1.12.1_fix-test_cpp_extensions_jit.patch', + 'PyTorch-1.12.1_fix-TestTorch.test_to.patch', + 'PyTorch-1.12.1_skip-test_round_robin.patch', + 'PyTorch-1.13.1_fix-gcc-12-warning-in-fbgemm.patch', + 'PyTorch-1.13.1_fix-protobuf-dependency.patch', + 'PyTorch-1.13.1_fix-warning-in-test-cpp-api.patch', + 'PyTorch-1.13.1_skip-failing-singular-grad-test.patch', + 'PyTorch-1.13.1_skip-tests-without-fbgemm.patch', + 'PyTorch-2.0.1_avoid-test_quantization-failures.patch', + 'PyTorch-2.0.1_fix-skip-decorators.patch', + 'PyTorch-2.0.1_fix-vsx-loadu.patch', + 'PyTorch-2.0.1_skip-failing-gradtest.patch', + 'PyTorch-2.0.1_skip-test_shuffle_reproducibility.patch', + 'PyTorch-2.0.1_skip-tests-skipped-in-subprocess.patch', + 'PyTorch-2.1.0_fix-vsx-vector-shift-functions.patch', + 'PyTorch-2.1.0_increase-tolerance-functorch-test_vmapvjpvjp.patch', + 'PyTorch-2.1.0_remove-test-requiring-online-access.patch', + 'PyTorch-2.1.0_skip-diff-test-on-ppc.patch', + 'PyTorch-2.1.0_skip-dynamo-test_predispatch.patch', + 'PyTorch-2.1.0_skip-test_jvp_linalg_det_singular.patch', + 'PyTorch-2.1.2_skip-cpu_repro-test-without-vectorization.patch', + 
'PyTorch-2.1.2_workaround_dynamo_failure_without_nnpack.patch', + 'PyTorch-2.3.0_disable_test_linear_package_if_no_half_types_are_available.patch', + 'PyTorch-2.3.0_disable_DataType_dependent_test_if_tensorboard_is_not_available.patch', + 'PyTorch-2.3.0_fix-cpuinfo-bug-with-smt.patch', + 'PyTorch-2.3.0_increase-tolerance-test_jit-test_freeze_conv_relu_fusion.patch', + 'PyTorch-2.3.0_skip-test_init_from_local_shards.patch', + 'PyTorch-2.3.0_no-cuda-stubs-rpath.patch', + 'PyTorch-2.3.0_disable-gcc12-warning.patch', + 'PyTorch-2.3.0_fix-test_extension_backend-without-vectorization.patch', + 'PyTorch-2.3.0_fix-test_fine_tuning.patch', + 'PyTorch-2.3.0_disable_tests_which_need_network_download.patch', + 'PyTorch-2.3.0_avoid_caffe2_test_cpp_jit.patch', + 'PyTorch-2.3.0_fix_missing_masked_load_for_int_type.patch', + 'PyTorch-2.3.0_relax-test_unbacked_reduction.patch', + 'PyTorch-2.3.0_remove-fsspec-test.patch', + 'PyTorch-2.3.0_skip_test_var_mean_differentiable.patch', + 'PyTorch-2.3.0_skip_test_sdpa_nn_functional_scaled_dot_product_attention_cpu.patch', + 'PyTorch-2.3.0_fix-mkldnn-avx512-f32-bias.patch', + 'PyTorch-2.3.0_fix-unboxing-template-CUDA-12.4.patch', + 'PyTorch-2.6.0_show-test-duration.patch', + 'PyTorch-2.7.1_suport-64bit-BARs.patch', +] +checksums = [ + {'pytorch-v2.3.0.tar.gz': '69579513b26261bbab32e13b7efc99ad287fcf3103087f2d4fdf1adacd25316f'}, + {'PyTorch-1.7.0_disable-dev-shm-test.patch': '622cb1eaeadc06e13128a862d9946bcc1f1edd3d02b259c56a9aecc4d5406b8a'}, + {'PyTorch-1.12.1_add-hypothesis-suppression.patch': + 'e71ffb94ebe69f580fa70e0de84017058325fdff944866d6bd03463626edc32c'}, + {'PyTorch-1.12.1_fix-test_cpp_extensions_jit.patch': + '1efc9850c431d702e9117d4766277d3f88c5c8b3870997c9974971bce7f2ab83'}, + {'PyTorch-1.12.1_fix-TestTorch.test_to.patch': '75f27987c3f25c501e719bd2b1c70a029ae0ee28514a97fe447516aee02b1535'}, + {'PyTorch-1.12.1_skip-test_round_robin.patch': '63d4849b78605aa088fdff695637d9473ea60dee603a3ff7f788690d70c55349'}, + 
{'PyTorch-1.13.1_fix-gcc-12-warning-in-fbgemm.patch': + '5c7be91a6096083a0b1315efe0001537499c600f1f569953c6a2c7f4cc1d0910'}, + {'PyTorch-1.13.1_fix-protobuf-dependency.patch': + '8bd755a0cab7233a243bc65ca57c9630dfccdc9bf8c9792f0de4e07a644fcb00'}, + {'PyTorch-1.13.1_fix-warning-in-test-cpp-api.patch': + 'bdde0f2105215c95a54de64ec4b1a4520528510663174fef6d5b900eb1db3937'}, + {'PyTorch-1.13.1_skip-failing-singular-grad-test.patch': + '72688a57b2bb617665ad1a1d5e362c5111ae912c10936bb38a089c0204729f48'}, + {'PyTorch-1.13.1_skip-tests-without-fbgemm.patch': + '481e595f673baf8ae58b41697a6792b83048b0264aa79b422f48cd8c22948bb7'}, + {'PyTorch-2.0.1_avoid-test_quantization-failures.patch': + '02e3f47e4ed1d7d6077e26f1ae50073dc2b20426269930b505f4aefe5d2f33cd'}, + {'PyTorch-2.0.1_fix-skip-decorators.patch': '2039012cef45446065e1a2097839fe20bb29fe3c1dcc926c3695ebf29832e920'}, + {'PyTorch-2.0.1_fix-vsx-loadu.patch': 'a0ffa61da2d47c6acd09aaf6d4791e527d8919a6f4f1aa7ed38454cdcadb1f72'}, + {'PyTorch-2.0.1_skip-failing-gradtest.patch': '8030bdec6ba49b057ab232d19a7f1a5e542e47e2ec340653a246ec9ed59f8bc1'}, + {'PyTorch-2.0.1_skip-test_shuffle_reproducibility.patch': + '7047862abc1abaff62954da59700f36d4f39fcf83167a638183b1b7f8fec78ae'}, + {'PyTorch-2.0.1_skip-tests-skipped-in-subprocess.patch': + '166c134573a95230e39b9ea09ece3ad8072f39d370c9a88fb2a1e24f6aaac2b5'}, + {'PyTorch-2.1.0_fix-vsx-vector-shift-functions.patch': + '3793b4b878be1abe7791efcbd534774b87862cfe7dc4774ca8729b6cabb39e7e'}, + {'PyTorch-2.1.0_increase-tolerance-functorch-test_vmapvjpvjp.patch': + 'aef38adf1210d0c5455e91d7c7a9d9e5caad3ae568301e0ba9fc204309438e7b'}, + {'PyTorch-2.1.0_remove-test-requiring-online-access.patch': + '35184b8c5a1b10f79e511cc25db3b8a5585a5d58b5d1aa25dd3d250200b14fd7'}, + {'PyTorch-2.1.0_skip-diff-test-on-ppc.patch': '394157dbe565ffcbc1821cd63d05930957412156cc01e949ef3d3524176a1dda'}, + {'PyTorch-2.1.0_skip-dynamo-test_predispatch.patch': + 
'6298daf9ddaa8542850eee9ea005f28594ab65b1f87af43d8aeca1579a8c4354'}, + {'PyTorch-2.1.0_skip-test_jvp_linalg_det_singular.patch': + '5229ca88a71db7667a90ddc0b809b2c817698bd6e9c5aaabd73d3173cf9b99fe'}, + {'PyTorch-2.1.2_skip-cpu_repro-test-without-vectorization.patch': + '7ace835af60c58d9e0754a34c19d4b9a0c3a531f19e5d0eba8e2e49206eaa7eb'}, + {'PyTorch-2.1.2_workaround_dynamo_failure_without_nnpack.patch': + 'fb96eefabf394617bbb3fbd3a7a7c1aa5991b3836edc2e5d2a30e708bfe49ba1'}, + {'PyTorch-2.3.0_disable_test_linear_package_if_no_half_types_are_available.patch': + '23416f2d9d5226695ec3fbea0671e3650c655c19deefd3f0f8ddab5afa50f485'}, + {'PyTorch-2.3.0_disable_DataType_dependent_test_if_tensorboard_is_not_available.patch': + '0dcbdfde6752c3ff54c5376f521b4a742167669feb7f0f1d4e1d4d55f72b664f'}, + {'PyTorch-2.3.0_fix-cpuinfo-bug-with-smt.patch': + '29fb95d1dba070133b513de050febd328ed36905a73f1ca135dc633f16beafa4'}, + {'PyTorch-2.3.0_increase-tolerance-test_jit-test_freeze_conv_relu_fusion.patch': + '6f8eba5b546129ea975cda1a8a7098ca3245ad2b040a31a98807ee6d69cad0d4'}, + {'PyTorch-2.3.0_skip-test_init_from_local_shards.patch': + '90ed9c2870f57ee6dc032d00873a37e2217a2b92a13035ded1c25ad5306455f2'}, + {'PyTorch-2.3.0_no-cuda-stubs-rpath.patch': '7ba26824b5def7379cff02ae821a080698e6affea0da45bc846e9ecb89939cb1'}, + {'PyTorch-2.3.0_disable-gcc12-warning.patch': 'a8a624e1a2a5f4c82610173e50bd0f853e49bd5621b432f5aac689f9f6eb1514'}, + {'PyTorch-2.3.0_fix-test_extension_backend-without-vectorization.patch': + '36aa2d5ba175be17f4e996f4fb2d544fe477d4a0bd0644cd59a85063779afc8e'}, + {'PyTorch-2.3.0_fix-test_fine_tuning.patch': 'daa24801f3b2b5f76b639a14fba9a6ad84fe99ebed53401e217d02f94cfe48bf'}, + {'PyTorch-2.3.0_disable_tests_which_need_network_download.patch': + 'b7fd1a5135dfd4098cdc054182f7bf84a23ac98462a00477712182b5442da855'}, + {'PyTorch-2.3.0_avoid_caffe2_test_cpp_jit.patch': + '041adcd91d994b8c2ab57d227f081cd57e572c157117b37171e1eb8eb576f8fc'}, + 
{'PyTorch-2.3.0_fix_missing_masked_load_for_int_type.patch': + 'aa6ff764f3f7bf84372a8a257fe1b4ae6dc4b9744ad35f0f9015f2696c62a41e'}, + {'PyTorch-2.3.0_relax-test_unbacked_reduction.patch': + 'c822f084bd97b6c76bea692e3a4664e227b3aea57c80e576a841943877085b77'}, + {'PyTorch-2.3.0_remove-fsspec-test.patch': '09be192401013cd8cd66add9d6565ac3e879e004d77e61145f826b768267ff61'}, + {'PyTorch-2.3.0_skip_test_var_mean_differentiable.patch': + '9703fd0f1fca8916f6d79d83e9a7efe8e3f717362a5fdaa8f5d9da90d0c75018'}, + {'PyTorch-2.3.0_skip_test_sdpa_nn_functional_scaled_dot_product_attention_cpu.patch': + '7955f2655db3da18606574fdcbc5990be24098f49ad1db5e86ea756ea1cc506f'}, + {'PyTorch-2.3.0_fix-mkldnn-avx512-f32-bias.patch': + 'ee07d21c3ac7aeb0bd0e39507b18a417b9125284a529102929c4b5c6727c2976'}, + {'PyTorch-2.3.0_fix-unboxing-template-CUDA-12.4.patch': + '6205d8249e7edcce5756e073ab0b11a0496da34eec1a55e3d24437a530d2886b'}, + {'PyTorch-2.6.0_show-test-duration.patch': '5508f2f9619204d9f3c356dbd4000a00d58f452ab2d64ae920eb8bc8b5484d75'}, + {'PyTorch-2.7.1_suport-64bit-BARs.patch': '317c3d220aa87426d86e137a6c1a8f910adf9580ca0848371e0f6800c05dbde1'}, +] + +osdependencies = [OS_PKG_IBVERBS_DEV] + +builddependencies = [ + ('CMake', '3.27.6'), + ('hypothesis', '6.90.0'), + # For tests + ('pytest-flakefinder', '1.1.0'), + ('pytest-rerunfailures', '14.0'), + ('pytest-shard', '0.1.2'), + ('tlparse', '0.3.5'), + ('optree', '0.13.0'), + ('unittest-xml-reporting', '3.1.0'), +] + +dependencies = [ + ('CUDA', '12.4.0', '', SYSTEM), + ('cuDNN', '9.0.0.312', versionsuffix, SYSTEM), + ('magma', '2.7.2', versionsuffix), + ('NCCL', '2.20.5', versionsuffix), + # Version from .ci/docker/triton_version.txt + ('Triton', '2.3.1', versionsuffix), + ('Ninja', '1.11.1'), # Required for JIT compilation of C++ extensions + ('Python', '3.11.5'), + ('Python-bundle-PyPI', '2023.10'), + ('protobuf', '25.3'), + ('protobuf-python', '4.25.3'), + ('pybind11', '2.11.1'), + ('SciPy-bundle', '2023.11'), + ('PyYAML', '6.0.1'), 
+ ('MPFR', '4.2.1'), + ('GMP', '6.3.0'), + ('numactl', '2.0.16'), + ('FFmpeg', '6.0'), + ('Pillow', '10.2.0'), + ('expecttest', '0.2.1'), + ('networkx', '3.2.1'), + ('sympy', '1.12'), + ('Z3', '4.13.0',), +] + +buildcmd = '%(python)s setup.py build' # Run the (long) build in the build step + +excluded_tests = { + '': [ + # This test seems to take too long on NVIDIA Ampere at least. + 'distributed/test_distributed_spawn', + # Broken on CUDA 11.6/11.7: https://github.com/pytorch/pytorch/issues/75375 + 'distributions/test_constraints', + # no xdoctest + 'doctests', + # failing on broadwell + # See https://github.com/easybuilders/easybuild-easyconfigs/issues/17712 + 'test_native_mha', + # intermittent failures on various systems + # See https://github.com/easybuilders/easybuild-easyconfigs/issues/17712 + 'distributed/rpc/test_tensorpipe_agent', + # This test is expected to fail when run in their CI, but won't in our case. + # It just checks for a "CI" env variable + 'test_ci_sanity_check_fail', + # This fails consistently and is disabled upstream + # See https://github.com/pytorch/pytorch/issues/100152 and + # https://github.com/pytorch/pytorch/pull/124712 + 'test_cpp_extensions_open_device_registration', + # Test broken until 2.4: https://github.com/pytorch/pytorch/pull/124786 + 'distributed/checkpoint/test_save_load_api', + # Test broken until 2.4: https://github.com/pytorch/pytorch/issues/122184 + 'distributed/tensor/parallel/test_tp_random_state', + # Doesn't find "dist.all_reduce(" in generated code. Known failures, e.g. 
+ # https://github.com/pytorch/pytorch/issues/121195 + 'distributed/test_compute_comm_reordering', + # Long tests, tested successfully once during creation of EC + 'inductor/test_aot_inductor', # ~65min + 'distributed/fsdp/test_fsdp_state_dict', # ~202min + 'distributed/fsdp/test_fsdp_core', # ~88min + ] +} + +local_test_opts = '--continue-through-error --pipe-logs --verbose %(excluded_tests)s' +runtest = 'cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py ' + local_test_opts + +# Especially test_quantization has a few corner cases that are triggered by the random input values, +# those cannot be easily avoided, see https://github.com/pytorch/pytorch/issues/107030 +# So allow a low number of tests to fail as the tests "usually" succeed +max_failed_tests = 16 + +tests = ['PyTorch-check-cpp-extension.py'] + +moduleclass = 'ai' diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.3.0_fix-test_fine_tuning.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.3.0_fix-test_fine_tuning.patch new file mode 100644 index 000000000000..62dbfe21134d --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.3.0_fix-test_fine_tuning.patch @@ -0,0 +1,45 @@ +Fixes +> TypeError: get_state_dict() missing 1 required positional argument: 'optimizers' + +From 61d30b6e8acbd3cfb087761defa74f19f9be96bb Mon Sep 17 00:00:00 2001 +From: cdzhan +Date: Mon, 24 Jun 2024 20:02:08 +0800 +Subject: [PATCH] [easy][DCP] Fix test_fine_tuning.py for get/set_state_dict + API changes + +--- + test/distributed/checkpoint/e2e/test_fine_tuning.py | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/test/distributed/checkpoint/e2e/test_fine_tuning.py b/test/distributed/checkpoint/e2e/test_fine_tuning.py +index a93f242187709c..fd21524882c839 100644 +--- a/test/distributed/checkpoint/e2e/test_fine_tuning.py ++++ b/test/distributed/checkpoint/e2e/test_fine_tuning.py +@@ -9,7 +9,9 @@ + import torch.nn as nn + from torch.distributed._tensor import init_device_mesh + from 
torch.distributed.checkpoint.state_dict import ( ++ get_model_state_dict, + get_state_dict, ++ set_model_state_dict, + set_state_dict, + StateDictOptions, + ) +@@ -120,7 +122,7 @@ def finetune(self, pretrain_dir: str, finetune_dir: str) -> None: + # Simulate that the fine tuning restart after 3 iterations + for i in range(2): + # Load pretrain submodules checkpoint +- pretrain_state_dict, _ = get_state_dict( ++ pretrain_state_dict = get_model_state_dict( + model, + submodules={model.pretrain}, + options=StateDictOptions(keep_submodule_prefixes=False), +@@ -129,7 +131,7 @@ def finetune(self, pretrain_dir: str, finetune_dir: str) -> None: + {"model": pretrain_state_dict}, + storage_reader=dist_cp.FileSystemReader(pretrain_dir), + ) +- set_state_dict( ++ set_model_state_dict( + model, + model_state_dict={model.pretrain: pretrain_state_dict}, + options=StateDictOptions(strict=False), diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.3.0_fix-unboxing-template-CUDA-12.4.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.3.0_fix-unboxing-template-CUDA-12.4.patch new file mode 100644 index 000000000000..5c2608ae0144 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.3.0_fix-unboxing-template-CUDA-12.4.patch @@ -0,0 +1,77 @@ +From 2ac499392f5f5f2fa518ef74f0e3ab0921b87a2f Mon Sep 17 00:00:00 2001 +From: His-Wardship +Date: Thu, 4 Apr 2024 18:25:24 +0100 +Subject: [PATCH 1/3] Refactor SFINAE logic in boxing with intermediate helper + struct, fixes compilation for CUDA 12.4. 
+ +--- + aten/src/ATen/core/boxing/impl/boxing.h | 11 ++++++++++- + 1 file changed, 10 insertions(+), 1 deletion(-) + +diff --git a/aten/src/ATen/core/boxing/impl/boxing.h b/aten/src/ATen/core/boxing/impl/boxing.h +index f8055d95b8824..0749e6cd59fe1 100644 +--- a/aten/src/ATen/core/boxing/impl/boxing.h ++++ b/aten/src/ATen/core/boxing/impl/boxing.h +@@ -39,7 +39,16 @@ template <class T, class Enable = void> + struct has_ivalue_to : std::false_type {}; + + template <class T> +-struct has_ivalue_to<T, guts::void_t<decltype(std::declval<IValue>().to<T>())>> ++struct ivalue_to_helper ++{ ++ using type = decltype(std::declval<IValue>().template to<T>()); ++}; ++template <class T> ++using ivalue_to_helper_t = typename ivalue_to_helper<T>::type; ++ ++template <class T> ++struct has_ivalue_to<T, guts::void_t<ivalue_to_helper_t<T>>> ++ + : std::true_type + {}; + + +From 1af7507f410337221131142210d23504c98e38b4 Mon Sep 17 00:00:00 2001 +From: His-Wardship <139779341+His-Wardship@users.noreply.github.com> +Date: Thu, 4 Apr 2024 20:19:10 +0100 +Subject: [PATCH 2/3] remove superfluous blank line boxing.h + +--- + aten/src/ATen/core/boxing/impl/boxing.h | 1 - + 1 file changed, 1 deletion(-) + +diff --git a/aten/src/ATen/core/boxing/impl/boxing.h b/aten/src/ATen/core/boxing/impl/boxing.h +index 0749e6cd59fe1..8a88f2e656786 100644 +--- a/aten/src/ATen/core/boxing/impl/boxing.h ++++ b/aten/src/ATen/core/boxing/impl/boxing.h +@@ -48,7 +48,6 @@ using ivalue_to_helper_t = typename ivalue_to_helper<T>::type; + + template <class T> + struct has_ivalue_to<T, guts::void_t<ivalue_to_helper_t<T>>> +- + : std::true_type + {}; + + +From 25f691941ccb7fad35a8d832738ae9b2c0f88b0f Mon Sep 17 00:00:00 2001 +From: His-Wardship <139779341+His-Wardship@users.noreply.github.com> +Date: Sun, 7 Apr 2024 12:33:47 +0100 +Subject: [PATCH 3/3] update void_t to use std namespace + +--- + aten/src/ATen/core/boxing/impl/boxing.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/aten/src/ATen/core/boxing/impl/boxing.h b/aten/src/ATen/core/boxing/impl/boxing.h +index 8a88f2e656786..82fdd824ea65b 100644 +--- a/aten/src/ATen/core/boxing/impl/boxing.h ++++ b/aten/src/ATen/core/boxing/impl/boxing.h +@@
-47,7 +47,7 @@ template <class T> + using ivalue_to_helper_t = typename ivalue_to_helper<T>::type; + + template <class T> +-struct has_ivalue_to<T, guts::void_t<ivalue_to_helper_t<T>>> ++struct has_ivalue_to<T, std::void_t<ivalue_to_helper_t<T>>> + : std::true_type + {}; + diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.3.0_increase-tolerance-test_jit-test_freeze_conv_relu_fusion.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.3.0_increase-tolerance-test_jit-test_freeze_conv_relu_fusion.patch new file mode 100644 index 000000000000..99e43d05aa47 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.3.0_increase-tolerance-test_jit-test_freeze_conv_relu_fusion.patch @@ -0,0 +1,19 @@ +test_jit, test_jit_legacy, test_jit_profiling fail in test_freeze_conv_relu_fusion with: +> Greatest absolute difference: 3.053247928619385e-05 at index (1, 1, 0, 0, 0) (up to 1e-05 allowed) +> Greatest relative difference: 0.0004548609140329063 at index (3, 1, 0, 0, 0) (up to 1.3e-06 allowed) + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/jit/test_freezing.py b/test/jit/test_freezing.py +index f13c2b113b4..454df769d9c 100644 +--- a/test/jit/test_freezing.py ++++ b/test/jit/test_freezing.py +@@ -2795,7 +2795,7 @@ class TestFrozenOptimizations(JitTestCase): + else: + FileCheck().check("aten::cudnn_convolution_relu").run(frozen_mod.graph) + +- self.assertEqual(mod_eager(inp), frozen_mod(inp)) ++ self.assertEqual(mod_eager(inp), frozen_mod(inp), atol=4e-5, rtol=5e-4) + + @unittest.skipIf(not (TEST_CUDNN or TEST_WITH_ROCM), "requires CUDNN") + def test_freeze_conv_relu_fusion_not_forward(self): diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.3.0_relax-test_unbacked_reduction.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.3.0_relax-test_unbacked_reduction.patch new file mode 100644 index 000000000000..b084fdf5ecf2 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.3.0_relax-test_unbacked_reduction.patch @@ -0,0 +1,19 @@ +With our Triton version the expected failure doesn't happen anymore.
+See also https://github.com/pytorch/pytorch/issues/154217 + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/inductor/test_torchinductor_dynamic_shapes.py b/test/inductor/test_torchinductor_dynamic_shapes.py +index 11de2175fc4..68d33a9bb48 100644 +--- a/test/inductor/test_torchinductor_dynamic_shapes.py ++++ b/test/inductor/test_torchinductor_dynamic_shapes.py +@@ -367,9 +367,6 @@ class TestInductorDynamic(TestCase): + except Exception: + if not expect_fail: + raise +- else: +- if expect_fail: +- self.fail("expected to fail, but actually passed") + + @torch._dynamo.config.patch( + capture_scalar_outputs=True, capture_dynamic_output_shape_ops=True diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.3.0_remove-fsspec-test.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.3.0_remove-fsspec-test.patch new file mode 100644 index 000000000000..b207270fd1ab --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.3.0_remove-fsspec-test.patch @@ -0,0 +1,67 @@ +Use part of a revert commit to avoid failures caused by: +> storage_reader = storage_reader or DCP.FileSystemReader() +> ^^^^^^^^^^^^^^^^^^^^^^ +> TypeError: FileSystemReader.__init__() missing 1 required positional argument: 'path' + +Author: Alexander Grund (TU Dresden) + +--- a/test/distributed/checkpoint/e2e/test_e2e_save_and_load.py ++++ b/test/distributed/checkpoint/e2e/test_e2e_save_and_load.py +@@ -171,28 +171,7 @@ class TestE2ESaveAndLoad(DTensorTestBase, VerifyStateDictMixin): + def test_e2e_async(self): + self._run_e2e_test(compile=False, model_type=ModelType.FSDP, async_op=True) + +- @with_comms +- @skip_if_lt_x_gpu(4) +- @with_temp_dir +- def test_fsspec(self): +- self._run_e2e_test( +- compile=False, +- model_type=ModelType.FSDP, +- storage_reader=DCP.FsspecReader(), +- storage_writer=DCP.FsspecWriter(), +- ) +- +- def _run_e2e_test( +- self, +- compile, +- model_type, +- async_op=False, +- storage_reader=None, +- storage_writer=None, +- ): +- storage_reader = storage_reader or 
DCP.FileSystemReader() +- storage_writer = storage_writer or DCP.FileSystemWriter() +- ++ def _run_e2e_test(self, compile, model_type, async_op=False): + model, optim = self._create_model(compile, ModelType.NONE) + _train(model, optim, train_steps=2) + +@@ -207,9 +186,7 @@ class TestE2ESaveAndLoad(DTensorTestBase, VerifyStateDictMixin): + } + + if async_op: +- f = saver.async_save( +- sd, checkpoint_id=self.temp_dir, storage_writer=storage_writer +- ) ++ f = saver.async_save(sd, checkpoint_id=self.temp_dir) + t = time.monotonic() + while not f.done(): + time.sleep(1) +@@ -217,7 +194,7 @@ class TestE2ESaveAndLoad(DTensorTestBase, VerifyStateDictMixin): + + f.result() + else: +- DCP.save(sd, checkpoint_id=self.temp_dir, storage_writer=storage_writer) ++ DCP.save(sd, checkpoint_id=self.temp_dir) + + loaded_stateful_obj = TestStatefulObj() + dist_model, dist_optim = self._create_model(compile, model_type) +@@ -232,7 +209,6 @@ class TestE2ESaveAndLoad(DTensorTestBase, VerifyStateDictMixin): + "s": loaded_stateful_obj, + }, + checkpoint_id=self.temp_dir, +- storage_reader=storage_reader, + ) + + self.assertEqual(original_stateful_obj, loaded_stateful_obj) diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_suport-64bit-BARs.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_suport-64bit-BARs.patch new file mode 100644 index 000000000000..6e8cdfb2d36a --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.7.1_suport-64bit-BARs.patch @@ -0,0 +1,27 @@ +When the GPUs use 64bit BARs the RPC module fails during the initialization with: +> E RuntimeError: In getBar1SizeOfGpu at tensorpipe/channel/cuda_gdr/context_impl.cc:242 "": No such file or directory + +This causes KeyboardInterrupt errors in distributed/rpc/test_share_memory + +See https://github.com/pytorch/pytorch/issues/159354 + +Author: Alexander Grund (TU Dresden) + +diff --git a/third_party/tensorpipe/tensorpipe/channel/cuda_gdr/context_impl.cc 
b/third_party/tensorpipe/tensorpipe/channel/cuda_gdr/context_impl.cc +index 182a04a..b26751e 100644 +--- a/third_party/tensorpipe/tensorpipe/channel/cuda_gdr/context_impl.cc ++++ b/third_party/tensorpipe/tensorpipe/channel/cuda_gdr/context_impl.cc +@@ -239,6 +239,13 @@ size_t getBar1SizeOfGpu(int gpuIdx) { + + struct stat bar1Stats; + int rv = ::stat(pciPath.c_str(), &bar1Stats); ++ if (rv < 0 && errno == ENOENT) { ++ // Some GPUs use 64 bit BARs using 2 slots each, ++ // so the BAR 0 spans slots 0 & 1 and BAR 1 is at slots 2 & 3 ++ TP_VLOG(5) << "GPU #" << gpuIdx << " might has 64 bit BARs"; ++ pciPath[pciPath.size() - 1] = '2'; ++ rv = ::stat(pciPath.c_str(), &bar1Stats); ++ } + TP_THROW_SYSTEM_IF(rv < 0, errno); + + return bar1Stats.st_size;