easybuilders · Flamefire · Dec 18, 2025 · Dec 19, 2025 · Dec 19, 2025 · Dec 19, 2025
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2025b-CUDA-12.9.1.eb
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-DDPCommHookType-python-3.13.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-DDPCommHookType-python-3.13.patch
@@ -0,0 +1,120 @@
+Testing on Python 3.13 fails with
+> AttributeError: 'functools.partial' object has no attribute 'value'
+
+Fix using https://github.com/pytorch/pytorch/pull/163939
+
+diff --git a/torch/distributed/algorithms/ddp_comm_hooks/__init__.py b/torch/distributed/algorithms/ddp_comm_hooks/__init__.py
+--- a/torch/distributed/algorithms/ddp_comm_hooks/__init__.py
++++ b/torch/distributed/algorithms/ddp_comm_hooks/__init__.py
+@@ -1,7 +1,21 @@
+ # mypy: allow-untyped-defs
++import sys
+ from enum import Enum
+ from functools import partial
+
++
++# To suppress FutureWarning from partial since 3.13
++if sys.version_info >= (3, 13):
++    from enum import member
++
++    def _enum_member(x):
++        return member(x)
++else:
++
++    def _enum_member(x):
++        return x
++
++
+ import torch.distributed as dist
+
+ from . import (
+@@ -51,45 +65,61 @@ class DDPCommHookType(Enum):
+     ``DDPCommHookType.ALLREDUCE.value(model=model, state=process_group)``.
+     """
+
+-    ALLREDUCE = partial(_ddp_comm_hook_wrapper, comm_hook=default.allreduce_hook)
+-    FP16_COMPRESS = partial(
+-        _ddp_comm_hook_wrapper, comm_hook=default.fp16_compress_hook
++    ALLREDUCE = _enum_member(
++        partial(_ddp_comm_hook_wrapper, comm_hook=default.allreduce_hook)
++    )
++    FP16_COMPRESS = _enum_member(
++        partial(_ddp_comm_hook_wrapper, comm_hook=default.fp16_compress_hook)
+     )
+-    BF16_COMPRESS = partial(
+-        _ddp_comm_hook_wrapper, comm_hook=default.bf16_compress_hook
++    BF16_COMPRESS = _enum_member(
++        partial(_ddp_comm_hook_wrapper, comm_hook=default.bf16_compress_hook)
+     )
+-    QUANTIZE_PER_TENSOR = partial(
+-        _ddp_comm_hook_wrapper, comm_hook=quantization.quantization_pertensor_hook
++    QUANTIZE_PER_TENSOR = _enum_member(
++        partial(
++            _ddp_comm_hook_wrapper, comm_hook=quantization.quantization_pertensor_hook
++        )
+     )
+-    QUANTIZE_PER_CHANNEL = partial(
+-        _ddp_comm_hook_wrapper, comm_hook=quantization.quantization_perchannel_hook
++    QUANTIZE_PER_CHANNEL = _enum_member(
++        partial(
++            _ddp_comm_hook_wrapper, comm_hook=quantization.quantization_perchannel_hook
++        )
+     )
+-    POWER_SGD = partial(
+-        _powerSGD_comm_hook_wrapper,
+-        comm_hook=powerSGD.powerSGD_hook,
+-        matrix_approximation_rank=1,
++    POWER_SGD = _enum_member(
++        partial(
++            _powerSGD_comm_hook_wrapper,
++            comm_hook=powerSGD.powerSGD_hook,
++            matrix_approximation_rank=1,
++        )
+     )
+     # Rank-2 PowerSGD can give a higher accuracy than the default rank-1 version,
+     # but it runs slower and consumes more memory.
+-    POWER_SGD_RANK2 = partial(
+-        _powerSGD_comm_hook_wrapper,
+-        comm_hook=powerSGD.powerSGD_hook,
+-        matrix_approximation_rank=2,
++    POWER_SGD_RANK2 = _enum_member(
++        partial(
++            _powerSGD_comm_hook_wrapper,
++            comm_hook=powerSGD.powerSGD_hook,
++            matrix_approximation_rank=2,
++        )
+     )
+     # Batching can lead to a faster training at the cost of accuracy.
+-    BATCHED_POWER_SGD = partial(
+-        _powerSGD_comm_hook_wrapper,
+-        comm_hook=powerSGD.batched_powerSGD_hook,
+-        matrix_approximation_rank=1,
++    BATCHED_POWER_SGD = _enum_member(
++        partial(
++            _powerSGD_comm_hook_wrapper,
++            comm_hook=powerSGD.batched_powerSGD_hook,
++            matrix_approximation_rank=1,
++        )
+     )
+-    BATCHED_POWER_SGD_RANK2 = partial(
+-        _powerSGD_comm_hook_wrapper,
+-        comm_hook=powerSGD.batched_powerSGD_hook,
+-        matrix_approximation_rank=2,
++    BATCHED_POWER_SGD_RANK2 = _enum_member(
++        partial(
++            _powerSGD_comm_hook_wrapper,
++            comm_hook=powerSGD.batched_powerSGD_hook,
++            matrix_approximation_rank=2,
++        )
+     )
+-    NOOP = partial(
+-        _ddp_comm_hook_wrapper,
+-        comm_hook=debugging.noop_hook,
++    NOOP = _enum_member(
++        partial(
++            _ddp_comm_hook_wrapper,
++            comm_hook=debugging.noop_hook,
++        )
+     )
+
+
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-pickle-error-on-Python-3.13.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-pickle-error-on-Python-3.13.patch
@@ -0,0 +1,23 @@
+Avoid "cannot pickle code objects" on Python 3.13+
+
+Extracted from https://github.com/pytorch/pytorch/pull/177713
+diff --git a/torch/distributed/checkpoint/api.py b/torch/distributed/checkpoint/api.py
+--- a/torch/distributed/checkpoint/api.py
++++ b/torch/distributed/checkpoint/api.py
+@@ -8,7 +8,15 @@ __all__ = ["CheckpointException"]
+
+
+ def _wrap_exception(exc: BaseException) -> WRAPPED_EXCEPTION:
+-    return (exc, tb.extract_tb(exc.__traceback__))
++    summary = tb.extract_tb(exc.__traceback__)
++    # Python 3.13+ stores bytecode objects in FrameSummary._code,
++    # which cannot be pickled. Clear them so gather_object succeeds
++    # and the real exception is reported instead of a misleading
++    # "cannot pickle code objects" TypeError.
++    for frame in summary:
++        if hasattr(frame, "_code"):
++            object.__setattr__(frame, "_code", None)
++    return (exc, summary)
+
+
+ def _is_wrapped_exception(obj: Any) -> bool:
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-test_recursion_in_except_handler.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_fix-test_recursion_in_except_handler.patch
@@ -0,0 +1,34 @@
+Fix a RecursionError raised inside pytest when running test_recursion_in_except_handler.
+See https://github.com/pytorch/pytorch/pull/174693
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/test/dynamo/cpython/3_13/test_exceptions.py b/test/dynamo/cpython/3_13/test_exceptions.py
+index 0ded70db3c7..bc8120a2d19 100644
+--- a/test/dynamo/cpython/3_13/test_exceptions.py
++++ b/test/dynamo/cpython/3_13/test_exceptions.py
+@@ -1573,18 +1573,18 @@ class ExceptionTests(__TestCase):
+                 recurse_in_body_and_except()
+
+         recursionlimit = sys.getrecursionlimit()
+-        try:
+-            set_relative_recursion_limit(10)
+-            for func in (recurse_in_except, recurse_after_except, recurse_in_body_and_except):
+-                with self.subTest(func=func):
++        for func in (recurse_in_except, recurse_after_except, recurse_in_body_and_except):
++            with self.subTest(func=func):
++                try:
++                    set_relative_recursion_limit(10)
+                     try:
+                         func()
+                     except RecursionError:
+                         pass
+                     else:
+                         self.fail("Should have raised a RecursionError")
+-        finally:
+-            sys.setrecursionlimit(recursionlimit)
++                finally:
++                    sys.setrecursionlimit(recursionlimit)
+
+
+     @cpython_only
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-bool-bessel-tests.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-bool-bessel-tests.patch
@@ -0,0 +1,50 @@
+From 08de54f1ea954a6da3b45d794972d3df3d72df02 Mon Sep 17 00:00:00 2001
+From: Rob Timpe <rtimpe@openteams.com>
+Date: Thu, 13 Nov 2025 02:23:06 +0000
+Subject: [PATCH] [3.14] Skip failing spherical_bessel_j0 tests (#167691)
+
+Starting with scipy 1.15, bool inputs error out.
+Pull Request resolved: https://github.com/pytorch/pytorch/pull/167691
+Approved by: https://github.com/williamwen42
+---
+ .../_internal/opinfo/definitions/special.py   | 20 +++++++++++++++++++
+ 1 file changed, 20 insertions(+)
+
+diff --git a/torch/testing/_internal/opinfo/definitions/special.py b/torch/testing/_internal/opinfo/definitions/special.py
+index f9dc471ca98aa..47cbcb1fb4268 100644
+--- a/torch/testing/_internal/opinfo/definitions/special.py
++++ b/torch/testing/_internal/opinfo/definitions/special.py
+@@ -648,6 +648,16 @@ def sample_inputs_erfcx(op_info, device, dtype, requires_grad, **kwargs):
+         dtypes=all_types_and(torch.bool),
+         ref=lambda x: scipy.special.spherical_jn(0, x) if TEST_SCIPY else None,
+         supports_autograd=False,
++        skips=(
++            DecorateInfo(
++                unittest.skip(
++                    "Scipy doesn't support bool inputs to spherical_bessel_j0"
++                ),
++                "TestUnaryUfuncs",
++                "test_reference_numerics_normal",
++                dtypes=(torch.bool,),
++            ),
++        ),
+     ),
+ ]
+
+@@ -768,6 +778,16 @@ def sample_inputs_erfcx(op_info, device, dtype, requires_grad, **kwargs):
+                 }
+             ),
+         ),
++        skips=(
++            DecorateInfo(
++                unittest.skip(
++                    "Scipy doesn't support bool inputs to spherical_bessel_j0"
++                ),
++                "TestUnaryUfuncs",
++                "test_reference_numerics_normal",
++                dtypes=(torch.bool,),
++            ),
++        ),
+     ),
+     #
+     # Elementwise Binary Special OpInfos
diff --git a/...configs/p/PyTorch/PyTorch-2.9.1_skip-test_checkpoint_save_failure_continues_serving.patch b/...configs/p/PyTorch/PyTorch-2.9.1_skip-test_checkpoint_save_failure_continues_serving.patch
@@ -0,0 +1,28 @@
+test_checkpoint_save_failure_continues_serving fails on Python 3.13+ with
+> AssertionError: 'fail_once policy triggered failure' not found in 'cannot pickle code objects'
+
+This is caused by a change in Python 3.13; in earlier versions the test only passed by accident.
+See https://github.com/pytorch/pytorch/issues/174669
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/test/distributed/checkpoint/test_async_process_executor.py b/test/distributed/checkpoint/test_async_process_executor.py
+index 9dc7095b0d6..36e639803b2 100644
+--- a/test/distributed/checkpoint/test_async_process_executor.py
++++ b/test/distributed/checkpoint/test_async_process_executor.py
+@@ -1,6 +1,7 @@
+ # Owner(s): ["oncall: distributed checkpointing"]
+
+ import sys
++import unittest
+ from unittest.mock import patch
+
+ import torch
+@@ -100,6 +101,7 @@ class TestStorageWriter(StorageWriter):
+ class TestAsyncProcessExecutor(DTensorTestBase):
+     """Test suite for async checkpoint process executor error handling using public APIs."""
+
++    @unittest.skipIf(sys.version_info >= (3, 13), "Can't pickle tracebacks")
+     @with_comms
+     def test_checkpoint_save_failure_continues_serving(self) -> None:
+         """Test that checkpoint save failure doesn't exit process, continues serving."""
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-test_norm_matrix_degenerate_shapes.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-test_norm_matrix_degenerate_shapes.patch
@@ -0,0 +1,15 @@
+test_norm_matrix_degenerate_shapes no longer works with numpy >= 2.3.0
+See https://github.com/pytorch/pytorch/commit/a4a5d03779d876043b0a1f0c565659fc2298afd2
+
+Author: Alexander Grund (TU Dresden)
+diff --git a/test/test_linalg.py b/test/test_linalg.py
+--- a/test/test_linalg.py
++++ b/test/test_linalg.py
+@@ -2040,6 +2040,7 @@ class TestLinalg(TestCase):
+                     run_test_case(input, ord, dim, keepdim)
+
+     # Test degenerate shape results match numpy for linalg.norm matrix norms
++    @unittest.skipIf(np.lib.NumpyVersion(np.__version__) >= '2.3.0', 'Numpy changed handling of degenerate inputs in 2.3.0')
+     @skipCUDAIfNoMagma
+     @skipCPUIfNoLapack
+     @dtypes(torch.float, torch.double, torch.cfloat, torch.cdouble)