Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
576edac
adding easyconfigs: parameterized-0.9.0-GCCcore-14.3.0.eb, pytest-sub…
Flamefire Dec 18, 2025
5e96b25
Add testcase
Flamefire Dec 19, 2025
9f728f6
Add patch for GCC 14 ARM builds
Flamefire Dec 19, 2025
54f0441
Also ignore warning for C files
Flamefire Dec 19, 2025
b921399
Move flags setting before including dependencies
Flamefire Dec 19, 2025
dd7464f
Use flag only for C
Flamefire Dec 19, 2025
54b64ef
Add workaround for GCC 14 ICE
Flamefire Dec 19, 2025
bf271b1
Remove already included patch
Flamefire Jan 5, 2026
5e4e033
Add missing patch
Flamefire Jan 5, 2026
20c68d3
Skip tests requiring CUDA SM 9.0
Flamefire Jan 7, 2026
dc3a09e
Remove old patch
Flamefire Jan 7, 2026
39cf857
Add patch avoiding infinite test hang
Flamefire Jan 8, 2026
caa6bf0
Add patch avoiding infinite test hang
Flamefire Jan 15, 2026
0a6dde0
Add patch avoiding infinite test hang
Flamefire Jan 22, 2026
f093e09
More patches
Flamefire Jan 22, 2026
0ecb16a
Fix patched skip markers
Flamefire Feb 5, 2026
3d7005b
Add comment for DISABLE_ADDR2LINE
Flamefire Feb 5, 2026
26ab819
Set test timeout
Flamefire Feb 9, 2026
dae9b55
Add GCC 14 patch
Flamefire Feb 10, 2026
0bb1f1a
Add missing patch
Flamefire Feb 10, 2026
c22bcee
Add patches for test fixes and skip slow&disabled tests
Flamefire Feb 12, 2026
03217d9
Add PyTorch-2.6.0_fix-server-in-test_control_plane
Flamefire Feb 13, 2026
2441b4f
Merge branch 'easybuilders:develop' into 20251218180340_new_pr_parame…
Flamefire Feb 13, 2026
30594ca
Merge branch 'easybuilders:develop' into 20251218180340_new_pr_parame…
Flamefire Feb 16, 2026
4c09c91
Merge branch 'develop' into 20251218180340_new_pr_parameterized090
Flamefire Feb 24, 2026
db7aefc
Fix race condition in checking for disabled tests
Flamefire Mar 3, 2026
9fe965e
Remove pytest-shard
Flamefire Mar 4, 2026
c931bb7
Add more patches
Flamefire Mar 5, 2026
c30732f
Fix using wrong OpenMP library
Flamefire Mar 11, 2026
3b6de18
Skip segfaulting flex_attention suite
Flamefire Mar 11, 2026
c2dde09
Skip some tests failing on ARM
Flamefire Mar 13, 2026
bd0c213
Merge branch 'easybuilders:develop' into 20251218180340_new_pr_parame…
Flamefire Mar 19, 2026
92cb18d
Fix Python 3.13 compat
Flamefire Mar 20, 2026
0554998
Merge branch 'easybuilders:develop' into 20251218180340_new_pr_parame…
Flamefire Mar 30, 2026
e83332d
Disable sanity_check_pip_list
Flamefire Apr 16, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
The test fails on Python 3.13 with:
> AttributeError: 'functools.partial' object has no attribute 'value'

Fix using https://github.com/pytorch/pytorch/pull/163939

diff --git a/torch/distributed/algorithms/ddp_comm_hooks/__init__.py b/torch/distributed/algorithms/ddp_comm_hooks/__init__.py
--- a/torch/distributed/algorithms/ddp_comm_hooks/__init__.py
+++ b/torch/distributed/algorithms/ddp_comm_hooks/__init__.py
@@ -1,7 +1,21 @@
# mypy: allow-untyped-defs
+import sys
from enum import Enum
from functools import partial

+
+# To suppress FutureWarning from partial since 3.13
+if sys.version_info >= (3, 13):
+ from enum import member
+
+ def _enum_member(x):
+ return member(x)
+else:
+
+ def _enum_member(x):
+ return x
+
+
import torch.distributed as dist

from . import (
@@ -51,45 +65,61 @@ class DDPCommHookType(Enum):
``DDPCommHookType.ALLREDUCE.value(model=model, state=process_group)``.
"""

- ALLREDUCE = partial(_ddp_comm_hook_wrapper, comm_hook=default.allreduce_hook)
- FP16_COMPRESS = partial(
- _ddp_comm_hook_wrapper, comm_hook=default.fp16_compress_hook
+ ALLREDUCE = _enum_member(
+ partial(_ddp_comm_hook_wrapper, comm_hook=default.allreduce_hook)
+ )
+ FP16_COMPRESS = _enum_member(
+ partial(_ddp_comm_hook_wrapper, comm_hook=default.fp16_compress_hook)
)
- BF16_COMPRESS = partial(
- _ddp_comm_hook_wrapper, comm_hook=default.bf16_compress_hook
+ BF16_COMPRESS = _enum_member(
+ partial(_ddp_comm_hook_wrapper, comm_hook=default.bf16_compress_hook)
)
- QUANTIZE_PER_TENSOR = partial(
- _ddp_comm_hook_wrapper, comm_hook=quantization.quantization_pertensor_hook
+ QUANTIZE_PER_TENSOR = _enum_member(
+ partial(
+ _ddp_comm_hook_wrapper, comm_hook=quantization.quantization_pertensor_hook
+ )
)
- QUANTIZE_PER_CHANNEL = partial(
- _ddp_comm_hook_wrapper, comm_hook=quantization.quantization_perchannel_hook
+ QUANTIZE_PER_CHANNEL = _enum_member(
+ partial(
+ _ddp_comm_hook_wrapper, comm_hook=quantization.quantization_perchannel_hook
+ )
)
- POWER_SGD = partial(
- _powerSGD_comm_hook_wrapper,
- comm_hook=powerSGD.powerSGD_hook,
- matrix_approximation_rank=1,
+ POWER_SGD = _enum_member(
+ partial(
+ _powerSGD_comm_hook_wrapper,
+ comm_hook=powerSGD.powerSGD_hook,
+ matrix_approximation_rank=1,
+ )
)
# Rank-2 PowerSGD can give a higher accuracy than the default rank-1 version,
# but it runs slower and consumes more memory.
- POWER_SGD_RANK2 = partial(
- _powerSGD_comm_hook_wrapper,
- comm_hook=powerSGD.powerSGD_hook,
- matrix_approximation_rank=2,
+ POWER_SGD_RANK2 = _enum_member(
+ partial(
+ _powerSGD_comm_hook_wrapper,
+ comm_hook=powerSGD.powerSGD_hook,
+ matrix_approximation_rank=2,
+ )
)
# Batching can lead to a faster training at the cost of accuracy.
- BATCHED_POWER_SGD = partial(
- _powerSGD_comm_hook_wrapper,
- comm_hook=powerSGD.batched_powerSGD_hook,
- matrix_approximation_rank=1,
+ BATCHED_POWER_SGD = _enum_member(
+ partial(
+ _powerSGD_comm_hook_wrapper,
+ comm_hook=powerSGD.batched_powerSGD_hook,
+ matrix_approximation_rank=1,
+ )
)
- BATCHED_POWER_SGD_RANK2 = partial(
- _powerSGD_comm_hook_wrapper,
- comm_hook=powerSGD.batched_powerSGD_hook,
- matrix_approximation_rank=2,
+ BATCHED_POWER_SGD_RANK2 = _enum_member(
+ partial(
+ _powerSGD_comm_hook_wrapper,
+ comm_hook=powerSGD.batched_powerSGD_hook,
+ matrix_approximation_rank=2,
+ )
)
- NOOP = partial(
- _ddp_comm_hook_wrapper,
- comm_hook=debugging.noop_hook,
+ NOOP = _enum_member(
+ partial(
+ _ddp_comm_hook_wrapper,
+ comm_hook=debugging.noop_hook,
+ )
)


Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
Avoid "cannot pickle code objects" on Python 3.13+

Extracted from https://github.com/pytorch/pytorch/pull/177713
diff --git a/torch/distributed/checkpoint/api.py b/torch/distributed/checkpoint/api.py
--- a/torch/distributed/checkpoint/api.py
+++ b/torch/distributed/checkpoint/api.py
@@ -8,7 +8,15 @@ __all__ = ["CheckpointException"]


def _wrap_exception(exc: BaseException) -> WRAPPED_EXCEPTION:
- return (exc, tb.extract_tb(exc.__traceback__))
+ summary = tb.extract_tb(exc.__traceback__)
+ # Python 3.13+ stores bytecode objects in FrameSummary._code,
+ # which cannot be pickled. Clear them so gather_object succeeds
+ # and the real exception is reported instead of a misleading
+ # "cannot pickle code objects" TypeError.
+ for frame in summary:
+ if hasattr(frame, "_code"):
+ object.__setattr__(frame, "_code", None)
+ return (exc, summary)


def _is_wrapped_exception(obj: Any) -> bool:
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
Fix a RecursionError inside pytest when running this test.
See https://github.com/pytorch/pytorch/pull/174693

Author: Alexander Grund (TU Dresden)

diff --git a/test/dynamo/cpython/3_13/test_exceptions.py b/test/dynamo/cpython/3_13/test_exceptions.py
index 0ded70db3c7..bc8120a2d19 100644
--- a/test/dynamo/cpython/3_13/test_exceptions.py
+++ b/test/dynamo/cpython/3_13/test_exceptions.py
@@ -1573,18 +1573,18 @@ class ExceptionTests(__TestCase):
recurse_in_body_and_except()

recursionlimit = sys.getrecursionlimit()
- try:
- set_relative_recursion_limit(10)
- for func in (recurse_in_except, recurse_after_except, recurse_in_body_and_except):
- with self.subTest(func=func):
+ for func in (recurse_in_except, recurse_after_except, recurse_in_body_and_except):
+ with self.subTest(func=func):
+ try:
+ set_relative_recursion_limit(10)
try:
func()
except RecursionError:
pass
else:
self.fail("Should have raised a RecursionError")
- finally:
- sys.setrecursionlimit(recursionlimit)
+ finally:
+ sys.setrecursionlimit(recursionlimit)


@cpython_only
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
From 08de54f1ea954a6da3b45d794972d3df3d72df02 Mon Sep 17 00:00:00 2001
From: Rob Timpe <rtimpe@openteams.com>
Date: Thu, 13 Nov 2025 02:23:06 +0000
Subject: [PATCH] [3.14] Skip failing spherical_bessel_j0 tests (#167691)

Starting with scipy 1.15, bool inputs error out.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167691
Approved by: https://github.com/williamwen42
---
.../_internal/opinfo/definitions/special.py | 20 +++++++++++++++++++
1 file changed, 20 insertions(+)

diff --git a/torch/testing/_internal/opinfo/definitions/special.py b/torch/testing/_internal/opinfo/definitions/special.py
index f9dc471ca98aa..47cbcb1fb4268 100644
--- a/torch/testing/_internal/opinfo/definitions/special.py
+++ b/torch/testing/_internal/opinfo/definitions/special.py
@@ -648,6 +648,16 @@ def sample_inputs_erfcx(op_info, device, dtype, requires_grad, **kwargs):
dtypes=all_types_and(torch.bool),
ref=lambda x: scipy.special.spherical_jn(0, x) if TEST_SCIPY else None,
supports_autograd=False,
+ skips=(
+ DecorateInfo(
+ unittest.skip(
+ "Scipy doesn't support bool inputs to spherical_bessel_j0"
+ ),
+ "TestUnaryUfuncs",
+ "test_reference_numerics_normal",
+ dtypes=(torch.bool,),
+ ),
+ ),
),
]

@@ -768,6 +778,16 @@ def sample_inputs_erfcx(op_info, device, dtype, requires_grad, **kwargs):
}
),
),
+ skips=(
+ DecorateInfo(
+ unittest.skip(
+ "Scipy doesn't support bool inputs to spherical_bessel_j0"
+ ),
+ "TestUnaryUfuncs",
+ "test_reference_numerics_normal",
+ dtypes=(torch.bool,),
+ ),
+ ),
),
#
# Elementwise Binary Special OpInfos
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
The test fails with
> AssertionError: 'fail_once policy triggered failure' not found in 'cannot pickle code objects'

The failure is triggered by a change in Python 3.13; in earlier versions the test only passed by accident.
See https://github.com/pytorch/pytorch/issues/174669

Author: Alexander Grund (TU Dresden)

diff --git a/test/distributed/checkpoint/test_async_process_executor.py b/test/distributed/checkpoint/test_async_process_executor.py
index 9dc7095b0d6..36e639803b2 100644
--- a/test/distributed/checkpoint/test_async_process_executor.py
+++ b/test/distributed/checkpoint/test_async_process_executor.py
@@ -1,6 +1,7 @@
# Owner(s): ["oncall: distributed checkpointing"]

import sys
+import unittest
from unittest.mock import patch

import torch
@@ -100,6 +101,7 @@ class TestStorageWriter(StorageWriter):
class TestAsyncProcessExecutor(DTensorTestBase):
"""Test suite for async checkpoint process executor error handling using public APIs."""

+ @unittest.skipIf(sys.version_info >= (3, 13), "Can't pickle tracebacks")
@with_comms
def test_checkpoint_save_failure_continues_serving(self) -> None:
"""Test that checkpoint save failure doesn't exit process, continues serving."""
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
This test fails with NumPy >= 2.3.0.
See https://github.com/pytorch/pytorch/commit/a4a5d03779d876043b0a1f0c565659fc2298afd2

Author: Alexander Grund (TU Dresden)
diff --git a/test/test_linalg.py b/test/test_linalg.py
--- a/test/test_linalg.py
+++ b/test/test_linalg.py
@@ -2040,6 +2040,7 @@ class TestLinalg(TestCase):
run_test_case(input, ord, dim, keepdim)

# Test degenerate shape results match numpy for linalg.norm matrix norms
+ @unittest.skipIf(np.lib.NumpyVersion(np.__version__) >= '2.3.0', 'Numpy changed handling of degenerate inputs in 2.3.0')
@skipCUDAIfNoMagma
@skipCPUIfNoLapack
@dtypes(torch.float, torch.double, torch.cfloat, torch.cdouble)
Loading