PyTorch-2.1.2 easyconfig
@@ -47,15 +47,19 @@ patches = [
'PyTorch-2.1.2_add-cuda-skip-markers.patch',
'PyTorch-2.1.2_fix-conj-mismatch-test-failures.patch',
'PyTorch-2.1.2_fix-device-mesh-check.patch',
'PyTorch-2.1.2_fix-locale-issue-in-nvrtcCompileProgram.patch',
'PyTorch-2.1.2_fix-test_extension_backend-without-vectorization.patch',
'PyTorch-2.1.2_fix-test_memory_profiler.patch',
'PyTorch-2.1.2_fix-test_torchinductor-rounding.patch',
'PyTorch-2.1.2_fix-vsx-vector-abs.patch',
'PyTorch-2.1.2_fix-vsx-vector-div.patch',
'PyTorch-2.1.2_fix-with_temp_dir-decorator.patch',
'PyTorch-2.1.2_fix-wrong-device-mesh-size-in-tests.patch',
'PyTorch-2.1.2_relax-cuda-tolerances.patch',
'PyTorch-2.1.2_remove-nccl-backend-default-without-gpus.patch',
'PyTorch-2.1.2_skip-cpu_repro-test-without-vectorization.patch',
'PyTorch-2.1.2_skip-failing-test_dtensor_ops-subtests.patch',
'PyTorch-2.1.2_skip-test_fsdp_tp_checkpoint_integration.patch',
'PyTorch-2.1.2_workaround_dynamo_failure_without_nnpack.patch',
]
checksums = [
@@ -120,6 +124,8 @@ checksums = [
{'PyTorch-2.1.2_fix-conj-mismatch-test-failures.patch':
'c164357efa4ce88095376e590ba508fc1daa87161e1e59544eda56daac7f2847'},
{'PyTorch-2.1.2_fix-device-mesh-check.patch': 'c0efc288bf3d9a9a3c8bbd2691348a589a2677ea43880a8c987db91c8de4806b'},
{'PyTorch-2.1.2_fix-locale-issue-in-nvrtcCompileProgram.patch':
'f7adafb4e4d3b724b93237a259797b6ed6f535f83be0e34a7b759c71c6a8ddf2'},
{'PyTorch-2.1.2_fix-test_extension_backend-without-vectorization.patch':
'cd1455495886a7d6b2d30d48736eb0103fded21e2e36de6baac719b9c52a1c92'},
{'PyTorch-2.1.2_fix-test_memory_profiler.patch':
@@ -128,13 +134,19 @@ checksums = [
'a0ef99192ee2ad1509c78a8377023d5be2b5fddb16f84063b7c9a0b53d979090'},
{'PyTorch-2.1.2_fix-vsx-vector-abs.patch': 'd67d32407faed7dc1dbab4bba0e2f7de36c3db04560ced35c94caf8d84ade886'},
{'PyTorch-2.1.2_fix-vsx-vector-div.patch': '11f497a6892eb49b249a15320e4218e0d7ac8ae4ce67de39e4a018a064ca1acc'},
{'PyTorch-2.1.2_fix-with_temp_dir-decorator.patch':
'90bd001e034095329277d70c6facc4026b4ce6d7f8b8d6aa81c0176eeb462eb1'},
{'PyTorch-2.1.2_fix-wrong-device-mesh-size-in-tests.patch':
'07a5e4233d02fb6348872838f4d69573c777899c6f0ea4e39ae23c08660d41e5'},
{'PyTorch-2.1.2_relax-cuda-tolerances.patch': '554ad09787f61080fafdb84216e711e32327aa357e2a9c40bb428eb6503dee6e'},
{'PyTorch-2.1.2_remove-nccl-backend-default-without-gpus.patch':
'e6a1efe3d127fcbf4723476a7a1c01cfcf2ccb16d1fb250f478192623e8b6a15'},
{'PyTorch-2.1.2_skip-cpu_repro-test-without-vectorization.patch':
'7ace835af60c58d9e0754a34c19d4b9a0c3a531f19e5d0eba8e2e49206eaa7eb'},
{'PyTorch-2.1.2_skip-failing-test_dtensor_ops-subtests.patch':
'6cf711bf26518550903b09ed4431de9319791e79d61aab065785d6608fd5cc88'},
{'PyTorch-2.1.2_skip-test_fsdp_tp_checkpoint_integration.patch':
'943ee92f5fd518f608a59e43fe426b9bb45d7e7ad0ba04639e516db2d61fa57d'},
{'PyTorch-2.1.2_workaround_dynamo_failure_without_nnpack.patch':
'fb96eefabf394617bbb3fbd3a7a7c1aa5991b3836edc2e5d2a30e708bfe49ba1'},
]
@@ -191,6 +203,8 @@ excluded_tests = {
# intermittent failures on various systems
# See https://github.com/easybuilders/easybuild-easyconfigs/issues/17712
'distributed/rpc/test_tensorpipe_agent',
# Broken test, can't ever succeed, see https://github.com/pytorch/pytorch/issues/122184
'distributed/tensor/parallel/test_tp_random_state',
# failures on OmniPath systems, which don't support some optional InfiniBand features
# See https://github.com/pytorch/tensorpipe/issues/413
'distributed/pipeline/sync/skip/test_gpipe',
@@ -206,8 +220,9 @@ runtest = 'cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py --continue-throu

# Especially test_quantization has a few corner cases that are triggered by the random input values,
# those cannot be easily avoided, see https://github.com/pytorch/pytorch/issues/107030
# test_nn is also prone to spurious failures: https://github.com/pytorch/pytorch/issues/118294
# So allow a low number of tests to fail as the tests "usually" succeed
-max_failed_tests = 50
+max_failed_tests = 5

# The readelf sanity check command can be taken out once the TestRPATH test from
# https://github.com/pytorch/pytorch/pull/109493 is accepted, since it is then checked as part of the PyTorch test suite
PyTorch-2.1.2_fix-locale-issue-in-nvrtcCompileProgram.patch (new file)
@@ -0,0 +1,106 @@
There is a bug in CUDA 11.7 through at least CUDA 12.4 where calling nvrtcCompileProgram changes the
current thread locale.
See e.g. https://stackoverflow.com/questions/74044994
This also changes the encoding Python uses by default, e.g. for decoding the output of subsequent `subprocess` calls.
When the user environment is set to e.g. UTF-8 and CUDA changes it (to ASCII/ANSI_X3.4-1968), Python fails
to decode UTF-8 output from invoked programs.
This happens e.g. in `test_torch`: it calls `from scipy import stats`, which runs `lscpu` and then fails
with something like
> /software/SciPy-bundle/2023.07-gfbf-2023a/lib/python3.11/site-packages/numpy/testing/_private/utils.py", line 1253, in <module>
> _SUPPORTS_SVE = check_support_sve()
> /software/SciPy-bundle/2023.07-gfbf-2023a/lib/python3.11/site-packages/numpy/testing/_private/utils.py", line 1247, in check_support_sve
> output = subprocess.run(cmd, capture_output=True, text=True)
> /software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/subprocess.py", line 2113, in _communicate
> stdout = self._translate_newlines(stdout,
> UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 96: ordinal not in range(128)
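
For illustration, the decode failure can be reproduced without CUDA (a minimal sketch, assuming a Linux
system where Python itself was started under a UTF-8 locale; the setlocale call below merely emulates
the locale reset that nvrtcCompileProgram performs):

    import locale
    import subprocess
    import sys

    # Emulate the locale reset done by the buggy nvrtcCompileProgram.
    locale.setlocale(locale.LC_ALL, "C")
    print(locale.getpreferredencoding(False))  # typically 'ANSI_X3.4-1968', i.e. ASCII

    # Child process writing UTF-8 bytes (0xc3 0xa9 = 'é') to stdout.
    cmd = [sys.executable, "-c", "import sys; sys.stdout.buffer.write(b'caf\\xc3\\xa9\\n')"]

    # text=True decodes stdout with the locale's preferred encoding, so this raises
    # UnicodeDecodeError just like the scipy/numpy traceback above.
    subprocess.run(cmd, capture_output=True, text=True)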

Fix this by wrapping nvrtcCompileProgram in a function that saves and restores the thread locale.

Author: Alexander Grund (TU Dresden)

diff --git a/aten/src/ATen/cuda/detail/LazyNVRTC.cpp b/aten/src/ATen/cuda/detail/LazyNVRTC.cpp
index 1b85e7776e2..1c13a3b1168 100644
--- a/aten/src/ATen/cuda/detail/LazyNVRTC.cpp
+++ b/aten/src/ATen/cuda/detail/LazyNVRTC.cpp
@@ -2,6 +2,7 @@

#include <ATen/cuda/nvrtc_stub/ATenNVRTC.h>
#include <ATen/DynamicLibrary.h>
+#include <locale.h>
#include <stdexcept>

namespace at {
@@ -143,6 +144,29 @@ nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog,
return fn(prog, src, name, numHeaders, headers, includeNames);
}

+nvrtcResult nvrtcCompileProgram_wrapped(nvrtcProgram prog,
+ int numOptions,
+ const char * const *options) {
+ // Save & restore current thread locale which can get modified by nvrtcCompileProgram
+ locale_t oldLocale = uselocale((locale_t) 0);
+ auto result = lazyNVRTC.nvrtcCompileProgram_real(prog, numOptions, options);
+ if (oldLocale != (locale_t) 0)
+ uselocale(oldLocale);
+ return result;
+}
+
+nvrtcResult nvrtcCompileProgram(nvrtcProgram prog,
+ int numOptions,
+ const char * const *options) {
+ auto fn = reinterpret_cast<decltype(&nvrtcCompileProgram)>(getNVRTCLibrary().sym(__func__));
+ if (!fn)
+ throw std::runtime_error("Can't get nvrtcCompileProgram");
+ lazyNVRTC.nvrtcCompileProgram_real = fn;
+ fn = &nvrtcCompileProgram_wrapped;
+ lazyNVRTC.nvrtcCompileProgram = fn;
+ return fn(prog, numOptions, options);
+}
+
NVRTC_STUB1(nvrtcDestroyProgram, nvrtcProgram *);
NVRTC_STUB2(nvrtcGetPTXSize, nvrtcProgram, size_t *);
NVRTC_STUB2(nvrtcGetPTX, nvrtcProgram, char *);
@@ -150,7 +174,6 @@ NVRTC_STUB2(nvrtcGetPTX, nvrtcProgram, char *);
NVRTC_STUB2(nvrtcGetCUBINSize, nvrtcProgram, size_t *);
NVRTC_STUB2(nvrtcGetCUBIN, nvrtcProgram, char *);
#endif
-NVRTC_STUB3(nvrtcCompileProgram, nvrtcProgram, int, const char * const *);
_STUB_1(NVRTC, nvrtcGetErrorString, const char *, nvrtcResult);
NVRTC_STUB2(nvrtcGetProgramLogSize,nvrtcProgram, size_t*);
NVRTC_STUB2(nvrtcGetProgramLog, nvrtcProgram, char *);
diff --git a/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h b/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h
index 574b2c41c26..4ddc5316dad 100644
--- a/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h
+++ b/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h
@@ -120,6 +120,8 @@ extern "C" typedef struct NVRTC {
#define CREATE_MEMBER(name) decltype(&name) name;
AT_FORALL_NVRTC(CREATE_MEMBER)
#undef CREATE_MEMBER
+ // Must be at end!
+ decltype(nvrtcCompileProgram) nvrtcCompileProgram_real;
} NVRTC;

extern "C" TORCH_CUDA_CPP_API NVRTC* load_nvrtc();
diff --git a/caffe2/cuda_rtc/common_rtc.h b/caffe2/cuda_rtc/common_rtc.h
index 9d9582d34b6..562a653a67a 100644
--- a/caffe2/cuda_rtc/common_rtc.h
+++ b/caffe2/cuda_rtc/common_rtc.h
@@ -1,6 +1,7 @@
#ifndef CAFFE2_CUDA_RTC_COMMON_RTC_H_
#define CAFFE2_CUDA_RTC_COMMON_RTC_H_

+#include <locale.h>
#include <sstream>
#include <string>

@@ -46,7 +47,10 @@ class CudaRTCFunction {
// coding it?
const char* nvrtc_opts[] = {
"--gpu-architecture=compute_35", "--use_fast_math"};
+ locale_t oldLocale = uselocale((locale_t) 0);
nvrtcResult compile_result = nvrtcCompileProgram(prog, 2, nvrtc_opts);
+ if (oldLocale != (locale_t) 0)
+ uselocale(oldLocale);
if (compile_result != NVRTC_SUCCESS) {
size_t log_size;
NVRTC_CHECK(nvrtcGetProgramLogSize(prog, &log_size));
PyTorch-2.1.2_fix-with_temp_dir-decorator.patch (new file)
@@ -0,0 +1,19 @@
The `with_temp_dir` decorator fails to pass the argument list on to the wrapped function.
This then makes e.g. test/distributed/checkpoint/test_fsdp_optim_state.py fail with
> TypeError: FsdpOptimStateCheckpoint.test_load_sharded_optimizer_state_dict() missing 1 required positional argument: 'pass_planner'
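
For illustration, a simplified stand-in shows the bug class (a minimal sketch; the real decorator in
torch/testing/_internal/distributed/checkpoint_utils.py also broadcasts the directory across ranks and
cleans it up afterwards, and the Demo class below is made up; only the argument forwarding matters):

    import functools

    def with_temp_dir(func):
        @functools.wraps(func)
        def wrapper(self, *args, **kwargs):
            self.temp_dir = "/tmp/example"  # stand-in for the real temp-dir setup
            # Broken variant: `return func(self)` drops e.g. 'pass_planner' -> TypeError.
            return func(self, *args, **kwargs)  # the fix: forward all arguments
        return wrapper

    class Demo:
        @with_temp_dir
        def test_load_sharded_optimizer_state_dict(self, pass_planner):
            return self.temp_dir, pass_planner

    print(Demo().test_load_sharded_optimizer_state_dict(True))  # ('/tmp/example', True)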

Author: Alexander Grund (TU Dresden)

diff --git a/torch/testing/_internal/distributed/checkpoint_utils.py b/torch/testing/_internal/distributed/checkpoint_utils.py
index 1a6e43a038c..52f79b37bfd 100644
--- a/torch/testing/_internal/distributed/checkpoint_utils.py
+++ b/torch/testing/_internal/distributed/checkpoint_utils.py
@@ -31,7 +31,7 @@ def with_temp_dir(
self.temp_dir = object_list[0]

try:
- func(self)
+ func(self, *args, **kwargs)
finally:
if dist.get_rank() == 0:
shutil.rmtree(self.temp_dir, ignore_errors=True)
PyTorch-2.1.2_fix-wrong-device-mesh-size-in-tests.patch (new file)
@@ -0,0 +1,28 @@
From 1d1308649298caf8884970fc57ed82a2d8ea6079 Mon Sep 17 00:00:00 2001
From: Xilun Wu <12968408+XilunWu@users.noreply.github.com>
Date: Tue, 26 Dec 2023 17:48:11 -0800
Subject: [PATCH] [BE] force DTensorTestBase.build_device_mesh to use
world_size rather than NUM_DEVICES constant (#116439)

**Test**:
`python test/distributed/fsdp/test_shard_utils.py -k test_create_chunk_dtensor`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/116439
Approved by: https://github.com/wanchaol
---
torch/testing/_internal/distributed/_tensor/common_dtensor.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/testing/_internal/distributed/_tensor/common_dtensor.py b/torch/testing/_internal/distributed/_tensor/common_dtensor.py
index ab86ecd1616a74f..05a3c0872878965 100644
--- a/torch/testing/_internal/distributed/_tensor/common_dtensor.py
+++ b/torch/testing/_internal/distributed/_tensor/common_dtensor.py
@@ -192,7 +192,7 @@ def backend(self) -> str:
return PG_BACKEND

def build_device_mesh(self) -> DeviceMesh:
- return DeviceMesh(DEVICE_TYPE, list(range(NUM_DEVICES)))
+ return DeviceMesh(DEVICE_TYPE, list(range(self.world_size)))

def init_pg(self) -> None:
if "nccl" in self.backend and torch.cuda.device_count() < self.world_size:
PyTorch-2.1.2_skip-test_fsdp_tp_checkpoint_integration.patch (new file)
@@ -0,0 +1,33 @@
test_fsdp_tp_checkpoint_integration in distributed/fsdp/test_fsdp_tp_integration.py
fails due to a regression. See https://github.com/pytorch/pytorch/issues/101162

> RuntimeError: Error(s) in loading state_dict for FullyShardedDataParallel:
> size mismatch for _fsdp_wrapped_module.net1.weight: copying a param with shape torch.Size([4, 5]) from checkpoint, the shape in current model is torch.Size([8, 5]).
> size mismatch for _fsdp_wrapped_module.net1.bias: copying a param with shape torch.Size([4]) from checkpoint, the shape in current model is torch.Size([8]).
> size mismatch for _fsdp_wrapped_module.net2.weight: copying a param with shape torch.Size([4, 4]) from checkpoint, the shape in current model is torch.Size([4, 8]).

Skip the test. This should already be fixed in 2.2.x.

Author: Alexander Grund (TU Dresden)

diff --git a/test/distributed/fsdp/test_fsdp_tp_integration.py b/test/distributed/fsdp/test_fsdp_tp_integration.py
index bc7a4aef4a3..aea16a1f1fb 100644
--- a/test/distributed/fsdp/test_fsdp_tp_integration.py
+++ b/test/distributed/fsdp/test_fsdp_tp_integration.py
@@ -3,6 +3,7 @@ import copy
import sys
from collections import OrderedDict
from typing import Any, Dict, List, Optional, Tuple
+import unittest

import torch
from torch import distributed as dist
@@ -306,7 +307,7 @@ class TestTPFSDPIntegration(FSDPTest):
tp_fsdp_out = tp_fsdp_model(inp)
self.assertEqual(fsdp_out, tp_fsdp_out)

- @skip_if_lt_x_gpu(4)
+ @unittest.skip("Known failure: #101162")
def test_fsdp_tp_checkpoint_integration(self):
"""Tests checkpointing for TP + FSDP integration."""
self.assertTrue(