Skip to content

Commit

Permalink
【Hackathon 7th No.21】为 Paddle 新增 reset_max_memory_reserved/reset_max_…
Browse files Browse the repository at this point in the history
…memory_allocated API (#70032)
  • Loading branch information
Qin-sx authored Dec 17, 2024
1 parent 21eec3d commit 202aff3
Show file tree
Hide file tree
Showing 7 changed files with 307 additions and 3 deletions.
5 changes: 5 additions & 0 deletions paddle/fluid/pybind/pybind.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2565,8 +2565,13 @@ All parameter, weight, gradient are variables in Paddle.
m.def("device_memory_stat_current_value",
memory::DeviceMemoryStatCurrentValue);
m.def("device_memory_stat_peak_value", memory::DeviceMemoryStatPeakValue);
m.def("device_memory_stat_reset_peak_value",
memory::DeviceMemoryStatResetPeakValue);

m.def("host_memory_stat_current_value", memory::HostMemoryStatCurrentValue);
m.def("host_memory_stat_peak_value", memory::HostMemoryStatPeakValue);
m.def("host_memory_stat_reset_peak_value",
memory::HostMemoryStatResetPeakValue);
m.def(
"run_cmd",
[](const std::string &cmd,
Expand Down
12 changes: 12 additions & 0 deletions paddle/phi/core/memory/stats.cc
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,10 @@ class StatRegistry {
GetStat(stat_type, dev_id)->Update(increment);
}

// Reset the recorded peak of the stat identified by (stat_type, dev_id)
// back to its current value.
// NOTE(review): assumes GetStat() fails loudly (rather than returning null)
// for an unregistered key — confirm in GetStat's definition.
void ResetPeakValue(const std::string& stat_type, int dev_id) {
  GetStat(stat_type, dev_id)->ResetPeakValue();
}

void Register(const std::string& stat_type, int dev_id, StatBase* stat) {
std::lock_guard<SpinLock> lock_guard(stat_map_lock_);
stat_map_[GetStatKey(stat_type, dev_id)] = stat;
Expand Down Expand Up @@ -93,6 +97,10 @@ void DeviceMemoryStatUpdate(const std::string& stat_type,
StatRegistry::GetInstance()->Update("Device" + stat_type, dev_id, increment);
}

// Reset the recorded peak of a device-side memory stat on device `dev_id`
// to its current value. Device stats are registered under a "Device" prefix,
// so the prefixed key is built before dispatching to the registry.
void DeviceMemoryStatResetPeakValue(const std::string& stat_type, int dev_id) {
  const std::string device_stat_key = "Device" + stat_type;
  StatRegistry::GetInstance()->ResetPeakValue(device_stat_key, dev_id);
}

int64_t HostMemoryStatCurrentValue(const std::string& stat_type, int dev_id) {
return StatRegistry::GetInstance()->GetCurrentValue("Host" + stat_type,
dev_id);
Expand All @@ -108,6 +116,10 @@ void HostMemoryStatUpdate(const std::string& stat_type,
StatRegistry::GetInstance()->Update("Host" + stat_type, dev_id, increment);
}

// Reset the recorded peak of a host-side memory stat for `dev_id` to its
// current value. Host stats are registered under a "Host" prefix, so the
// prefixed key is built before dispatching to the registry.
void HostMemoryStatResetPeakValue(const std::string& stat_type, int dev_id) {
  const std::string host_stat_key = "Host" + stat_type;
  StatRegistry::GetInstance()->ResetPeakValue(host_stat_key, dev_id);
}

void LogDeviceMemoryStats(const phi::Place& place, const std::string& op_name) {
if (FLAGS_log_memory_stats && phi::is_gpu_place(place)) {
VLOG(0) << "After launching op_name: " << op_name << ", "
Expand Down
23 changes: 23 additions & 0 deletions paddle/phi/core/memory/stats.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ class StatBase {
virtual int64_t GetCurrentValue() = 0;
virtual int64_t GetPeakValue() = 0;
virtual void Update(int64_t) = 0;
virtual void ResetPeakValue() = 0;

private:
DISABLE_COPY_AND_ASSIGN(StatBase);
Expand Down Expand Up @@ -112,6 +113,22 @@ class Stat : public StatBase {
}
}

// Reset this stat's recorded peak to its current value, and collapse every
// thread-local peak down to that thread's current value so subsequent
// updates re-derive the peak from "now" onward.
//
// NOTE(review): this is not atomic with respect to concurrent Update()
// calls — a peak reached between GetCurrentValue() and the stores below can
// be lost. That is acceptable for a stats/reset API, but callers should not
// rely on an exact peak while other threads are actively allocating.
void ResetPeakValue() override {
  int64_t current_value = GetCurrentValue();
  peak_value_.store(current_value, std::memory_order_relaxed);

  std::unordered_map<uint64_t, std::reference_wrapper<ThreadLocalStatType>>
      thread_local_stats =
          ThreadDataRegistry<ThreadLocalStatType>::GetInstance()
              .GetAllThreadDataByRef();

  // Iterate by reference: `for (auto pair : ...)` would copy each map entry
  // on every iteration for no benefit (clang-tidy performance-for-range-copy).
  for (auto& pair : thread_local_stats) {
    pair.second.get().peak = pair.second.get().current;
  }

  VLOG(8) << "Reset peak_value to current_value = " << current_value;
}

private:
Stat() {}
~Stat() {}
Expand All @@ -128,12 +145,14 @@ int64_t DeviceMemoryStatPeakValue(const std::string& stat_type, int dev_id);
void DeviceMemoryStatUpdate(const std::string& stat_type,
int dev_id,
int64_t increment);
void DeviceMemoryStatResetPeakValue(const std::string& stat_type, int dev_id);

int64_t HostMemoryStatCurrentValue(const std::string& stat_type, int dev_id);
int64_t HostMemoryStatPeakValue(const std::string& stat_type, int dev_id);
void HostMemoryStatUpdate(const std::string& stat_type,
int dev_id,
int64_t increment);
void HostMemoryStatResetPeakValue(const std::string& stat_type, int dev_id);

void LogDeviceMemoryStats(const phi::Place& place, const std::string& op_name);

Expand Down Expand Up @@ -179,6 +198,8 @@ void LogDeviceMemoryStats(const phi::Place& place, const std::string& op_name);
DEVICE_MEMORY_STAT_FUNC(item, id, GetPeakValue)
#define DEVICE_MEMORY_STAT_UPDATE(item, id, increment) \
DEVICE_MEMORY_STAT_FUNC(item, id, Update, increment)
#define DEVICE_MEMORY_STAT_RESET_PEAK_VALUE(item, id) \
DEVICE_MEMORY_STAT_FUNC(item, id, ResetPeakValue)

#define HOST_MEMORY_STAT_FUNC(item, id, func, ...) \
[&] { \
Expand All @@ -199,6 +220,8 @@ void LogDeviceMemoryStats(const phi::Place& place, const std::string& op_name);
HOST_MEMORY_STAT_FUNC(item, id, GetPeakValue)
#define HOST_MEMORY_STAT_UPDATE(item, id, increment) \
HOST_MEMORY_STAT_FUNC(item, id, Update, increment)
#define HOST_MEMORY_STAT_RESET_PEAK_VALUE(item, id) \
HOST_MEMORY_STAT_FUNC(item, id, ResetPeakValue)

#define DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, id) \
struct DeviceMemoryStat##item##id : public ThreadLocalStatBase {}
Expand Down
62 changes: 62 additions & 0 deletions python/paddle/device/cuda/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@
'get_device_properties',
'get_device_name',
'get_device_capability',
'reset_max_memory_allocated',
'reset_max_memory_reserved',
]


Expand Down Expand Up @@ -298,6 +300,66 @@ def max_memory_reserved(device: _CudaPlaceLike | None = None) -> int:
return core.device_memory_stat_peak_value("Reserved", device_id)


def reset_max_memory_allocated(device: _CudaPlaceLike | None = None) -> None:
    '''
    Reset the peak size of GPU memory that is allocated to tensor of the given device.

    Args:
        device(paddle.CUDAPlace|int|str|None, optional): The device, the id of the device or
            the string name of device like 'gpu:x'. If device is None, the device is the current device.
            Default: None.

    Examples:
        .. code-block:: python

            >>> # doctest: +REQUIRES(env:GPU)
            >>> import paddle

            >>> paddle.device.set_device('gpu')
            >>> paddle.device.cuda.reset_max_memory_allocated(paddle.CUDAPlace(0))
            >>> paddle.device.cuda.reset_max_memory_allocated(0)
            >>> paddle.device.cuda.reset_max_memory_allocated("gpu:0")
    '''

    name = "paddle.device.cuda.reset_max_memory_allocated"
    if not core.is_compiled_with_cuda():
        raise ValueError(
            f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API."
        )
    # Validate the argument and normalize it to an integer device id.
    device_id = extract_cuda_device_id(device, op_name=name)
    # "Allocated" is the stat tracking memory held by tensors; reset its peak.
    core.device_memory_stat_reset_peak_value("Allocated", device_id)


def reset_max_memory_reserved(device: _CudaPlaceLike | None = None) -> None:
    '''
    Reset the peak size of GPU memory that is held by the allocator of the given device.

    Args:
        device(paddle.CUDAPlace|int|str|None, optional): The device, the id of the device or
            the string name of device like 'gpu:x'. If device is None, the device is the current device.
            Default: None.

    Examples:
        .. code-block:: python

            >>> # doctest: +REQUIRES(env:GPU)
            >>> import paddle

            >>> paddle.device.set_device('gpu')
            >>> paddle.device.cuda.reset_max_memory_reserved(paddle.CUDAPlace(0))
            >>> paddle.device.cuda.reset_max_memory_reserved(0)
            >>> paddle.device.cuda.reset_max_memory_reserved("gpu:0")
    '''

    name = "paddle.device.cuda.reset_max_memory_reserved"
    if not core.is_compiled_with_cuda():
        raise ValueError(
            f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API."
        )
    # Validate the argument and normalize it to an integer device id.
    device_id = extract_cuda_device_id(device, op_name=name)
    # "Reserved" is the stat tracking memory held by the allocator; reset its peak.
    core.device_memory_stat_reset_peak_value("Reserved", device_id)


def memory_allocated(device: _CudaPlaceLike | None = None) -> int:
'''
Return the current size of gpu memory that is allocated to tensor of the given device.
Expand Down
30 changes: 27 additions & 3 deletions test/cpp/fluid/memory/stats_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,18 @@ class StatsTest : public ::testing::Test {
void SetFunc(
std::function<void(const std::string, int, int64_t)> update_func,
std::function<int64_t(const std::string, int)> current_value_func,
std::function<int64_t(const std::string, int)> peak_value_func) {
std::function<int64_t(const std::string, int)> peak_value_func,
std::function<void(const std::string, int)> reset_peak_value_func) {
update_func_ = update_func;
current_value_func_ = current_value_func;
peak_value_func_ = peak_value_func;
reset_peak_value_func_ = reset_peak_value_func;
}

// Run every test scenario against the accessors installed via SetFunc().
void RunTests() {
  MultiThreadReadWriteTest();
  PeakValueTest();
  ResetPeakValueTest();
}

private:
Expand Down Expand Up @@ -94,6 +97,18 @@ class StatsTest : public ::testing::Test {
EXPECT_EQ(peak_value_func_(stat_type_, 0), peak_value);
}

// After each update the peak must be >= the current value; after a reset
// the peak must equal the current value exactly.
void ResetPeakValueTest() {
  for (int64_t data : datas_) {
    update_func_(stat_type_, 0, data);

    EXPECT_GE(peak_value_func_(stat_type_, 0),
              current_value_func_(stat_type_, 0));
    reset_peak_value_func_(stat_type_, 0);
    EXPECT_EQ(peak_value_func_(stat_type_, 0),
              current_value_func_(stat_type_, 0));
  }
}

std::string stat_type_;
std::vector<int64_t> datas_{
543149808935355, 634698327471328, 706215795436611, 577939367795333,
Expand Down Expand Up @@ -125,13 +140,15 @@ class StatsTest : public ::testing::Test {
std::function<void(const std::string, int, int64_t)> update_func_;
std::function<int64_t(const std::string, int)> current_value_func_;
std::function<int64_t(const std::string, int)> peak_value_func_;
std::function<void(const std::string, int)> reset_peak_value_func_;
};

// Exercise the device-side "Allocated" stat through the free-function API
// (the macro-based API is covered by the *MacroTest cases).
TEST_F(StatsTest, DeviceAllocatedTest) {
  SetStatType("Allocated");
  SetFunc(DeviceMemoryStatUpdate,
          DeviceMemoryStatCurrentValue,
          DeviceMemoryStatPeakValue,
          DeviceMemoryStatResetPeakValue);
  RunTests();
}

Expand All @@ -146,6 +163,9 @@ TEST_F(StatsTest, DeviceReservedMacroTest) {
},
[](const std::string stat_type, int id) {
return DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, id);
},
[](const std::string stat_type, int id) {
return DEVICE_MEMORY_STAT_RESET_PEAK_VALUE(Reserved, id);
});
RunTests();
}
Expand All @@ -161,6 +181,9 @@ TEST_F(StatsTest, HostAllocatedMacroTest) {
},
[](const std::string stat_type, int id) {
return HOST_MEMORY_STAT_PEAK_VALUE(Allocated, id);
},
[](const std::string stat_type, int id) {
return HOST_MEMORY_STAT_RESET_PEAK_VALUE(Allocated, id);
});
RunTests();
}
Expand All @@ -169,7 +192,8 @@ TEST_F(StatsTest, HostReservedTest) {
SetStatType("Reserved");
SetFunc(HostMemoryStatUpdate,
HostMemoryStatCurrentValue,
HostMemoryStatPeakValue);
HostMemoryStatPeakValue,
HostMemoryStatResetPeakValue);
RunTests();
}

Expand Down
89 changes: 89 additions & 0 deletions test/legacy_test/test_cuda_reset_max_memory_allocated.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import paddle
from paddle.base import core
from paddle.device.cuda import (
device_count,
max_memory_allocated,
memory_allocated,
reset_max_memory_allocated,
)


class TestResetMaxMemoryAllocated(unittest.TestCase):
    """Tests for paddle.device.cuda.reset_max_memory_allocated."""

    def func_test_reset_max_memory_allocated(self, device=None):
        # Shared body: `device` may be None, a CUDAPlace, an int id, or a
        # "gpu:x" string — every form the public API accepts.
        if core.is_compiled_with_cuda():
            alloc_time = 100
            max_alloc_size = 10000
            for i in range(alloc_time):
                # first alloc: shape drawn from [max_alloc_size, 2*max_alloc_size),
                # strictly larger than anything the second alloc can produce
                shape = paddle.randint(
                    low=max_alloc_size, high=max_alloc_size * 2
                )
                tensor = paddle.zeros(shape)
                peak_memory_allocated_size_first = max_memory_allocated(device)

                del shape
                del tensor

                # second alloc: shape drawn from [0, max_alloc_size)
                shape = paddle.randint(low=0, high=max_alloc_size)
                tensor = paddle.zeros(shape)

                # reset peak memory stats
                reset_max_memory_allocated(device)

                # after the reset, the peak must equal the live allocation...
                peak_memory_allocated_size_second = max_memory_allocated(device)
                self.assertEqual(
                    peak_memory_allocated_size_second, memory_allocated(device)
                )
                # ...and must have dropped below the pre-reset peak
                self.assertLess(
                    peak_memory_allocated_size_second,
                    peak_memory_allocated_size_first,
                )

                del shape
                del tensor

    def test_reset_max_memory_allocated_for_all_places(self):
        # Run the shared body once per visible GPU, covering each accepted
        # device spelling (CUDAPlace, int id, "gpu:x" string).
        if core.is_compiled_with_cuda():
            gpu_num = device_count()
            for i in range(gpu_num):
                paddle.device.set_device("gpu:" + str(i))
                self.func_test_reset_max_memory_allocated(core.CUDAPlace(i))
                self.func_test_reset_max_memory_allocated(i)
                self.func_test_reset_max_memory_allocated("gpu:" + str(i))

    def test_reset_max_memory_allocated_exception(self):
        # Invalid devices must raise; on a CPU-only build the API itself
        # raises ValueError regardless of the argument.
        if core.is_compiled_with_cuda():
            wrong_device = [
                core.CPUPlace(),
                device_count() + 1,
                -2,
                0.5,
                "gpu1",
            ]
            for device in wrong_device:
                with self.assertRaises(BaseException):  # noqa: B017
                    reset_max_memory_allocated(device)
        else:
            with self.assertRaises(ValueError):
                reset_max_memory_allocated()


if __name__ == "__main__":
unittest.main()
Loading

0 comments on commit 202aff3

Please sign in to comment.