Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

【Hackathon 7th No.21】为 Paddle 新增 reset_max_memory_reserved/reset_max_memory_allocated API -part #70032

Merged
merged 9 commits into from
Dec 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions paddle/fluid/pybind/pybind.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2424,8 +2424,13 @@ All parameter, weight, gradient are variables in Paddle.
m.def("device_memory_stat_current_value",
memory::DeviceMemoryStatCurrentValue);
m.def("device_memory_stat_peak_value", memory::DeviceMemoryStatPeakValue);
m.def("device_memory_stat_reset_peak_value",
memory::DeviceMemoryStatResetPeakValue);

m.def("host_memory_stat_current_value", memory::HostMemoryStatCurrentValue);
m.def("host_memory_stat_peak_value", memory::HostMemoryStatPeakValue);
m.def("host_memory_stat_reset_peak_value",
memory::HostMemoryStatResetPeakValue);
m.def(
"run_cmd",
[](const std::string &cmd,
Expand Down
12 changes: 12 additions & 0 deletions paddle/phi/core/memory/stats.cc
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,10 @@ class StatRegistry {
GetStat(stat_type, dev_id)->Update(increment);
}

void ResetPeakValue(const std::string& stat_type, int dev_id) {
GetStat(stat_type, dev_id)->ResetPeakValue();
}

void Register(const std::string& stat_type, int dev_id, StatBase* stat) {
std::lock_guard<SpinLock> lock_guard(stat_map_lock_);
stat_map_[GetStatKey(stat_type, dev_id)] = stat;
Expand Down Expand Up @@ -93,6 +97,10 @@ void DeviceMemoryStatUpdate(const std::string& stat_type,
StatRegistry::GetInstance()->Update("Device" + stat_type, dev_id, increment);
}

// Reset the peak value of a device-memory stat (e.g. "Allocated",
// "Reserved") on device `dev_id` to its current value.
void DeviceMemoryStatResetPeakValue(const std::string& stat_type, int dev_id) {
  // Device stats are registered under a "Device"-prefixed key.
  const std::string key = "Device" + stat_type;
  StatRegistry::GetInstance()->ResetPeakValue(key, dev_id);
}

int64_t HostMemoryStatCurrentValue(const std::string& stat_type, int dev_id) {
return StatRegistry::GetInstance()->GetCurrentValue("Host" + stat_type,
dev_id);
Expand All @@ -108,6 +116,10 @@ void HostMemoryStatUpdate(const std::string& stat_type,
StatRegistry::GetInstance()->Update("Host" + stat_type, dev_id, increment);
}

// Reset the peak value of a host-memory stat (e.g. "Allocated",
// "Reserved") for slot `dev_id` to its current value.
void HostMemoryStatResetPeakValue(const std::string& stat_type, int dev_id) {
  // Host stats are registered under a "Host"-prefixed key.
  const std::string key = "Host" + stat_type;
  StatRegistry::GetInstance()->ResetPeakValue(key, dev_id);
}

void LogDeviceMemoryStats(const phi::Place& place, const std::string& op_name) {
if (FLAGS_log_memory_stats && phi::is_gpu_place(place)) {
VLOG(0) << "After launching op_name: " << op_name << ", "
Expand Down
23 changes: 23 additions & 0 deletions paddle/phi/core/memory/stats.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ class StatBase {
virtual int64_t GetCurrentValue() = 0;
virtual int64_t GetPeakValue() = 0;
virtual void Update(int64_t) = 0;
virtual void ResetPeakValue() = 0;

private:
DISABLE_COPY_AND_ASSIGN(StatBase);
Expand Down Expand Up @@ -112,6 +113,22 @@ class Stat : public StatBase {
}
}

// Reset the aggregated peak value so it equals the current value, and
// snap every thread-local peak down to that thread's current value so
// subsequent Update() calls recompute the peak from this point on.
// NOTE(review): this is not atomic w.r.t. a concurrent Update(); an
// update racing between the load and the stores may briefly be missed —
// presumably acceptable for memory statistics, confirm with callers.
void ResetPeakValue() override {
  int64_t current_value = GetCurrentValue();
  peak_value_.store(current_value, std::memory_order_relaxed);

  // Iterate by reference: the original `for (auto pair : ...)` copied
  // each map entry per iteration (clang-tidy performance-for-range-copy).
  auto thread_local_stats =
      ThreadDataRegistry<ThreadLocalStatType>::GetInstance()
          .GetAllThreadDataByRef();

  for (auto& pair : thread_local_stats) {
    // reference_wrapper::get() yields the thread's mutable stat object.
    pair.second.get().peak = pair.second.get().current;
  }

  VLOG(8) << "Reset peak_value to current_value = " << current_value;
}

private:
Stat() {}
~Stat() {}
Expand All @@ -128,12 +145,14 @@ int64_t DeviceMemoryStatPeakValue(const std::string& stat_type, int dev_id);
void DeviceMemoryStatUpdate(const std::string& stat_type,
int dev_id,
int64_t increment);
void DeviceMemoryStatResetPeakValue(const std::string& stat_type, int dev_id);

int64_t HostMemoryStatCurrentValue(const std::string& stat_type, int dev_id);
int64_t HostMemoryStatPeakValue(const std::string& stat_type, int dev_id);
void HostMemoryStatUpdate(const std::string& stat_type,
int dev_id,
int64_t increment);
void HostMemoryStatResetPeakValue(const std::string& stat_type, int dev_id);

void LogDeviceMemoryStats(const phi::Place& place, const std::string& op_name);

Expand Down Expand Up @@ -179,6 +198,8 @@ void LogDeviceMemoryStats(const phi::Place& place, const std::string& op_name);
DEVICE_MEMORY_STAT_FUNC(item, id, GetPeakValue)
#define DEVICE_MEMORY_STAT_UPDATE(item, id, increment) \
DEVICE_MEMORY_STAT_FUNC(item, id, Update, increment)
#define DEVICE_MEMORY_STAT_RESET_PEAK_VALUE(item, id) \
DEVICE_MEMORY_STAT_FUNC(item, id, ResetPeakValue)

#define HOST_MEMORY_STAT_FUNC(item, id, func, ...) \
[&] { \
Expand All @@ -199,6 +220,8 @@ void LogDeviceMemoryStats(const phi::Place& place, const std::string& op_name);
HOST_MEMORY_STAT_FUNC(item, id, GetPeakValue)
#define HOST_MEMORY_STAT_UPDATE(item, id, increment) \
HOST_MEMORY_STAT_FUNC(item, id, Update, increment)
#define HOST_MEMORY_STAT_RESET_PEAK_VALUE(item, id) \
HOST_MEMORY_STAT_FUNC(item, id, ResetPeakValue)

#define DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, id) \
struct DeviceMemoryStat##item##id : public ThreadLocalStatBase {}
Expand Down
62 changes: 62 additions & 0 deletions python/paddle/device/cuda/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@
'get_device_properties',
'get_device_name',
'get_device_capability',
'reset_max_memory_allocated',
'reset_max_memory_reserved',
]


Expand Down Expand Up @@ -298,6 +300,66 @@ def max_memory_reserved(device: _CudaPlaceLike | None = None) -> int:
return core.device_memory_stat_peak_value("Reserved", device_id)


def reset_max_memory_allocated(device: _CudaPlaceLike | None = None) -> None:
    '''
    Reset the peak size of GPU memory that is allocated to tensor of the given device.

    Args:
        device(paddle.CUDAPlace|int|str|None, optional): The device, the id of the device or
            the string name of device like 'gpu:x'. If device is None, the device is the current device.
            Default: None.

    Examples:
        .. code-block:: python

            >>> # doctest: +REQUIRES(env:GPU)
            >>> import paddle
            >>> paddle.device.set_device('gpu')

            >>> paddle.device.cuda.reset_max_memory_allocated(paddle.CUDAPlace(0))
            >>> paddle.device.cuda.reset_max_memory_allocated(0)
            >>> paddle.device.cuda.reset_max_memory_allocated("gpu:0")
    '''
    api_name = "paddle.device.cuda.reset_max_memory_allocated"
    # This API is meaningless on CPU-only builds; fail fast with guidance.
    if not core.is_compiled_with_cuda():
        raise ValueError(
            f"The API {api_name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API."
        )
    # Normalize CUDAPlace / int / "gpu:x" / None into a plain device id,
    # then reset the "Allocated" peak counter for that device.
    device_id = extract_cuda_device_id(device, op_name=api_name)
    core.device_memory_stat_reset_peak_value("Allocated", device_id)


def reset_max_memory_reserved(device: _CudaPlaceLike | None = None) -> None:
    '''
    Reset the peak size of GPU memory that is held by the allocator of the given device.

    Args:
        device(paddle.CUDAPlace|int|str|None, optional): The device, the id of the device or
            the string name of device like 'gpu:x'. If device is None, the device is the current device.
            Default: None.

    Examples:
        .. code-block:: python

            >>> # doctest: +REQUIRES(env:GPU)
            >>> import paddle
            >>> paddle.device.set_device('gpu')

            >>> paddle.device.cuda.reset_max_memory_reserved(paddle.CUDAPlace(0))
            >>> paddle.device.cuda.reset_max_memory_reserved(0)
            >>> paddle.device.cuda.reset_max_memory_reserved("gpu:0")
    '''
    api_name = "paddle.device.cuda.reset_max_memory_reserved"
    # This API is meaningless on CPU-only builds; fail fast with guidance.
    if not core.is_compiled_with_cuda():
        raise ValueError(
            f"The API {api_name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API."
        )
    # Normalize CUDAPlace / int / "gpu:x" / None into a plain device id,
    # then reset the "Reserved" peak counter for that device.
    device_id = extract_cuda_device_id(device, op_name=api_name)
    core.device_memory_stat_reset_peak_value("Reserved", device_id)


def memory_allocated(device: _CudaPlaceLike | None = None) -> int:
'''
Return the current size of gpu memory that is allocated to tensor of the given device.
Expand Down
30 changes: 27 additions & 3 deletions test/cpp/fluid/memory/stats_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,18 @@ class StatsTest : public ::testing::Test {
void SetFunc(
std::function<void(const std::string, int, int64_t)> update_func,
std::function<int64_t(const std::string, int)> current_value_func,
std::function<int64_t(const std::string, int)> peak_value_func) {
std::function<int64_t(const std::string, int)> peak_value_func,
std::function<void(const std::string, int)> reset_peak_value_func) {
update_func_ = update_func;
current_value_func_ = current_value_func;
peak_value_func_ = peak_value_func;
reset_peak_value_func_ = reset_peak_value_func;
}

void RunTests() {
MultiThreadReadWriteTest();
PeakValueTest();
ResetPeakValueTest();
}

private:
Expand Down Expand Up @@ -94,6 +97,18 @@ class StatsTest : public ::testing::Test {
EXPECT_EQ(peak_value_func_(stat_type_, 0), peak_value);
}

void ResetPeakValueTest() {
for (int64_t data : datas_) {
update_func_(stat_type_, 0, data);

EXPECT_GE(peak_value_func_(stat_type_, 0),
current_value_func_(stat_type_, 0));
reset_peak_value_func_(stat_type_, 0);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

在这个测试中,这一步如果reset不成功,是不是后面的检查也是会通过的?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

如果reset不成功,下一行(107行)的检查应该无法通过。

  void ResetPeakValueTest() {
    for (int64_t data : datas_) {
      update_func_(stat_type_, 0, data);

      EXPECT_GE(peak_value_func_(stat_type_, 0),
                current_value_func_(stat_type_, 0));
      // reset_peak_value_func_(stat_type_, 0);
      printf("data: %ld, Peak Value: %ld, Current Value: %ld\n",data, peak_value_func_(stat_type_, 0), current_value_func_(stat_type_, 0));
      EXPECT_EQ(peak_value_func_(stat_type_, 0),
                current_value_func_(stat_type_, 0));
    }
  }

如果将reset_peak_value_func_函数注释,测试将无法通过。下面为部分运行结果。

115: Test timeout computed to be: 10000000
115: [==========] Running 4 tests from 1 test case.
115: [----------] Global test environment set-up.
115: [----------] 4 tests from StatsTest
115: [ RUN      ] StatsTest.DeviceAllocatedTest
115: data: 543149808935355, Peak Value: 45703145873829393, Current Value: 45703145873829393
115: data: 634698327471328, Peak Value: 46337844201300721, Current Value: 46337844201300721
115: data: 706215795436611, Peak Value: 47044059996737332, Current Value: 47044059996737332
115: data: 577939367795333, Peak Value: 47621999364532665, Current Value: 47621999364532665
115: data: 419479490054362, Peak Value: 48041478854587027, Current Value: 48041478854587027
115: data: 21975227714595, Peak Value: 48063454082301622, Current Value: 48063454082301622
115: data: 812939817942250, Peak Value: 48876393900243872, Current Value: 48876393900243872
115: data: 984428837942082, Peak Value: 49860822738185954, Current Value: 49860822738185954
115: data: 537304104446806, Peak Value: 50398126842632760, Current Value: 50398126842632760
115: data: 685008544452453, Peak Value: 51083135387085213, Current Value: 51083135387085213
115: data: 563352858161268, Peak Value: 51646488245246481, Current Value: 51646488245246481
115: data: 690143831596330, Peak Value: 52336632076842811, Current Value: 52336632076842811
115: data: 964829938186077, Peak Value: 53301462015028888, Current Value: 53301462015028888
115: data: 476984078018245, Peak Value: 53778446093047133, Current Value: 53778446093047133
115: data: 804403365180177, Peak Value: 54582849458227310, Current Value: 54582849458227310
115: data: -57918691189304, Peak Value: 54582849458227310, Current Value: 54524930767038006
115: /home/aistudio/test/Paddle/test/cpp/fluid/memory/stats_test.cc:109: Failure
115: Expected equality of these values:
115:   peak_value_func_(stat_type_, 0)
115:     Which is: 54582849458227310
115:   current_value_func_(stat_type_, 0)
115:     Which is: 54524930767038006
115: data: 947611269236893, Peak Value: 55472542036274899, Current Value: 55472542036274899
115: data: 752188963801927, Peak Value: 56224731000076826, Current Value: 56224731000076826
115: data: 710946451346683, Peak Value: 56935677451423509, Current Value: 56935677451423509
115: data: -49226452527666, Peak Value: 56935677451423509, Current Value: 56886450998895843
115: /home/aistudio/test/Paddle/test/cpp/fluid/memory/stats_test.cc:109: Failure
115: Expected equality of these values:
115:   peak_value_func_(stat_type_, 0)
115:     Which is: 56935677451423509
115:   current_value_func_(stat_type_, 0)
115:     Which is: 56886450998895843
115: data: -59049377393968, Peak Value: 56935677451423509, Current Value: 56827401621501875
115: /home/aistudio/test/Paddle/test/cpp/fluid/memory/stats_test.cc:109: Failure
115: Expected equality of these values:
115:   peak_value_func_(stat_type_, 0)
115:     Which is: 56935677451423509
115:   current_value_func_(stat_type_, 0)
115:     Which is: 56827401621501875
115: data: 14128239868858, Peak Value: 56935677451423509, Current Value: 56841529861370733
115: /home/aistudio/test/Paddle/test/cpp/fluid/memory/stats_test.cc:109: Failure
115: Expected equality of these values:
115:   peak_value_func_(stat_type_, 0)
115:     Which is: 56935677451423509
115:   current_value_func_(stat_type_, 0)
115:     Which is: 56841529861370733
115: data: 463298869064035, Peak Value: 57304828730434768, Current Value: 57304828730434768

EXPECT_EQ(peak_value_func_(stat_type_, 0),
current_value_func_(stat_type_, 0));
}
}

std::string stat_type_;
std::vector<int64_t> datas_{
543149808935355, 634698327471328, 706215795436611, 577939367795333,
Expand Down Expand Up @@ -125,13 +140,15 @@ class StatsTest : public ::testing::Test {
std::function<void(const std::string, int, int64_t)> update_func_;
std::function<int64_t(const std::string, int)> current_value_func_;
std::function<int64_t(const std::string, int)> peak_value_func_;
std::function<void(const std::string, int)> reset_peak_value_func_;
};

TEST_F(StatsTest, DeviceAllocatedTest) {
SetStatType("Allocated");
SetFunc(DeviceMemoryStatUpdate,
DeviceMemoryStatCurrentValue,
DeviceMemoryStatPeakValue);
DeviceMemoryStatPeakValue,
DeviceMemoryStatResetPeakValue);
RunTests();
}

Expand All @@ -146,6 +163,9 @@ TEST_F(StatsTest, DeviceReservedMacroTest) {
},
[](const std::string stat_type, int id) {
return DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, id);
},
[](const std::string stat_type, int id) {
return DEVICE_MEMORY_STAT_RESET_PEAK_VALUE(Reserved, id);
});
RunTests();
}
Expand All @@ -161,6 +181,9 @@ TEST_F(StatsTest, HostAllocatedMacroTest) {
},
[](const std::string stat_type, int id) {
return HOST_MEMORY_STAT_PEAK_VALUE(Allocated, id);
},
[](const std::string stat_type, int id) {
return HOST_MEMORY_STAT_RESET_PEAK_VALUE(Allocated, id);
});
RunTests();
}
Expand All @@ -169,7 +192,8 @@ TEST_F(StatsTest, HostReservedTest) {
SetStatType("Reserved");
SetFunc(HostMemoryStatUpdate,
HostMemoryStatCurrentValue,
HostMemoryStatPeakValue);
HostMemoryStatPeakValue,
HostMemoryStatResetPeakValue);
RunTests();
}

Expand Down
89 changes: 89 additions & 0 deletions test/legacy_test/test_cuda_reset_max_memory_allocated.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import paddle
from paddle.base import core
from paddle.device.cuda import (
device_count,
max_memory_allocated,
memory_allocated,
reset_max_memory_allocated,
)


class TestResetMaxMemoryAllocated(unittest.TestCase):
    # Unit tests for paddle.device.cuda.reset_max_memory_allocated.
    # All tests are no-ops on CPU-only builds except the exception test,
    # which then checks that calling the API raises ValueError.

    def func_test_reset_max_memory_allocated(self, device=None):
        # Helper (not auto-discovered: no "test_" prefix) that exercises
        # one device identifier form (CUDAPlace / int / "gpu:x" string).
        if core.is_compiled_with_cuda():
            alloc_time = 100
            max_alloc_size = 10000
            for i in range(alloc_time):
                # first alloc
                # Large allocation establishes a high peak; shape is a
                # 0-d random tensor in [max_alloc_size, 2*max_alloc_size).
                shape = paddle.randint(
                    low=max_alloc_size, high=max_alloc_size * 2
                )
                tensor = paddle.zeros(shape)
                peak_memory_allocated_size_first = max_memory_allocated(device)

                del shape
                del tensor

                # second alloc
                # Strictly smaller allocation, so after the reset the new
                # peak must be below the first peak.
                shape = paddle.randint(low=0, high=max_alloc_size)
                tensor = paddle.zeros(shape)

                # reset peak memory stats
                reset_max_memory_allocated(device)

                # After reset: peak == current allocated, and the new peak
                # is less than the pre-reset (large-alloc) peak.
                peak_memory_allocated_size_second = max_memory_allocated(device)
                self.assertEqual(
                    peak_memory_allocated_size_second, memory_allocated(device)
                )
                self.assertLess(
                    peak_memory_allocated_size_second,
                    peak_memory_allocated_size_first,
                )

                del shape
                del tensor

    def test_reset_max_memory_allocated_for_all_places(self):
        # Run the helper on every visible GPU, covering all three
        # accepted device-identifier forms.
        if core.is_compiled_with_cuda():
            gpu_num = device_count()
            for i in range(gpu_num):
                paddle.device.set_device("gpu:" + str(i))
                self.func_test_reset_max_memory_allocated(core.CUDAPlace(i))
                self.func_test_reset_max_memory_allocated(i)
                self.func_test_reset_max_memory_allocated("gpu:" + str(i))

    def test_reset_max_memory_allocated_exception(self):
        # Invalid device identifiers must raise; exact exception type
        # varies per input (hence BaseException with the B017 waiver).
        if core.is_compiled_with_cuda():
            wrong_device = [
                core.CPUPlace(),
                device_count() + 1,
                -2,
                0.5,
                "gpu1",
            ]
            for device in wrong_device:
                with self.assertRaises(BaseException):  # noqa: B017
                    reset_max_memory_allocated(device)
        else:
            # CPU-only build: the API itself must raise ValueError.
            with self.assertRaises(ValueError):
                reset_max_memory_allocated()


if __name__ == "__main__":
unittest.main()
Loading