Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

【Hackathon 7th No.21】为 Paddle 新增 reset_max_memory_reserved/reset_max_memory_allocated API -part #70032

Merged
merged 9 commits into from
Dec 17, 2024
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions paddle/fluid/pybind/pybind.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2424,8 +2424,26 @@ All parameter, weight, gradient are variables in Paddle.
m.def("device_memory_stat_current_value",
memory::DeviceMemoryStatCurrentValue);
m.def("device_memory_stat_peak_value", memory::DeviceMemoryStatPeakValue);
m.def("device_memory_stat_reset_peak_value",
memory::DeviceMemoryStatResetPeakValue);

m.def("device_memory_stats", [](int dev_id) {
py::dict dict;
dict["memory.allocated.current"] =
memory::DeviceMemoryStatCurrentValue("Allocated", dev_id);
dict["memory.allocated.peak"] =
memory::DeviceMemoryStatPeakValue("Allocated", dev_id);
dict["memory.reserved.current"] =
memory::DeviceMemoryStatCurrentValue("Reserved", dev_id);
dict["memory.reserved.peak"] =
memory::DeviceMemoryStatPeakValue("Reserved", dev_id);
return dict;
});

m.def("host_memory_stat_current_value", memory::HostMemoryStatCurrentValue);
m.def("host_memory_stat_peak_value", memory::HostMemoryStatPeakValue);
m.def("host_memory_stat_reset_peak_value",
memory::HostMemoryStatResetPeakValue);
m.def(
"run_cmd",
[](const std::string &cmd,
Expand Down
12 changes: 12 additions & 0 deletions paddle/phi/core/memory/stats.cc
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,10 @@ class StatRegistry {
GetStat(stat_type, dev_id)->Update(increment);
}

// Rewind the peak value of the stat identified by (stat_type, dev_id)
// back to its current value.
void ResetPeakValue(const std::string& stat_type, int dev_id) {
  auto* stat = GetStat(stat_type, dev_id);
  stat->ResetPeakValue();
}

void Register(const std::string& stat_type, int dev_id, StatBase* stat) {
std::lock_guard<SpinLock> lock_guard(stat_map_lock_);
stat_map_[GetStatKey(stat_type, dev_id)] = stat;
Expand Down Expand Up @@ -93,6 +97,10 @@ void DeviceMemoryStatUpdate(const std::string& stat_type,
StatRegistry::GetInstance()->Update("Device" + stat_type, dev_id, increment);
}

// Reset the peak value of a device memory stat (e.g. stat_type
// "Allocated" or "Reserved") for the given device id.
void DeviceMemoryStatResetPeakValue(const std::string& stat_type, int dev_id) {
  const std::string full_name = "Device" + stat_type;
  StatRegistry::GetInstance()->ResetPeakValue(full_name, dev_id);
}

int64_t HostMemoryStatCurrentValue(const std::string& stat_type, int dev_id) {
return StatRegistry::GetInstance()->GetCurrentValue("Host" + stat_type,
dev_id);
Expand All @@ -108,6 +116,10 @@ void HostMemoryStatUpdate(const std::string& stat_type,
StatRegistry::GetInstance()->Update("Host" + stat_type, dev_id, increment);
}

// Reset the peak value of a host memory stat (e.g. stat_type
// "Allocated" or "Reserved") for the given device id.
void HostMemoryStatResetPeakValue(const std::string& stat_type, int dev_id) {
  const std::string full_name = "Host" + stat_type;
  StatRegistry::GetInstance()->ResetPeakValue(full_name, dev_id);
}

void LogDeviceMemoryStats(const phi::Place& place, const std::string& op_name) {
if (FLAGS_log_memory_stats && phi::is_gpu_place(place)) {
VLOG(0) << "After launching op_name: " << op_name << ", "
Expand Down
23 changes: 23 additions & 0 deletions paddle/phi/core/memory/stats.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ class StatBase {
virtual int64_t GetCurrentValue() = 0;
virtual int64_t GetPeakValue() = 0;
virtual void Update(int64_t) = 0;
virtual void ResetPeakValue() = 0;

private:
DISABLE_COPY_AND_ASSIGN(StatBase);
Expand Down Expand Up @@ -112,6 +113,22 @@ class Stat : public StatBase {
}
}

// Rewind the tracked peak value to the current value: the global
// atomic peak is overwritten, and every registered thread-local peak
// is set to that thread's current value, so later peak queries only
// reflect updates made after this call.
// NOTE(review): this is not atomic with respect to concurrent
// Update() calls — a racing update between the global store and the
// per-thread loop may be partially reflected; confirm callers only
// reset at quiescent points.
void ResetPeakValue() override {
  const int64_t current_value = GetCurrentValue();
  peak_value_.store(current_value, std::memory_order_relaxed);

  std::unordered_map<uint64_t, std::reference_wrapper<ThreadLocalStatType>>
      thread_local_stats =
          ThreadDataRegistry<ThreadLocalStatType>::GetInstance()
              .GetAllThreadDataByRef();

  // Iterate by const reference: `for (auto pair : ...)` copied the
  // map entry on every iteration (clang-tidy performance-for-range-copy).
  for (const auto& pair : thread_local_stats) {
    pair.second.get().peak = pair.second.get().current;
  }

  VLOG(8) << "Reset peak_value to current_value = " << current_value;
}

private:
Stat() {}
~Stat() {}
Expand All @@ -128,12 +145,14 @@ int64_t DeviceMemoryStatPeakValue(const std::string& stat_type, int dev_id);
void DeviceMemoryStatUpdate(const std::string& stat_type,
int dev_id,
int64_t increment);
void DeviceMemoryStatResetPeakValue(const std::string& stat_type, int dev_id);

int64_t HostMemoryStatCurrentValue(const std::string& stat_type, int dev_id);
int64_t HostMemoryStatPeakValue(const std::string& stat_type, int dev_id);
void HostMemoryStatUpdate(const std::string& stat_type,
int dev_id,
int64_t increment);
void HostMemoryStatResetPeakValue(const std::string& stat_type, int dev_id);

void LogDeviceMemoryStats(const phi::Place& place, const std::string& op_name);

Expand Down Expand Up @@ -179,6 +198,8 @@ void LogDeviceMemoryStats(const phi::Place& place, const std::string& op_name);
DEVICE_MEMORY_STAT_FUNC(item, id, GetPeakValue)
#define DEVICE_MEMORY_STAT_UPDATE(item, id, increment) \
DEVICE_MEMORY_STAT_FUNC(item, id, Update, increment)
#define DEVICE_MEMORY_STAT_RESET_PEAK_VALUE(item, id) \
DEVICE_MEMORY_STAT_FUNC(item, id, ResetPeakValue)

#define HOST_MEMORY_STAT_FUNC(item, id, func, ...) \
[&] { \
Expand All @@ -199,6 +220,8 @@ void LogDeviceMemoryStats(const phi::Place& place, const std::string& op_name);
HOST_MEMORY_STAT_FUNC(item, id, GetPeakValue)
#define HOST_MEMORY_STAT_UPDATE(item, id, increment) \
HOST_MEMORY_STAT_FUNC(item, id, Update, increment)
#define HOST_MEMORY_STAT_RESET_PEAK_VALUE(item, id) \
HOST_MEMORY_STAT_FUNC(item, id, ResetPeakValue)

#define DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, id) \
struct DeviceMemoryStat##item##id : public ThreadLocalStatBase {}
Expand Down
140 changes: 140 additions & 0 deletions python/paddle/device/cuda/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,10 @@
'get_device_properties',
'get_device_name',
'get_device_capability',
'reset_peak_memory_stats',
'reset_max_memory_allocated',
'reset_max_memory_reserved',
'memory_stats',
]


Expand Down Expand Up @@ -298,6 +302,142 @@ def max_memory_reserved(device: _CudaPlaceLike | None = None) -> int:
return core.device_memory_stat_peak_value("Reserved", device_id)


def reset_peak_memory_stats(device: _CudaPlaceLike | None = None) -> None:
    '''
    Reset the peak values of GPU memory allocated and reserved to the current values.

    The allocator tracks peak statistics for both allocated and reserved
    memory. This call rewinds both peaks so that they equal the current
    usage; subsequent peak queries only reflect activity after this call.

    Args:
        device(paddle.CUDAPlace|int|str|None, optional): The device, the id of the device or
            the string name of device like 'gpu:x'. If device is None, the device is the current device.
            Default: None.

    Examples:
        .. code-block:: python

            >>> # doctest: +REQUIRES(env:GPU)
            >>> import paddle
            >>> paddle.device.set_device('gpu')

            >>> paddle.device.cuda.reset_peak_memory_stats(paddle.CUDAPlace(0))
            >>> paddle.device.cuda.reset_peak_memory_stats(0)
            >>> paddle.device.cuda.reset_peak_memory_stats("gpu:0")
    '''
    api_name = "paddle.device.cuda.reset_peak_memory_stats"
    if not core.is_compiled_with_cuda():
        raise ValueError(
            f"The API {api_name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API."
        )
    dev_id = extract_cuda_device_id(device, op_name=api_name)
    # Rewind both per-device peak statistics.
    for stat_type in ("Allocated", "Reserved"):
        core.device_memory_stat_reset_peak_value(stat_type, dev_id)


def reset_max_memory_allocated(device: _CudaPlaceLike | None = None) -> None:
    '''
    Reset the peak size of GPU memory that is allocated to tensor of the given device.

    After this call the peak equals the current allocated size, so
    ``max_memory_allocated`` reflects only allocations made afterwards.

    Note:
        Allocated memory refers to the GPU memory that is currently allocated to tensors.

    Args:
        device(paddle.CUDAPlace|int|str|None, optional): The device, the id of the device or
            the string name of device like 'gpu:x'. If device is None, the device is the current device.
            Default: None.

    Examples:
        .. code-block:: python

            >>> # doctest: +REQUIRES(env:GPU)
            >>> import paddle
            >>> paddle.device.set_device('gpu')

            >>> paddle.device.cuda.reset_max_memory_allocated(paddle.CUDAPlace(0))
            >>> paddle.device.cuda.reset_max_memory_allocated(0)
            >>> paddle.device.cuda.reset_max_memory_allocated("gpu:0")
    '''

    name = "paddle.device.cuda.reset_max_memory_allocated"
    if not core.is_compiled_with_cuda():
        raise ValueError(
            f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API."
        )
    device_id = extract_cuda_device_id(device, op_name=name)
    core.device_memory_stat_reset_peak_value("Allocated", device_id)


def reset_max_memory_reserved(device: _CudaPlaceLike | None = None) -> None:
    '''
    Reset the peak values of GPU memory reserved to the current values.
    Reserved memory refers to the GPU memory that is held by the allocator of the given device.

    Args:
        device(paddle.CUDAPlace|int|str|None, optional): The device, the id of the device or
            the string name of device like 'gpu:x'. If device is None, the device is the current device.
            Default: None.

    Examples:
        .. code-block:: python

            >>> # doctest: +REQUIRES(env:GPU)
            >>> import paddle
            >>> paddle.device.set_device('gpu')

            >>> paddle.device.cuda.reset_max_memory_reserved(paddle.CUDAPlace(0))
            >>> paddle.device.cuda.reset_max_memory_reserved(0)
            >>> paddle.device.cuda.reset_max_memory_reserved("gpu:0")
    '''

    api_name = "paddle.device.cuda.reset_max_memory_reserved"
    if not core.is_compiled_with_cuda():
        raise ValueError(
            f"The API {api_name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API."
        )
    dev_id = extract_cuda_device_id(device, op_name=api_name)
    # Only the "Reserved" peak is rewound; "Allocated" is left untouched.
    core.device_memory_stat_reset_peak_value("Reserved", dev_id)


def memory_stats(device: _CudaPlaceLike | None = None) -> dict:
    '''
    Return the memory stats of the given device.

    Args:
        device(paddle.CUDAPlace|int|str|None, optional): The device, the id of the device or
            the string name of device like 'gpu:x'. If device is None, the device is the current device.
            Default: None.

    Return:
        dict: The current memory stats of the given device, in bytes, under the keys:

            - ``memory.allocated.current``: current size of GPU memory allocated to tensor.
            - ``memory.allocated.peak``: peak size of GPU memory allocated to tensor.
            - ``memory.reserved.current``: current size of GPU memory held by the allocator.
            - ``memory.reserved.peak``: peak size of GPU memory held by the allocator.

    Examples:
        .. code-block:: python

            >>> # doctest: +REQUIRES(env:GPU)
            >>> import paddle
            >>> paddle.device.set_device('gpu')

            >>> memory_stats = paddle.device.cuda.memory_stats(paddle.CUDAPlace(0))
            >>> memory_stats = paddle.device.cuda.memory_stats(0)
            >>> memory_stats = paddle.device.cuda.memory_stats("gpu:0")
    '''

    api_name = "paddle.device.cuda.memory_stats"
    if not core.is_compiled_with_cuda():
        raise ValueError(
            f"The API {api_name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API."
        )
    dev_id = extract_cuda_device_id(device, op_name=api_name)
    return core.device_memory_stats(dev_id)


def memory_allocated(device: _CudaPlaceLike | None = None) -> int:
'''
Return the current size of gpu memory that is allocated to tensor of the given device.
Expand Down
30 changes: 27 additions & 3 deletions test/cpp/fluid/memory/stats_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,18 @@ class StatsTest : public ::testing::Test {
void SetFunc(
std::function<void(const std::string, int, int64_t)> update_func,
std::function<int64_t(const std::string, int)> current_value_func,
std::function<int64_t(const std::string, int)> peak_value_func) {
std::function<int64_t(const std::string, int)> peak_value_func,
std::function<void(const std::string, int)> reset_peak_value_func) {
update_func_ = update_func;
current_value_func_ = current_value_func;
peak_value_func_ = peak_value_func;
reset_peak_value_func_ = reset_peak_value_func;
}

// Run the three stat checks in sequence: concurrent read/write,
// peak tracking, and peak reset.
void RunTests() {
MultiThreadReadWriteTest();
PeakValueTest();
ResetPeakValueTest();
}

private:
Expand Down Expand Up @@ -94,6 +97,18 @@ class StatsTest : public ::testing::Test {
EXPECT_EQ(peak_value_func_(stat_type_, 0), peak_value);
}

void ResetPeakValueTest() {
for (int64_t data : datas_) {
update_func_(stat_type_, 0, data);

EXPECT_GE(peak_value_func_(stat_type_, 0),
current_value_func_(stat_type_, 0));
reset_peak_value_func_(stat_type_, 0);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

在这个测试中,这一步如果reset不成功,是不是后面的检查也是会通过的?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

如果reset不成功,下一行(107行)的检查应该无法通过。

  void ResetPeakValueTest() {
    for (int64_t data : datas_) {
      update_func_(stat_type_, 0, data);

      EXPECT_GE(peak_value_func_(stat_type_, 0),
                current_value_func_(stat_type_, 0));
      // reset_peak_value_func_(stat_type_, 0);
      printf("data: %ld, Peak Value: %ld, Current Value: %ld\n",data, peak_value_func_(stat_type_, 0), current_value_func_(stat_type_, 0));
      EXPECT_EQ(peak_value_func_(stat_type_, 0),
                current_value_func_(stat_type_, 0));
    }
  }

如果将reset_peak_value_func_函数注释,测试将无法通过。下面为部分运行结果。

115: Test timeout computed to be: 10000000
115: [==========] Running 4 tests from 1 test case.
115: [----------] Global test environment set-up.
115: [----------] 4 tests from StatsTest
115: [ RUN      ] StatsTest.DeviceAllocatedTest
115: data: 543149808935355, Peak Value: 45703145873829393, Current Value: 45703145873829393
115: data: 634698327471328, Peak Value: 46337844201300721, Current Value: 46337844201300721
115: data: 706215795436611, Peak Value: 47044059996737332, Current Value: 47044059996737332
115: data: 577939367795333, Peak Value: 47621999364532665, Current Value: 47621999364532665
115: data: 419479490054362, Peak Value: 48041478854587027, Current Value: 48041478854587027
115: data: 21975227714595, Peak Value: 48063454082301622, Current Value: 48063454082301622
115: data: 812939817942250, Peak Value: 48876393900243872, Current Value: 48876393900243872
115: data: 984428837942082, Peak Value: 49860822738185954, Current Value: 49860822738185954
115: data: 537304104446806, Peak Value: 50398126842632760, Current Value: 50398126842632760
115: data: 685008544452453, Peak Value: 51083135387085213, Current Value: 51083135387085213
115: data: 563352858161268, Peak Value: 51646488245246481, Current Value: 51646488245246481
115: data: 690143831596330, Peak Value: 52336632076842811, Current Value: 52336632076842811
115: data: 964829938186077, Peak Value: 53301462015028888, Current Value: 53301462015028888
115: data: 476984078018245, Peak Value: 53778446093047133, Current Value: 53778446093047133
115: data: 804403365180177, Peak Value: 54582849458227310, Current Value: 54582849458227310
115: data: -57918691189304, Peak Value: 54582849458227310, Current Value: 54524930767038006
115: /home/aistudio/test/Paddle/test/cpp/fluid/memory/stats_test.cc:109: Failure
115: Expected equality of these values:
115:   peak_value_func_(stat_type_, 0)
115:     Which is: 54582849458227310
115:   current_value_func_(stat_type_, 0)
115:     Which is: 54524930767038006
115: data: 947611269236893, Peak Value: 55472542036274899, Current Value: 55472542036274899
115: data: 752188963801927, Peak Value: 56224731000076826, Current Value: 56224731000076826
115: data: 710946451346683, Peak Value: 56935677451423509, Current Value: 56935677451423509
115: data: -49226452527666, Peak Value: 56935677451423509, Current Value: 56886450998895843
115: /home/aistudio/test/Paddle/test/cpp/fluid/memory/stats_test.cc:109: Failure
115: Expected equality of these values:
115:   peak_value_func_(stat_type_, 0)
115:     Which is: 56935677451423509
115:   current_value_func_(stat_type_, 0)
115:     Which is: 56886450998895843
115: data: -59049377393968, Peak Value: 56935677451423509, Current Value: 56827401621501875
115: /home/aistudio/test/Paddle/test/cpp/fluid/memory/stats_test.cc:109: Failure
115: Expected equality of these values:
115:   peak_value_func_(stat_type_, 0)
115:     Which is: 56935677451423509
115:   current_value_func_(stat_type_, 0)
115:     Which is: 56827401621501875
115: data: 14128239868858, Peak Value: 56935677451423509, Current Value: 56841529861370733
115: /home/aistudio/test/Paddle/test/cpp/fluid/memory/stats_test.cc:109: Failure
115: Expected equality of these values:
115:   peak_value_func_(stat_type_, 0)
115:     Which is: 56935677451423509
115:   current_value_func_(stat_type_, 0)
115:     Which is: 56841529861370733
115: data: 463298869064035, Peak Value: 57304828730434768, Current Value: 57304828730434768

EXPECT_EQ(peak_value_func_(stat_type_, 0),
current_value_func_(stat_type_, 0));
}
}

std::string stat_type_;
std::vector<int64_t> datas_{
543149808935355, 634698327471328, 706215795436611, 577939367795333,
Expand Down Expand Up @@ -125,13 +140,15 @@ class StatsTest : public ::testing::Test {
std::function<void(const std::string, int, int64_t)> update_func_;
std::function<int64_t(const std::string, int)> current_value_func_;
std::function<int64_t(const std::string, int)> peak_value_func_;
std::function<void(const std::string, int)> reset_peak_value_func_;
};

// Exercise the device "Allocated" stat through the direct function
// API (as opposed to the macro API used by the *MacroTest cases).
// NOTE: the scraped diff showed both the pre-change 3-argument
// SetFunc call and the post-change 4-argument one; only the
// post-change version is kept here.
TEST_F(StatsTest, DeviceAllocatedTest) {
  SetStatType("Allocated");
  SetFunc(DeviceMemoryStatUpdate,
          DeviceMemoryStatCurrentValue,
          DeviceMemoryStatPeakValue,
          DeviceMemoryStatResetPeakValue);
  RunTests();
}

Expand All @@ -146,6 +163,9 @@ TEST_F(StatsTest, DeviceReservedMacroTest) {
},
[](const std::string stat_type, int id) {
return DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, id);
},
[](const std::string stat_type, int id) {
return DEVICE_MEMORY_STAT_RESET_PEAK_VALUE(Reserved, id);
});
RunTests();
}
Expand All @@ -161,6 +181,9 @@ TEST_F(StatsTest, HostAllocatedMacroTest) {
},
[](const std::string stat_type, int id) {
return HOST_MEMORY_STAT_PEAK_VALUE(Allocated, id);
},
[](const std::string stat_type, int id) {
return HOST_MEMORY_STAT_RESET_PEAK_VALUE(Allocated, id);
});
RunTests();
}
Expand All @@ -169,7 +192,8 @@ TEST_F(StatsTest, HostReservedTest) {
SetStatType("Reserved");
SetFunc(HostMemoryStatUpdate,
HostMemoryStatCurrentValue,
HostMemoryStatPeakValue);
HostMemoryStatPeakValue,
HostMemoryStatResetPeakValue);
RunTests();
}

Expand Down
Loading