From b5813298b417ada8019f728ec62ad2780e9c9186 Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Wed, 24 Mar 2021 16:18:08 -0700
Subject: [PATCH 1/4] see_memory_usage fixes

---
 deepspeed/runtime/utils.py         | 7 +++++++
 docs/_tutorials/getting-started.md | 4 ++--
 docs/_tutorials/pipeline.md        | 2 +-
 3 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/deepspeed/runtime/utils.py b/deepspeed/runtime/utils.py
index bfacc0af512a..9bffd49d26b6 100755
--- a/deepspeed/runtime/utils.py
+++ b/deepspeed/runtime/utils.py
@@ -8,6 +8,7 @@
 
 import os
 import psutil
+import gc
 from math import ceil
 from math import floor
 from bisect import bisect_left, bisect_right
@@ -564,6 +565,12 @@ def see_memory_usage(message, force=False):
     logger.info(
         f'CPU Virtual Memory: used = {used_GB} GB, percent = {vm_stats.percent}%')
 
+    # reset for the next call
+    # 1. get the peak memory to report correct data
+    torch.cuda.reset_peak_memory_stats()
+    # 2. python doesn't do real-time garbage collection so do it explicitly
+    gc.collect()
+
 
 def call_to_str(base, *args, **kwargs):
     """Construct a string representation of a call.
diff --git a/docs/_tutorials/getting-started.md b/docs/_tutorials/getting-started.md
index e12388aaf973..e9b9aa0e627e 100644
--- a/docs/_tutorials/getting-started.md
+++ b/docs/_tutorials/getting-started.md
@@ -265,8 +265,8 @@ local machine to discover the number of slots available.
 The `--include` and `--exclude` arguments work as normal, but the user should
 specify 'localhost' as the hostname.
 
-Also note that `CUDA_VISIBLE_DEVICES` can't be used with DeepSpeed to control
-which devices should be used. For example, to use only gpu1 of the current
+Also note that `CUDA_VISIBLE_DEVICES` can't be used with DeepSpeed to control
+which devices should be used. For example, to use only gpu1 of the current
 node, do:
 ```bash
 deepspeed --include localhost:1 ...
 ```
diff --git a/docs/_tutorials/pipeline.md b/docs/_tutorials/pipeline.md
index 0d847ea18752..1751846830ef 100644
--- a/docs/_tutorials/pipeline.md
+++ b/docs/_tutorials/pipeline.md
@@ -277,7 +277,7 @@ For example, a machine with 16 GPUs must have as much local CPU memory as 16 tim
 DeepSpeed provides a `LayerSpec` class that delays the construction of
 modules until the model layers have been partitioned across workers. Then
 each worker will allocate only the layers it's assigned to. So, comparing to the
-example from the previous paragraph, using `LayerSpec` a machine with 16 GPUs will need to
+example from the previous paragraph, using `LayerSpec` a machine with 16 GPUs will need to
 allocate a total of 1x model size on its CPU memory and not 16x.
 
 Here is an example of the abbreviated AlexNet model, but expressed only
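Why both housekeeping steps in the `utils.py` hunk matter: `torch.cuda.max_memory_allocated()` reports the peak since the counters were last reset, so without `reset_peak_memory_stats()` every later `see_memory_usage()` call keeps echoing an old high-water mark, and without an explicit `gc.collect()` the `psutil` CPU numbers can still include memory held by not-yet-collected Python objects. A minimal standalone sketch of the reset effect (requires a CUDA device; the helper name and tensor sizes below are illustrative and not part of the patch):

```python
import gc
import torch

def report_peak(tag):
    # max_memory_allocated() is the high-water mark since the last reset,
    # so after a reset each report covers only its own window.
    print(f"{tag}: peak allocated {torch.cuda.max_memory_allocated() / 2**30:.2f} GB")
    if hasattr(torch.cuda, "reset_peak_memory_stats"):  # pytorch 1.4+
        torch.cuda.reset_peak_memory_stats()
    gc.collect()  # force collection of reference cycles before the next measurement

big = torch.empty(1024, 1024, 256, device="cuda")  # 1 GiB of fp32
report_peak("after big allocation")    # ~1.00 GB
del big
small = torch.empty(1024, device="cuda")
report_peak("after small allocation")  # ~0.00 GB, only because the peak was reset above
```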
From d0807b238389f426bad603c68fb988e7e3a2bc69 Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Wed, 24 Mar 2021 17:15:38 -0700
Subject: [PATCH 2/4] didn't expect pt-1.2

---
 deepspeed/runtime/utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/deepspeed/runtime/utils.py b/deepspeed/runtime/utils.py
index 9bffd49d26b6..9129356167f5 100755
--- a/deepspeed/runtime/utils.py
+++ b/deepspeed/runtime/utils.py
@@ -567,7 +567,8 @@ def see_memory_usage(message, force=False):
 
     # reset for the next call
     # 1. get the peak memory to report correct data
-    torch.cuda.reset_peak_memory_stats()
+    if hasattr(torch.cuda, "reset_peak_memory_stats"):  # pytorch 1.4+
+        torch.cuda.reset_peak_memory_stats()
     # 2. python doesn't do real-time garbage collection so do it explicitly
     gc.collect()
 

From 072e4ab64db877e62b7107c8d8888b8c8f7dbd8d Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Wed, 24 Mar 2021 17:24:13 -0700
Subject: [PATCH 3/4] fix the order of things

---
 deepspeed/runtime/utils.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/deepspeed/runtime/utils.py b/deepspeed/runtime/utils.py
index 9129356167f5..201d6b0edb17 100755
--- a/deepspeed/runtime/utils.py
+++ b/deepspeed/runtime/utils.py
@@ -560,17 +560,17 @@ def see_memory_usage(message, force=False):
         CA {round(torch.cuda.memory_cached() / (1024 * 1024 * 1024),2)} GB \
         Max_CA {round(torch.cuda.max_memory_cached() / (1024 * 1024 * 1024))} GB ")
 
+    # python doesn't do real-time garbage collection so do it explicitly to get the correct RAM reports
+    gc.collect()
+
     vm_stats = psutil.virtual_memory()
     used_GB = round(((vm_stats.total - vm_stats.available) / (1024**3)), 2)
     logger.info(
         f'CPU Virtual Memory: used = {used_GB} GB, percent = {vm_stats.percent}%')
 
-    # reset for the next call
-    # 1. get the peak memory to report correct data
+    # get the peak memory to report correct data, so reset the counter for the next call
     if hasattr(torch.cuda, "reset_peak_memory_stats"):  # pytorch 1.4+
         torch.cuda.reset_peak_memory_stats()
-    # 2. python doesn't do real-time garbage collection so do it explicitly
-    gc.collect()
 
 
 def call_to_str(base, *args, **kwargs):
     """Construct a string representation of a call.

From 9e436f7a6b7e9d7c8acbfa7d1e377b133c759812 Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Wed, 24 Mar 2021 17:27:23 -0700
Subject: [PATCH 4/4] fix the order of things

---
 deepspeed/runtime/utils.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/deepspeed/runtime/utils.py b/deepspeed/runtime/utils.py
index 201d6b0edb17..b1a7a4b0aae1 100755
--- a/deepspeed/runtime/utils.py
+++ b/deepspeed/runtime/utils.py
@@ -552,6 +552,9 @@ def see_memory_usage(message, force=False):
     if torch.distributed.is_initialized() and not torch.distributed.get_rank() == 0:
         return
 
+    # python doesn't do real-time garbage collection so do it explicitly to get the correct RAM reports
+    gc.collect()
+
     # Print message except when distributed but not rank 0
     logger.info(message)
     logger.info(
@@ -560,9 +563,6 @@ def see_memory_usage(message, force=False):
         CA {round(torch.cuda.memory_cached() / (1024 * 1024 * 1024),2)} GB \
         Max_CA {round(torch.cuda.max_memory_cached() / (1024 * 1024 * 1024))} GB ")
 
-    # python doesn't do real-time garbage collection so do it explicitly to get the correct RAM reports
-    gc.collect()
-
     vm_stats = psutil.virtual_memory()
     used_GB = round(((vm_stats.total - vm_stats.available) / (1024**3)), 2)
     logger.info(
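For reference, this is roughly the shape of `see_memory_usage()` once the whole series is applied, assembled from the hunks above: collect garbage before anything is measured, report, then reset the CUDA peak counters so the next call covers only its own interval. A minimal sketch, with the GPU-stats f-string elided, a stdlib logger standing in for DeepSpeed's own, and the `if not force` early exit inferred from the `force=False` signature rather than shown in the hunks:

```python
import gc
import logging

import psutil
import torch

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)  # stand-in for DeepSpeed's logger

def see_memory_usage(message, force=False):
    if not force:
        return
    if torch.distributed.is_initialized() and not torch.distributed.get_rank() == 0:
        return

    # python doesn't do real-time garbage collection so do it explicitly
    # to get the correct RAM reports
    gc.collect()

    # Print message except when distributed but not rank 0
    logger.info(message)
    # ... GPU MA / Max_MA / CA / Max_CA report elided (see the hunks above) ...

    vm_stats = psutil.virtual_memory()
    used_GB = round(((vm_stats.total - vm_stats.available) / (1024**3)), 2)
    logger.info(
        f'CPU Virtual Memory: used = {used_GB} GB, percent = {vm_stats.percent}%')

    # get the peak memory to report correct data, so reset the counter for the next call
    if hasattr(torch.cuda, "reset_peak_memory_stats"):  # pytorch 1.4+
        torch.cuda.reset_peak_memory_stats()
```

With this ordering the CPU figures never include memory that merely hasn't been collected yet, and the Max_MA/Max_CA values of the following call describe only the window since this one.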