From 3ef00b8840c05c49118705f6fd9663ebb951f3a1 Mon Sep 17 00:00:00 2001
From: Andrei Ivanov
Date: Thu, 16 Jul 2020 16:57:58 -0700
Subject: [PATCH] Refactoring of Pooled Storage Manager classes (#18582)

* Refactoring of Pooled Storage Manager classes
* Adding test for new functionality
* Fixing compilation problems which appear for MXNET_USE_CUDA=0
* Fixing compilation problems for WINDOWS and ANDROID
* Fixing compilation problems which appear for WINDOWS and __APPLE__
* Fixing lint problems
* test_dataloader_context(): Bypassing custom_dev_id pinned mem test on system with GPUs < 2.
* Fixing compilation for Android. Elimination of unused includes.
* Fixing problems with CPUPinned Storage Manager which appear when MXNET_USE_CUDA = 0
* Removing test_bucketing.py
* Improving CPU_Pinned Pooled Storage Manager case.
* Fixing lint problem
* The GPU profiling command calls moved into the mutex area
* Fixing lint problem
* Improved reporting regarding the Storage Manager used.
* Fixing lint problem
* Trigger CI
* Removing some comments, as suggested by @szha
* Trigger CI
* Trigger CI

Co-authored-by: andreii
---
 docs/static_site/src/pages/api/faq/env_var.md | 67 +-
 src/profiler/storage_profiler.h | 20 +-
 src/storage/cpu_device_storage.h | 11 +-
 src/storage/cpu_shared_storage_manager.h | 16 +-
 src/storage/gpu_device_storage.h | 37 +-
 src/storage/naive_storage_manager.h | 1 -
 src/storage/pinned_memory_storage.h | 20 +-
 src/storage/pooled_storage_manager.h | 590 ++++++++++--------
 src/storage/storage.cc | 269 +++++---
 src/storage/storage_manager.h | 2 +-
 src/storage/storage_manager_helpers.h | 162 +++++
 tests/python/unittest/test_gluon_data.py | 15 +-
 12 files changed, 732 insertions(+), 478 deletions(-)
 create mode 100644 src/storage/storage_manager_helpers.h

diff --git a/docs/static_site/src/pages/api/faq/env_var.md b/docs/static_site/src/pages/api/faq/env_var.md
index b7c9dd66bf4b..364fd1d7de6a 100644
--- a/docs/static_site/src/pages/api/faq/env_var.md
+++ b/docs/static_site/src/pages/api/faq/env_var.md
@@ -85,28 +85,67 @@ $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
   - Setting this to a small number can save GPU memory. It will also likely decrease the level of parallelism, which is usually acceptable.
   - MXNet internally uses graph coloring algorithm to [optimize memory consumption]({{'/api/architecture/note_memory'|relative_url}}).
   - This parameter is also used to get number of matching colors in graph and in turn how much parallelism one can get in each GPU. Color based match usually costs more memory but also enables more parallelism.
+* MXNET_GPU_MEM_POOL_TYPE
+  - Values: String ```(default=Naive)```
+  - The type of GPU memory pool.
+  - Choices:
+    - *Naive*: A simple memory pool that allocates memory for the requested size and caches memory buffers when this memory is released. The size of the memory chunk is defined by rounding the requested memory size to the nearest bigger multiple of MXNET_GPU_MEM_POOL_PAGE_SIZE (or MXNET_GPU_MEM_LARGE_ALLOC_ROUND_SIZE, when the result of rounding for MXNET_GPU_MEM_POOL_PAGE_SIZE is bigger than MXNET_GPU_MEM_LARGE_ALLOC_ROUND_SIZE) and allocates memory of the rounded size.
+    - *Round*: A memory pool that tries to round the requested memory size to the nearest bigger power of 2. When this rounded number is bigger than 2**MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF, the *Naive* rounding algorithm is used. Caching and allocating buffered memory works in the same way as the naive memory pool.
+    - *Unpooled*: No memory pool is used.
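To make the *Naive* rounding rule above concrete, here is a small self-contained sketch. It is not the MXNet implementation; `RoundToMultiple` and `NaiveRoundAllocSize` are hypothetical helpers, and the defaults simply mirror the documented values of MXNET_GPU_MEM_POOL_PAGE_SIZE and MXNET_GPU_MEM_LARGE_ALLOC_ROUND_SIZE:

```c++
#include <cstddef>
#include <iostream>

// Hypothetical sketch of the documented Naive rounding rule (not the MXNet source).
static size_t RoundToMultiple(size_t x, size_t multiple) {
  return ((x + multiple - 1) / multiple) * multiple;
}

// Defaults mirror MXNET_GPU_MEM_POOL_PAGE_SIZE and MXNET_GPU_MEM_LARGE_ALLOC_ROUND_SIZE.
static size_t NaiveRoundAllocSize(size_t size,
                                  size_t page_size = 4096,
                                  size_t large_round = 2 * 1024 * 1024) {
  size = RoundToMultiple(size, page_size);   // small requests: multiple of the page size
  return size > large_round ? RoundToMultiple(size, large_round) : size;  // large ones: multiple of 2 MB
}

int main() {
  std::cout << NaiveRoundAllocSize(1000) << '\n';             // 4096
  std::cout << NaiveRoundAllocSize(3 * 1024 * 1024) << '\n';  // 4194304
}
```

Under these assumed defaults, a 1000-byte request is served from a 4 KB chunk, while a 3 MB request is rounded up to the next 2 MB multiple (4 MB).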
 * MXNET_GPU_MEM_POOL_RESERVE
   - Values: Int ```(default=5)```
   - The percentage of GPU memory to reserve for things other than the GPU array, such as kernel launch or cudnn handle space.
+  - The value is used only by the GPU memory pool. If it is not possible to allocate new memory AND still save this reserve, the memory pool will free the cached memory.
   - If you see a strange out-of-memory error from the kernel launch, after multiple iterations, try setting this to a larger value.
-
-* MXNET_GPU_MEM_POOL_TYPE
+* MXNET_GPU_MEM_LARGE_ALLOC_ROUND_SIZE
+  - Values: Int ```(default=2097152)```
+  - When the rounded size of memory allocations calculated by the pool of *Naive* type is larger than this threshold, it will be rounded up to a multiple of this value.
+  - The default was chosen to minimize global memory fragmentation within the GPU driver. Set this to 1 to disable.
+* MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF
+  - Values: Int ```(default=24)```
+  - The cutoff threshold used by the *Round* strategy. Let's denote the threshold as T. If the memory size is smaller than `2 ** T` (by default, it's 2 ** 24 = 16MB), it rounds to the smallest `2 ** n` that is larger than the requested memory size; if the memory size is larger than `2 ** T`, it rounds to the next k * 2 ** T.
+* MXNET_CPU_MEM_POOL_TYPE
   - Values: String ```(default=Naive)```
-  - The type of memory pool.
+  - The type of CPU memory pool.
   - Choices:
-    - Naive: A simple memory pool that allocates memory for the exact requested size and cache memory buffers. If a buffered memory chunk matches the size of a new request, the chunk from the memory pool will be returned and reused.
-    - Round: A memory pool that always rounds the requested memory size and allocates memory of the rounded size. MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF defines how to round up a memory size. Caching and allocating buffered memory works in the same way as the naive memory pool.
-    - Unpooled: No memory pool is used.
-
-* MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF
+    - *Naive*: A simple memory pool that allocates memory for the requested size and caches memory buffers when this memory is released. The size of the memory chunk is defined by rounding the requested memory size to the nearest bigger multiple of MXNET_CPU_MEM_POOL_PAGE_SIZE (or MXNET_CPU_MEM_LARGE_ALLOC_ROUND_SIZE, when the result of rounding for MXNET_CPU_MEM_POOL_PAGE_SIZE is bigger than MXNET_CPU_MEM_LARGE_ALLOC_ROUND_SIZE) and allocates memory of the rounded size.
+    - *Round*: A memory pool that tries to round the requested memory size to the nearest bigger power of 2. When this rounded number is bigger than 2**MXNET_CPU_MEM_POOL_ROUND_LINEAR_CUTOFF, the *Naive* rounding algorithm is used. Caching and allocating buffered memory works in the same way as the naive memory pool.
+    - *Unpooled*: No memory pool is used.
+* MXNET_CPU_MEM_POOL_RESERVE
+  - Values: Int ```(default=5)```
+  - The percentage of CPU memory to reserve for things other than the CPU array.
+  - The value is used only by the CPU memory pool. If it is not possible to allocate new memory AND still save this reserve, the memory pool will free the cached memory.
+  - If you see a strange out-of-memory error from the kernel launch, after multiple iterations, try setting this to a larger value.
+* MXNET_CPU_MEM_LARGE_ALLOC_ROUND_SIZE
+  - Values: Int ```(default=2097152)```
+  - When the rounded size of memory allocations calculated by the pool of *Naive* type is larger than this threshold, it will be rounded up to a multiple of this value.
+  - Set this to 1 to disable.
+* MXNET_CPU_MEM_POOL_ROUND_LINEAR_CUTOFF
   - Values: Int ```(default=24)```
-  - The cutoff threshold that decides the rounding strategy. Let's denote the threshold as T. If the memory size is smaller than `2 ** T` (by default, it's 2 ** 24 = 16MB), it rounds to the smallest `2 ** n` that is larger than the requested memory size; if the memory size is larger than `2 ** T`, it rounds to the next k * 2 ** T.
-
-* MXNET_GPU_MEM_LARGE_ALLOC_ROUND_SIZE
+  - The cutoff threshold used by the *Round* strategy. Let's denote the threshold as T. If the memory size is smaller than `2 ** T` (by default, it's 2 ** 24 = 16MB), it rounds to the smallest `2 ** n` that is larger than the requested memory size; if the memory size is larger than `2 ** T`, it rounds to the next k * 2 ** T.
+* MXNET_CPU_PINNED_MEM_POOL_TYPE
+  - Values: String ```(default=Naive)```
+  - The type of CPU_PINNED memory pool.
+  - Choices:
+    - *Naive*: A simple memory pool that allocates memory for the requested size and caches memory buffers when this memory is released. The size of the memory chunk is defined by rounding the requested memory size to the nearest bigger multiple of MXNET_CPU_PINNED_MEM_POOL_PAGE_SIZE (or MXNET_CPU_PINNED_MEM_LARGE_ALLOC_ROUND_SIZE, when the result of rounding for MXNET_CPU_PINNED_MEM_POOL_PAGE_SIZE is bigger than MXNET_CPU_PINNED_MEM_LARGE_ALLOC_ROUND_SIZE) and allocates memory of the rounded size.
+    - *Round*: A memory pool that tries to round the requested memory size to the nearest bigger power of 2. When this rounded number is bigger than 2**MXNET_CPU_PINNED_MEM_POOL_ROUND_LINEAR_CUTOFF, the *Naive* rounding algorithm is used. Caching and allocating buffered memory works in the same way as the naive memory pool.
+    - *Unpooled*: No memory pool is used.
+* MXNET_CPU_PINNED_MEM_POOL_RESERVE
+  - Values: Int ```(default=5)```
+  - The percentage of GPU memory to reserve for things other than the GPU array.
+  - The value is used only by the CPU_PINNED memory pool. If it is not possible to allocate new memory AND still save this reserve, the memory pool will free the cached memory.
+  - If you see a strange out-of-memory error from the kernel launch, after multiple iterations, try setting this to a larger value.
+* MXNET_CPU_PINNED_MEM_LARGE_ALLOC_ROUND_SIZE
+  - Values: Int ```(default=2097152)```
+  - When the rounded size of memory allocations calculated by the pool of *Naive* type is larger than this threshold, it will be rounded up to a multiple of this value.
+  - Set this to 1 to disable.
+* MXNET_CPU_PINNED_MEM_POOL_ROUND_LINEAR_CUTOFF
+  - Values: Int ```(default=24)```
+  - The cutoff threshold used by the *Round* strategy. Let's denote the threshold as T. If the memory size is smaller than `2 ** T` (by default, it's 2 ** 24 = 16MB), it rounds to the smallest `2 ** n` that is larger than the requested memory size; if the memory size is larger than `2 ** T`, it rounds to the next k * 2 ** T.
+* MXNET_USE_NAIVE_STORAGE_MANAGERS
+  - Values: Int ```(default=0)```
+  - When the value is not 0, no memory pools will be used for any of the following three types of memory: GPU, CPU, CPU_PINNED.
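As a quick illustration of the two regimes of the *Round* strategy described above, the sketch below applies the documented rule with the default cutoff T = 24; `RoundPow2WithCutoff` is a hypothetical helper and not part of the MXNet code base:

```c++
#include <cstddef>
#include <iostream>

// Hypothetical sketch of the *Round* strategy's size rounding with the default cutoff T = 24.
static size_t RoundPow2WithCutoff(size_t size, unsigned cutoff = 24) {
  size_t pow2 = 1;
  while (pow2 < size) pow2 <<= 1;             // smallest power of 2 that is >= size
  const size_t step = size_t{1} << cutoff;    // 2 ** T, 16 MB by default
  if (pow2 <= step) return pow2;              // exponential regime below the cutoff
  return ((size + step - 1) / step) * step;   // linear regime: next multiple of 2 ** T
}

int main() {
  std::cout << RoundPow2WithCutoff(3000) << '\n';       // 4096
  std::cout << RoundPow2WithCutoff(20u << 20) << '\n';  // 33554432 (2 * 16 MB)
}
```

Requests below 16 MB land on the next power of 2, while larger requests are rounded to the next multiple of 16 MB.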
+ ## Engine Type * MXNET_ENGINE_TYPE diff --git a/src/profiler/storage_profiler.h b/src/profiler/storage_profiler.h index 5bb7d356a502..b2213e3b94fd 100644 --- a/src/profiler/storage_profiler.h +++ b/src/profiler/storage_profiler.h @@ -162,16 +162,17 @@ class GpuDeviceStorageProfiler { } } + inline void OnFree(void *dptr) { + // In case of bug which tries to free first + if (gpu_mem_alloc_entries_.find(dptr) != gpu_mem_alloc_entries_.end()) + gpu_mem_alloc_entries_.erase(dptr); + } + void OnFree(const Storage::Handle &handle) { if (handle.size > 0) { profiler::Profiler *prof = profiler::Profiler::Get(); - if (prof->IsProfiling(profiler::Profiler::kMemory)) { - // In case of bug which tries to free first - if (gpu_mem_alloc_entries_.find(handle.dptr) != - gpu_mem_alloc_entries_.end()) { - gpu_mem_alloc_entries_.erase(handle.dptr); - } - } + if (prof->IsProfiling(profiler::Profiler::kMemory)) + OnFree(handle.dptr); } } @@ -195,6 +196,11 @@ class GpuDeviceStorageProfiler { /*! \brief dump the allocation entries to file */ void DumpProfile() const; + bool inline IsProfiling() const { + profiler::Profiler *prof = profiler::Profiler::Get(); + return prof->IsProfiling(profiler::Profiler::kMemory); + } + private: std::string filename_prefix_ = "gpu_memory_profile"; /*! \brief Dynamically-sized dictionary of memory profile counters */ diff --git a/src/storage/cpu_device_storage.h b/src/storage/cpu_device_storage.h index f6b296a9643f..b81cdbc17e25 100644 --- a/src/storage/cpu_device_storage.h +++ b/src/storage/cpu_device_storage.h @@ -25,9 +25,6 @@ #ifndef MXNET_STORAGE_CPU_DEVICE_STORAGE_H_ #define MXNET_STORAGE_CPU_DEVICE_STORAGE_H_ -#include -#include -#include #include "mxnet/base.h" namespace mxnet { @@ -63,15 +60,11 @@ class CPUDeviceStorage { }; // class CPUDeviceStorage inline void CPUDeviceStorage::Alloc(Storage::Handle* handle) { - handle->dptr = nullptr; - const size_t size = handle->size; - if (size == 0) return; - #if _MSC_VER - handle->dptr = _aligned_malloc(size, alignment_); + handle->dptr = _aligned_malloc(handle->size, alignment_); if (handle->dptr == nullptr) LOG(FATAL) << "Failed to allocate CPU Memory"; #else - int ret = posix_memalign(&handle->dptr, alignment_, size); + int ret = posix_memalign(&handle->dptr, alignment_, handle->size); if (ret != 0) LOG(FATAL) << "Failed to allocate CPU Memory"; #endif } diff --git a/src/storage/cpu_shared_storage_manager.h b/src/storage/cpu_shared_storage_manager.h index b301503202e2..e9407b05917d 100644 --- a/src/storage/cpu_shared_storage_manager.h +++ b/src/storage/cpu_shared_storage_manager.h @@ -25,23 +25,14 @@ #ifndef _WIN32 #include #include -#include -#include #include #else #include #include #endif // _WIN32 -#include -#include -#include -#include -#include -#include #include #include - #include "./storage_manager.h" namespace mxnet { @@ -115,11 +106,6 @@ class CPUSharedStorageManager final : public StorageManager { }; // class CPUSharedStorageManager void CPUSharedStorageManager::Alloc(Storage::Handle* handle) { - if (handle->size == 0) { - handle->dptr = nullptr; - return; - } - std::lock_guard lock(mutex_); std::uniform_int_distribution<> dis(0, std::numeric_limits::max()); int fid = -1; @@ -140,7 +126,7 @@ void CPUSharedStorageManager::Alloc(Storage::Handle* handle) { map_handle = CreateFileMapping(INVALID_HANDLE_VALUE, nullptr, PAGE_READWRITE, 0, size, filename.c_str()); if ((error = GetLastError()) == ERROR_SUCCESS) { - break;; + break; } } } else { diff --git a/src/storage/gpu_device_storage.h b/src/storage/gpu_device_storage.h 
index 3eabe1b502be..6b5df04808b7 100644 --- a/src/storage/gpu_device_storage.h +++ b/src/storage/gpu_device_storage.h @@ -25,14 +25,8 @@ #ifndef MXNET_STORAGE_GPU_DEVICE_STORAGE_H_ #define MXNET_STORAGE_GPU_DEVICE_STORAGE_H_ -#include "mxnet/base.h" -#include "mxnet/storage.h" -#include "../common/cuda_utils.h" -#include "../profiler/storage_profiler.h" #if MXNET_USE_CUDA -#include -#endif // MXNET_USE_CUDA -#include +#include "mxnet/storage.h" namespace mxnet { namespace storage { @@ -55,46 +49,25 @@ class GPUDeviceStorage { }; // class GPUDeviceStorage inline void GPUDeviceStorage::Alloc(Storage::Handle* handle) { - handle->dptr = nullptr; - const size_t size = handle->size; - if (size == 0) return; - -#if MXNET_USE_CUDA mxnet::common::cuda::DeviceStore device_store(handle->ctx.real_dev_id(), true); #if MXNET_USE_NCCL std::lock_guard l(Storage::Get()->GetMutex(Context::kGPU)); #endif // MXNET_USE_NCCL - cudaError_t e = cudaMalloc(&handle->dptr, size); - if (e != cudaSuccess && e != cudaErrorCudartUnloading) { - LOG(FATAL) << "CUDA: " << cudaGetErrorString(e); - } - // record the allocation event in the memory profiler - profiler::GpuDeviceStorageProfiler::Get()->OnAlloc(*handle, size, false); -#else // MXNET_USE_CUDA - LOG(FATAL) << "Please compile with CUDA enabled"; -#endif // MXNET_USE_CUDA + CUDA_CALL(cudaMalloc(&handle->dptr, handle->size)); + profiler::GpuDeviceStorageProfiler::Get()->OnAlloc(*handle, handle->size, false); } inline void GPUDeviceStorage::Free(Storage::Handle handle) { -#if MXNET_USE_CUDA mxnet::common::cuda::DeviceStore device_store(handle.ctx.real_dev_id(), true); #if MXNET_USE_NCCL std::lock_guard l(Storage::Get()->GetMutex(Context::kGPU)); #endif // MXNET_USE_NCCL - // throw special exception for caller to catch. - cudaError_t err = cudaFree(handle.dptr); - // ignore unloading error, as memory has already been recycled - if (err != cudaSuccess && err != cudaErrorCudartUnloading) { - LOG(FATAL) << "CUDA: " << cudaGetErrorString(err); - } - // record the deallocation event in the memory profiler + CUDA_CALL(cudaFree(handle.dptr)) profiler::GpuDeviceStorageProfiler::Get()->OnFree(handle); -#else // MXNET_USE_CUDA - LOG(FATAL) << "Please compile with CUDA enabled"; -#endif // MXNET_USE_CUDA } } // namespace storage } // namespace mxnet +#endif // MXNET_USE_CUDA #endif // MXNET_STORAGE_GPU_DEVICE_STORAGE_H_ diff --git a/src/storage/naive_storage_manager.h b/src/storage/naive_storage_manager.h index 471b015eb32c..74b9ea7c522a 100644 --- a/src/storage/naive_storage_manager.h +++ b/src/storage/naive_storage_manager.h @@ -26,7 +26,6 @@ #define MXNET_STORAGE_NAIVE_STORAGE_MANAGER_H_ #include "storage_manager.h" -#include "mxnet/base.h" namespace mxnet { namespace storage { diff --git a/src/storage/pinned_memory_storage.h b/src/storage/pinned_memory_storage.h index 5d03fd13a3c9..d8b801154726 100644 --- a/src/storage/pinned_memory_storage.h +++ b/src/storage/pinned_memory_storage.h @@ -26,11 +26,7 @@ #define MXNET_STORAGE_PINNED_MEMORY_STORAGE_H_ #if MXNET_USE_CUDA -#include -#include "mxnet/base.h" #include "mxnet/storage.h" -#include "../common/cuda_utils.h" -#include "../profiler/storage_profiler.h" namespace mxnet { namespace storage { @@ -51,18 +47,12 @@ class PinnedMemoryStorage { }; inline void PinnedMemoryStorage::Alloc(Storage::Handle* handle) { - handle->dptr = nullptr; - const size_t size = handle->size; - if (size == 0) return; - #if MXNET_USE_NCCL std::lock_guard lock(Storage::Get()->GetMutex(Context::kGPU)); #endif mxnet::common::cuda::DeviceStore 
device_store(handle->ctx.real_dev_id(), true); // make the memory available across all devices - CUDA_CALL(cudaHostAlloc(&handle->dptr, size, cudaHostAllocPortable)); - // record the allocation event in the memory profiler - profiler::GpuDeviceStorageProfiler::Get()->OnAlloc(*handle, size, false); + CUDA_CALL(cudaHostAlloc(&handle->dptr, handle->size, cudaHostAllocPortable)); } inline void PinnedMemoryStorage::Free(Storage::Handle handle) { @@ -70,13 +60,7 @@ inline void PinnedMemoryStorage::Free(Storage::Handle handle) { std::lock_guard lock(Storage::Get()->GetMutex(Context::kGPU)); #endif mxnet::common::cuda::DeviceStore device_store(handle.ctx.real_dev_id(), true); - cudaError_t err = cudaFreeHost(handle.dptr); - // ignore unloading error, as memory has already been recycled - if (err != cudaSuccess && err != cudaErrorCudartUnloading) { - LOG(FATAL) << "CUDA: " << cudaGetErrorString(err); - } - // record the deallocation event in the memory profiler - profiler::GpuDeviceStorageProfiler::Get()->OnFree(handle); + CUDA_CALL(cudaFreeHost(handle.dptr)); } } // namespace storage diff --git a/src/storage/pooled_storage_manager.h b/src/storage/pooled_storage_manager.h index c4f32b22bdc2..897725a61a16 100644 --- a/src/storage/pooled_storage_manager.h +++ b/src/storage/pooled_storage_manager.h @@ -25,372 +25,418 @@ #ifndef MXNET_STORAGE_POOLED_STORAGE_MANAGER_H_ #define MXNET_STORAGE_POOLED_STORAGE_MANAGER_H_ -#if MXNET_USE_CUDA - #include -#endif // MXNET_USE_CUDA - -#include -#include -#include -#include +#include #include +#include #include -#include +#include #include "./storage_manager.h" -#include "../common/cuda_utils.h" -#include "../common/utils.h" #include "../profiler/storage_profiler.h" namespace mxnet { namespace storage { +typedef enum { + pool_type, + pool_page_size, + large_alloc_size, + round_linear_cutoff, + pool_reserve, +} env_var_type; + +const std::string env_var_name(const char* dev_type, env_var_type type); + #if MXNET_USE_CUDA +#define SET_DEVICE(device_store, contextHelper, ctx, flag) \ + const auto *device_store = flag? contextHelper.get()->SetCurrentDevice(ctx) : nullptr; +#define UNSET_DEVICE(device_store) delete device_store + +#define SET_GPU_PROFILER(prof, contextHelper) \ + auto prof = contextHelper->contextGPU()? \ + profiler::GpuDeviceStorageProfiler::Get() : nullptr; \ + if (!prof->IsProfiling()) prof = nullptr + +#define GPU_PROFILER_ON_FREE(prof, pntr) if (prof) prof->OnFree(pntr) +#else +// empty macros when MxNet is compiled without CUDA support +#define SET_DEVICE(...) +#define UNSET_DEVICE(...) +#define SET_GPU_PROFILER(prof, ...) +#define GPU_PROFILER_ON_FREE(prof, ...) +#endif + /*! - * \brief Storage manager with a memory pool on gpu. Memory chunks are reused based on exact size - * match. + * \brief Storage manager with a memory pool for GPU/CPU/CPUPunned memory chunks + * memory chunks which reused based on rounded size match. + * Rounding method is defined by the template parameter BucketingStrategy. + * Memory pool type is defined by the template parameter StoringMethod + * Allocation/freeing of memory is done by contextHelper_, which is the pointer + * to one of memory specific instance of the class, derived from ContextHelper */ -class GPUPooledStorageManager final : public StorageManager { +template +class PooledStorageManager : public StorageManager, + public BucketingStrategy, public StoringMethod { public: - /*! - * \brief Default constructor. 
- * - * \param initial_context context used by this Storage Manager - */ - explicit GPUPooledStorageManager(Context initial_context) : - initial_context_(initial_context) { - reserve_ = dmlc::GetEnv("MXNET_GPU_MEM_POOL_RESERVE", 5); - page_size_ = dmlc::GetEnv("MXNET_GPU_MEM_POOL_PAGE_SIZE", 4096); - large_alloc_round_size_ = dmlc::GetEnv("MXNET_GPU_MEM_LARGE_ALLOC_ROUND_SIZE", 2 * 1024 * 1024); - if (large_alloc_round_size_ <= 0) { - LOG(FATAL) << "MXNET_GPU_MEM_LARGE_ALLOC_ROUND_SIZE cannot be set to a value <= 0, found: " - << large_alloc_round_size_; + explicit PooledStorageManager(const Context &ctx, int num_gpu_device) { + const char *dev_type = nullptr; + switch (dev_type_ = ctx.dev_type) { +#if MXNET_USE_CUDA + case Context::kGPU: contextHelper_ = std::make_unique(); + dev_type = "GPU"; + break; + case Context::kCPUPinned: dev_type = "CPU_PINNED"; + if (num_gpu_device > 1) { + contextHelper_ = std::make_unique(); + dev_type_ = Context::kGPU; + break; + } +#else + case Context::kCPUPinned: dev_type = "CPU_PINNED"; +#endif + dev_type_ = Context::kCPU; + case Context::kCPU: contextHelper_ = std::make_unique(); + dev_type = "CPU"; + default: break; } - if (page_size_ < NDEV) { - LOG(FATAL) << "MXNET_GPU_MEM_POOL_PAGE_SIZE cannot be set to a value smaller than " << NDEV \ - << ". Got " << page_size_ << "."; + + BucketingStrategy::InitRoundHelper(dev_type); + StoringMethod::InitContainer(this); + contextHelper_->set_initilal_context(ctx); + + // percentage of reserved memory + if (dev_type) { + const auto env_var = env_var_name(dev_type, pool_reserve); + const size_t reserve = dmlc::GetEnv(env_var.c_str(), 5); + const size_t total = std::get<1>(contextHelper_->getMemoryInfo()); + memory_allocation_limit_ = total * reserve / 100; } } /*! * \brief Default destructor. 
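 *        Releases every cached memory chunk via ReleaseAll().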
*/ - ~GPUPooledStorageManager() { + ~PooledStorageManager() override { ReleaseAll(); } void Alloc(Storage::Handle* handle) override; - void Free(Storage::Handle handle) override; + void Free(Storage::Handle handle) override { + // Insert returned memory in cache + std::lock_guard lock(Storage::Get()->GetMutex(dev_type_)); + StoringMethod::InsertInCache(BucketingStrategy::get_bucket(handle.size), handle.dptr); + } void DirectFree(Storage::Handle handle) override { - std::lock_guard lock(Storage::Get()->GetMutex(Context::kGPU)); - DirectFreeNoLock(handle); + std::lock_guard lock(Storage::Get()->GetMutex(dev_type_)); + SET_DEVICE(device_store, contextHelper_, handle.ctx, true); + contextHelper_->Free(handle.dptr); + SET_GPU_PROFILER(profilerGPU, contextHelper_); + GPU_PROFILER_ON_FREE(profilerGPU, handle.dptr); + UNSET_DEVICE(device_store); + used_memory_ -= BucketingStrategy::RoundAllocSize(handle.size); } - void ReleaseAll() override; - - private: - void DirectFreeNoLock(Storage::Handle handle) { - mxnet::common::cuda::DeviceStore device_store(handle.ctx.real_dev_id(), true); - cudaError_t err = cudaFree(handle.dptr); - size_t size = RoundAllocSize(handle.size); - // ignore unloading error, as memory has already been recycled - if (err != cudaSuccess && err != cudaErrorCudartUnloading) { - LOG(FATAL) << "CUDA: " << cudaGetErrorString(err); - } - used_memory_ -= size; - profiler::GpuDeviceStorageProfiler::Get()->OnFree(handle); + void ReleaseAll() override { + std::lock_guard lock(Storage::Get()->GetMutex(dev_type_)); + ReleaseAllNoLock(); } - // Round a value 'x' up to the next multiple of 'multiple' - size_t RoundToMultiple(size_t x, size_t multiple) { - size_t retVal = ((x + multiple - 1) / multiple) * multiple; - return retVal; + private: + void ReleaseAllNoLock(bool set_device = true) { + SET_DEVICE(device_store, contextHelper_, contextHelper_->initilal_context(), set_device); + used_memory_ -= StoringMethod::ReleaseAllNoLock(contextHelper_.get(), this); + UNSET_DEVICE(device_store); } - size_t RoundAllocSize(size_t size) { - // Round up small allocs to multiple of page_size_ to consolidate the pool lookups - size = RoundToMultiple(size, page_size_); - // To ensure proper freeing under some driver variants, make sure - // large allocs entirely occupy their slabs, which cannot then be - // locked by smaller permanent allocations sharing the slab. - if (size > large_alloc_round_size_) - size = RoundToMultiple(size, large_alloc_round_size_); - return size; + bool MemoryIsAvalable(size_t roundSize) const { + const auto free = contextHelper_->freeMemorySize(); + return free > roundSize && memory_allocation_limit_ <= free - roundSize; } - private: + // device type of used context + Context::DeviceType dev_type_; // used memory size_t used_memory_ = 0; - // page size - size_t page_size_; - // size that large allocations should be rounded to, for proper freeing. - size_t large_alloc_round_size_; - // percentage of reserved memory - int reserve_; - // number of devices - const size_t NDEV = 32; - // context used by this Storage Manager - const Context initial_context_; - // memory pool - std::unordered_map> memory_pool_; - DISALLOW_COPY_AND_ASSIGN(GPUPooledStorageManager); -}; // class GPUPooledStorageManager - -void GPUPooledStorageManager::Alloc(Storage::Handle* handle) { - // Set dptr to nullptr when handle size is 0. 
- if (handle->size == 0) { - handle->dptr = nullptr; - return; - } + // minimum amount of memory, which will never be allocated + size_t memory_allocation_limit_ = 0; + // Pointer to the Helper, supporting some context-specific operations in GPU/CPU/CPUPinned context + std::unique_ptr contextHelper_; +}; + +template +void PooledStorageManager::Alloc(Storage::Handle* handle) { + std::lock_guard lock(Storage::Get()->GetMutex(dev_type_)); + const auto bucket_id = BucketingStrategy::get_bucket(handle->size); + size_t roundSize = 0; + auto reuse_pool = StoringMethod::GetMemStorage(bucket_id); + if (!reuse_pool) { + SET_DEVICE(device_store, contextHelper_, handle->ctx, true); + roundSize = BucketingStrategy::RoundAllocSizeForBucket(bucket_id); + if (!MemoryIsAvalable(roundSize)) + ReleaseAllNoLock(false); + + void *ret = nullptr; + auto e = contextHelper_->Malloc(&ret, roundSize); + if (e) { + const std::string err( +#if MXNET_USE_CUDA + dev_type_ == Context::kGPU? + cudaGetErrorString(static_cast(e)) : +#endif + std::strerror(errno)); - std::lock_guard lock(Storage::Get()->GetMutex(Context::kGPU)); - size_t size = RoundAllocSize(handle->size); - auto&& reuse_it = memory_pool_.find(size); - if (reuse_it == memory_pool_.end() || reuse_it->second.size() == 0) { - mxnet::common::cuda::DeviceStore device_store(handle->ctx.real_dev_id(), true); - size_t free, total; - cudaMemGetInfo(&free, &total); - if (free <= total * reserve_ / 100 || size > free - total * reserve_ / 100) - ReleaseAll(); - - void* ret = nullptr; - cudaError_t e = cudaMalloc(&ret, size); - if (e != cudaSuccess) { - if (e == cudaErrorMemoryAllocation) { - ReleaseAll(); - e = cudaMalloc(&ret, size); - if (e != cudaSuccess && e != cudaErrorCudartUnloading) { - LOG(FATAL) << "cudaMalloc retry failed: " << cudaGetErrorString(e); - } - } else if (e != cudaErrorCudartUnloading) { - LOG(FATAL) << "cudaMalloc failed: " << cudaGetErrorString(e); - } + LOG(FATAL) << "Memory allocation failed " << err; } - used_memory_ += size; + + UNSET_DEVICE(device_store); + + used_memory_ += roundSize; handle->dptr = ret; - // record the allocation event in the memory profiler - profiler::GpuDeviceStorageProfiler::Get()->OnAlloc(*handle, size, false); } else { - auto&& reuse_pool = reuse_it->second; - auto ret = reuse_pool.back(); - reuse_pool.pop_back(); - handle->dptr = ret; + // Reusing memory + handle->dptr = reuse_pool->back(); + reuse_pool->pop_back(); + } +#if MXNET_USE_CUDA + SET_GPU_PROFILER(profilerGPU, contextHelper_); + if (profilerGPU) { + if (reuse_pool) // roundSize was not calculated + roundSize = BucketingStrategy::RoundAllocSizeForBucket(bucket_id); + // record the allocation event in the memory profiler - profiler::GpuDeviceStorageProfiler::Get()->OnAlloc(*handle, size, true); + profilerGPU->OnAlloc(*handle, roundSize, reuse_pool); } +#endif } -void GPUPooledStorageManager::Free(Storage::Handle handle) { - // Do nothing if dptr is nullptr. Otherwise, nullptr may be reused - // which can cause illegal memory access error. - if (handle.dptr == nullptr) return; - std::lock_guard lock(Storage::Get()->GetMutex(Context::kGPU)); - size_t size = RoundAllocSize(handle.size); - auto&& reuse_pool = memory_pool_[size]; - reuse_pool.push_back(handle.dptr); -} +/*! + * \brief Base class for Rounding Method classes. 
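+ *        Two rounding strategies derive from it: RoundMultiple (rounds to multiples of the
+ *        page size or of the large-alloc size) and RoundPower2 (power-of-2 buckets with a linear cutoff).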
+ */ +class RoundHelper { + public: + virtual size_t get_size(size_t /*bucket*/) const { return 0; } + virtual std::tuple getContainerParam() const { + return std::tuple(0, 0); + } -void GPUPooledStorageManager::ReleaseAll() { - for (auto&& i : memory_pool_) { - for (auto&& j : i.second) { - Storage::Handle handle; - handle.dptr = j; - handle.size = i.first; - handle.ctx = initial_context_; - DirectFreeNoLock(handle); + protected: + void InitRoundHelper(const char* dev_type) { + const auto env_var = env_var_name(dev_type, pool_page_size); + page_size_ = dmlc::GetEnv(env_var.c_str(), 4096); + if (page_size_ < NDEV) { + LOG(FATAL) << env_var << " cannot be set to a value smaller than " << NDEV \ + << ". Got " << page_size_ << "."; } } - memory_pool_.clear(); -} + + // page size + size_t page_size_ = 0; + + private: + // number of devices + const size_t NDEV = 32; +}; // class RoundHelper /*! - * \brief Storage manager with a memory pool, with rounded size, on gpu. + * \brief Rounding method used by CPU/GPU mem pool. + * Round up small allocs to multiple of page_size_ or large_alloc_round_size_ + */ +class RoundMultiple : protected RoundHelper { + protected: + void InitRoundHelper(const char *dev_type) { + RoundHelper::InitRoundHelper(dev_type); + const auto env_var = env_var_name(dev_type, large_alloc_size); + large_alloc_round_size_ = dmlc::GetEnv(env_var.c_str(), 2*1024*1024); + if (large_alloc_round_size_ <= 0) { + LOG(FATAL) << env_var << " cannot be set to a value <= 0, found: " + << large_alloc_round_size_; + } + } + + size_t RoundAllocSize(size_t size) const { + // Round up small allocs to multiple of page_size_ to consolidate the pool lookups + size = RoundToMultiple(size, page_size_); + // To ensure proper freeing under some driver variants, make sure + // large allocs entirely occupy their slabs, which cannot then be + // locked by smaller permanent allocations sharing the slab. + return size > large_alloc_round_size_? RoundToMultiple(size, large_alloc_round_size_) : size; + } + inline size_t get_bucket(size_t size) const { return RoundAllocSize(size); } + inline size_t RoundAllocSizeForBucket(size_t bucket_id) const { return bucket_id; } + + private: + // Round a value 'x' up to the next multiple of 'multiple' + inline static size_t RoundToMultiple(size_t x, size_t multiple) { + return ((x + multiple - 1) / multiple) * multiple; + } + + // size that large allocations should be rounded to, for proper freeing. + size_t large_alloc_round_size_; +}; // class RoundMultiple + +/*! + * \brief Rounding method used by CPU/GPU mem pool. * - * This GPU mem pool uses a mixture of nearest pow2 (exponential) rounding and + * This Rounding method uses a mixture of nearest pow2 (exponential) rounding and * nearest multiple (linear) rounding to help alleviate the memory allocation stress * in which the default naive exact-size-match pool falls short, such as in variable-length * input/output cases like RNN workloads. * * \param cutoff the cutoff at which rounding is switched from exponential to linear. It's set - * through MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF environment variable. Must be between 20 (1 MB) - * and 34 (16 GB). + * through MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF / MXNET_CPU_MEM_POOL_ROUND_LINEAR_CUTOFF / + * MXNET_CPU_PINNED_MEM_POOL_ROUND_LINEAR_CUTOFF environment variable. + * Must be between 20 (1 MB) and 34 (16 GB). * Suppose the cutoff is X, the memory size buckets look like this: * exp2(0), exp2(1), ..., exp2(X), 2*exp2(X), 3*exp2(X), ... 
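 * For example, with the default cutoff X = 24 (16 MB) and page size 4096, a 3000-byte request
 * falls into bucket 12 and is served from a 4 KB chunk, while a 20 MB request falls into the
 * first linear bucket and is served from a 2*exp2(24) = 32 MB chunk.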
*/ -class GPUPooledRoundedStorageManager final : public StorageManager { +class RoundPower2 : public RoundHelper { public: - /*! - * \brief Default constructor. - * - * \param initial_context context used by this Storage Manager - */ - explicit GPUPooledRoundedStorageManager(Context initial_context) : - initial_context_(initial_context) { - reserve_ = dmlc::GetEnv("MXNET_GPU_MEM_POOL_RESERVE", 5); - page_size_ = dmlc::GetEnv("MXNET_GPU_MEM_POOL_PAGE_SIZE", 4096); - cut_off_ = dmlc::GetEnv("MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF", 24); - if (page_size_ < 32) { - LOG(FATAL) << "MXNET_GPU_MEM_POOL_PAGE_SIZE cannot be set to a value smaller than 32. " \ - << "Got: " << page_size_ << "."; - } - if (page_size_ != 1ul << common::ilog2ul(page_size_ - 1)) { - LOG(FATAL) << "MXNET_GPU_MEM_POOL_PAGE_SIZE must be a power of 2. Got: " << page_size_ << "."; + size_t get_size(size_t bucket) const override { + return bucket <= cut_off_? 1ul << bucket : (bucket - cut_off_ + 1) << cut_off_; + } + + protected: + void InitRoundHelper(const char *dev_type) { + RoundHelper::InitRoundHelper(dev_type); + const auto log_pager_size = common::ilog2ul(page_size_ - 1); + if (page_size_ != 1ul << log_pager_size) { + LOG(FATAL) << env_var_name(dev_type, pool_page_size) \ + << " must be a power of 2. Got: " << page_size_ << "."; } - page_size_ = common::ilog2ul(page_size_ - 1); + page_size_ = log_pager_size; + + const auto linear_cutoff = env_var_name(dev_type, round_linear_cutoff); + cut_off_ = dmlc::GetEnv(linear_cutoff.c_str(), 24); if (cut_off_ < 20 || cut_off_ > LOG2_MAX_MEM) { - LOG(FATAL) << "MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF cannot be set to a value " \ + LOG(FATAL) << linear_cutoff << " cannot be set to a value " \ << "smaller than 20 or greater than " << LOG2_MAX_MEM << ". Got: " \ << cut_off_ << "."; } if (cut_off_ < page_size_) { - LOG(FATAL) << "MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF cannot be set to a value " \ - << "smaller than log2 of MXNET_GPU_MEM_POOL_PAGE_SIZE. Got: " \ + LOG(FATAL) << linear_cutoff << " cannot be set to a value smaller than log2 of " \ + << env_var_name(dev_type, pool_page_size) << ". Got: " \ << cut_off_ << " vs " << page_size_ << "."; } - memory_pool_ = std::vector>((1ul << (LOG2_MAX_MEM - cut_off_)) + cut_off_); - } - /*! - * \brief Default destructor. - */ - ~GPUPooledRoundedStorageManager() { - ReleaseAll(); } - void Alloc(Storage::Handle* handle) override; - void Free(Storage::Handle handle) override; + inline size_t get_bucket(size_t s) const { + const size_t log_size = common::ilog2ul(s - 1); + if (log_size > cut_off_) + return div_pow2_round_up(s, cut_off_) - 1 + cut_off_; - void DirectFree(Storage::Handle handle) override { - std::lock_guard lock(Storage::Get()->GetMutex(Context::kGPU)); - DirectFreeNoLock(handle); + return std::max(log_size, page_size_); } - void ReleaseAll() override; + inline size_t RoundAllocSizeForBucket(size_t bucket_id) const { return get_size(bucket_id); } + inline size_t RoundAllocSize(size_t size) const { return get_size(get_bucket(size)); } + std::tuple getContainerParam() const override { + return std::make_tuple((1ul << (LOG2_MAX_MEM - cut_off_)) + cut_off_, + get_bucket(page_size_) - 1); + } private: - inline int div_pow2_round_up(size_t s, int divisor_log2) { + inline static int div_pow2_round_up(size_t s, int divisor_log2) { // (1025, 10) -> 2 // (2048, 10) -> 2 // (2049, 10) -> 3 - size_t result = s >> divisor_log2; + const size_t result = s >> divisor_log2; return static_cast(result + (s > (result << divisor_log2) ? 
1 : 0)); } - inline int get_bucket(size_t s) { - int log_size = common::ilog2ul(s - 1); - if (log_size > static_cast(cut_off_)) - return div_pow2_round_up(s, cut_off_) - 1 + cut_off_; - else - return std::max(log_size, static_cast(page_size_)); - } - inline size_t get_size(int bucket) { - if (bucket <= static_cast(cut_off_)) - return 1ul << bucket; - else - return (bucket - cut_off_ + 1) * (1ul << cut_off_); - } - void DirectFreeNoLock(Storage::Handle handle) { - mxnet::common::cuda::DeviceStore device_store(handle.ctx.real_dev_id(), true); - cudaError_t err = cudaFree(handle.dptr); - size_t size = get_size(get_bucket(handle.size)); - // ignore unloading error, as memory has already been recycled - if (err != cudaSuccess && err != cudaErrorCudartUnloading) { - LOG(FATAL) << "CUDA: " << cudaGetErrorString(err); - } - used_memory_ -= size; - profiler::GpuDeviceStorageProfiler::Get()->OnFree(handle); - } - - private: // log2 of maximum page size. 16GB const size_t LOG2_MAX_MEM = 34; - // address width in bits - static const int addr_width = sizeof(size_t) * 8; - // used memory - size_t used_memory_ = 0; - // page size - size_t page_size_; // log2 of memory size before switching to exponential mode to linear mode - size_t cut_off_; - // percentage of reserved memory - int reserve_; - // context used by this Storage Manager - const Context initial_context_; - // memory pool - std::vector> memory_pool_; - DISALLOW_COPY_AND_ASSIGN(GPUPooledRoundedStorageManager); -}; // class GPUPooledRoundedStorageManager - -void GPUPooledRoundedStorageManager::Alloc(Storage::Handle* handle) { - // Set dptr to nullptr when handle size is 0. - if (handle->size == 0) { - handle->dptr = nullptr; - return; + size_t cut_off_ = 0; +}; // class RoundPower2 + + +/*! + * \brief Unordered map based storage container. + * The pointers to the portions of same rounded sizes memory + * allocated on CPU/GPU, are stored in separate vectors. + * These sizes are used as keys for accessing the vectors, + * which are the elements stored in an unordered map. + */ +class UnorderedMapContainer { + protected: + inline void InitContainer(const RoundHelper *p) {} + inline void InsertInCache(size_t key, void *dptr) { memory_pool_[key].push_back(dptr); } + + inline std::vector *GetMemStorage(size_t key) { + auto&& reuse_it = memory_pool_.find(key); + return reuse_it != memory_pool_.end() && reuse_it->second.size()? 
&reuse_it->second : nullptr; } - std::lock_guard lock(Storage::Get()->GetMutex(Context::kGPU)); - int bucket = get_bucket(handle->size); - size_t size = get_size(bucket); - auto&& reuse_pool = memory_pool_[bucket]; - if (reuse_pool.size() == 0) { - mxnet::common::cuda::DeviceStore device_store(handle->ctx.real_dev_id(), true); - size_t free, total; - cudaMemGetInfo(&free, &total); - if (free <= total * reserve_ / 100 || size > free - total * reserve_ / 100) - ReleaseAll(); - - void* ret = nullptr; - cudaError_t e = cudaMalloc(&ret, size); - if (e != cudaSuccess) { - if (e == cudaErrorMemoryAllocation) { - ReleaseAll(); - e = cudaMalloc(&ret, size); - if (e != cudaSuccess && e != cudaErrorCudartUnloading) { - LOG(FATAL) << "cudaMalloc retry failed: " << cudaGetErrorString(e); - } - } else if (e != cudaErrorCudartUnloading) { - LOG(FATAL) << "cudaMalloc failed: " << cudaGetErrorString(e); + size_t ReleaseAllNoLock(const ContextHelper *contextHelper, const RoundHelper * /*rndHelper*/) { + SET_GPU_PROFILER(profilerGPU, contextHelper); + size_t released_memory = 0; + for (auto&& i : memory_pool_) { + for (auto&& j : i.second) { + contextHelper->Free(j); + GPU_PROFILER_ON_FREE(profilerGPU, j); } + released_memory += i.first * i.second.size(); + i.second.clear(); } - used_memory_ += size; - handle->dptr = ret; - // record the allocation event in the memory profiler - profiler::GpuDeviceStorageProfiler::Get()->OnAlloc(*handle, size, false); - } else { - auto ret = reuse_pool.back(); - reuse_pool.pop_back(); - handle->dptr = ret; - // record the allocation event in the memory profiler - profiler::GpuDeviceStorageProfiler::Get()->OnAlloc(*handle, size, true); + memory_pool_.clear(); + return released_memory; } -} -void GPUPooledRoundedStorageManager::Free(Storage::Handle handle) { - // Do nothing if dptr is nullptr. Otherwise, nullptr may be reused - // which can cause illegal memory access error. - if (handle.dptr == nullptr) return; + private: + std::unordered_map> memory_pool_; +}; // class UnorderedMapContainer - std::lock_guard lock(Storage::Get()->GetMutex(Context::kGPU)); - int bucket = get_bucket(handle.size); - auto&& reuse_pool = memory_pool_[bucket]; - reuse_pool.push_back(handle.dptr); -} +/*! + * \brief Vector-container based storage container. It should be used ONLY with the RoundPower2. + * The pointers to the portions of same rounded size allocated on + * GPU/CPU/CPU_Pinned memory, are stored in separate vectors. + * The vectors themselves are stored in the vector-container and could + * be accessed by the indices calculated as a functions of rounded size + * (see description for RoundPower2 for more details) + */ +class VectorContainer { + protected: + inline void InitContainer(const RoundHelper *p) { + size_t vector_size; + std::tie(vector_size, first_bucket_) = p->getContainerParam(); + memory_pool_ .resize(vector_size); + } + + inline void InsertInCache(size_t idx, void *dptr) { memory_pool_[idx].push_back(dptr); } -void GPUPooledRoundedStorageManager::ReleaseAll() { - for (size_t i = 0; i < memory_pool_.size(); i++) { - int size = get_size(i); - for (auto& j : memory_pool_[i]) { - Storage::Handle handle; - handle.size = size; - handle.dptr = j; - handle.ctx = initial_context_; - DirectFreeNoLock(handle); + std::vector *GetMemStorage(size_t idx) { + auto &&reuse_pool = memory_pool_[idx]; + return reuse_pool.size() ? 
&reuse_pool : nullptr; + } + + size_t ReleaseAllNoLock(const ContextHelper *contextHelper, const RoundHelper *rndHelper) { + SET_GPU_PROFILER(profilerGPU, contextHelper); + size_t released_memory = 0; + for (size_t i = first_bucket_; i < memory_pool_.size(); i++) { + if (!memory_pool_[i].size()) + continue; + + for (auto &j : memory_pool_[i]) { + contextHelper->Free(j); + GPU_PROFILER_ON_FREE(profilerGPU, j); + } + released_memory += rndHelper->get_size(i) * memory_pool_[i].size(); + memory_pool_[i].clear(); } - memory_pool_[i].clear(); + return released_memory; } -} -#endif // MXNET_USE_CUDA + private: + std::vector> memory_pool_; + size_t first_bucket_; +}; // class VectorContainer + +// For backward compatibility, define previously used classes via new components. +// Just in case, if someone uses these classes in other places, besides +// the storage.cc, where the corresponding changes have already been made. +typedef PooledStorageManager GPUPooledStorageManager; +typedef PooledStorageManager GPUPooledRoundedStorageManager; } // namespace storage } // namespace mxnet diff --git a/src/storage/storage.cc b/src/storage/storage.cc index c0903d280a63..438a6b872021 100644 --- a/src/storage/storage.cc +++ b/src/storage/storage.cc @@ -32,6 +32,7 @@ #include "../profiler/storage_profiler.h" namespace mxnet { +namespace storage { // consider change storage as a pure abstract class class StorageImpl : public Storage { @@ -39,91 +40,165 @@ class StorageImpl : public Storage { void Alloc(Handle* handle) override; void Free(Handle handle) override; void DirectFree(Handle handle) override; - void ReleaseAll(Context ctx) override; + void ReleaseAll(Context ctx) override { storage_manager(ctx)->ReleaseAll(); } + void SharedIncrementRefCount(Handle handle) override; - StorageImpl() {} + StorageImpl() = default; virtual ~StorageImpl() = default; private: - static constexpr size_t kMaxNumberOfDevices = Context::kMaxDevType + 1; -#if MXNET_USE_CUDA - static int num_gpu_device; -#endif // MXNET_USE_CUDA + std::shared_ptr storage_manager(const Context &ctx) { + auto &&device = storage_managers_.at(ctx.dev_type); + std::shared_ptr manager = device.Get( + ctx.real_dev_id(), []() { + LOG(FATAL) << "Cannot Free space to a device you have not allocated"; + return nullptr; + }); + return manager; + } + static constexpr size_t kMaxNumberOfDevices = Context::kMaxDevType + 1; // internal storage managers - std::array, - kMaxNumberOfDevices> storage_managers_; + std::array, kMaxNumberOfDevices> storage_managers_; profiler::DeviceStorageProfiler profiler_; }; // struct Storage::Impl + +StorageManager *CreateStorageManager(const Context &ctx, const char *context, + int num_gpu_device, std::string *pStrategy) { + const auto env_var = env_var_name(context, pool_type); + const char *type = getenv(env_var.c_str()); + if (type == nullptr) + type = "Naive"; // default pool + + *pStrategy = type; + StorageManager *ptr = nullptr; + if (*pStrategy == "Round") { + ptr = new PooledStorageManager(ctx, num_gpu_device); + } else if (*pStrategy == "Naive") { + ptr = new PooledStorageManager(ctx, num_gpu_device); + } else if (*pStrategy == "Unpooled") { + if (ctx.dev_type == Context::kCPU || num_gpu_device == 0) + ptr = new NaiveStorageManager(); #if MXNET_USE_CUDA -int StorageImpl::num_gpu_device = 0; -#endif // MXNET_USE_CUDA + else if (ctx.dev_type == Context::kGPU) + ptr = new NaiveStorageManager(); + else // Context::kCPUPinned + ptr = new NaiveStorageManager(); +#endif + } + return ptr; +} + +void 
StorageImpl::Alloc(Storage::Handle *handle) { + // Set dptr to nullptr when handle size is 0. + if (handle->size == 0) { + handle->dptr = nullptr; + return; + } -void StorageImpl::Alloc(Storage::Handle* handle) { // space already recycled, ignore request - auto&& device = storage_managers_.at(handle->ctx.dev_type); - std::shared_ptr manager = device.Get( - handle->ctx.real_dev_id(), [handle]() { - storage::StorageManager *ptr = nullptr; - switch (handle->ctx.dev_type) { - case Context::kCPU: { - ptr = new storage::NaiveStorageManager(); - break; - } - case Context::kCPUShared: { -#if !defined(ANDROID) && !defined(__ANDROID__) - ptr = new storage::CPUSharedStorageManager(); -#else - LOG(FATAL) << "Unimplemented device"; -#endif // !defined(ANDROID) && !defined(__ANDROID__) - break; - } - case Context::kCPUPinned: { + auto &&device = storage_managers_.at(handle->ctx.dev_type); + std::shared_ptr manager = device.Get( + handle->ctx.real_dev_id(), [handle]() { + const auto dev_type = handle->ctx.dev_type; + int num_gpu_device = 0; +#if MXNET_USE_CUDA + switch (dev_type) { + case Context::kGPU: + case Context::kCPUPinned: + if (cudaGetDeviceCount(&num_gpu_device) != cudaSuccess) + num_gpu_device = 0; + default: + break; + } +#endif + + const char *context = nullptr; + switch (dev_type) { + case Context::kCPU: + context = "CPU"; + break; + case Context::kGPU: #if MXNET_USE_CUDA - num_gpu_device = 0; - cudaError_t e = cudaGetDeviceCount(&num_gpu_device); - if (e != cudaSuccess) { - num_gpu_device = 0; - } - if (num_gpu_device > 0) { - ptr = new storage::NaiveStorageManager(); - } else { - ptr = new storage::NaiveStorageManager(); - } + context = "GPU"; + CHECK_GT(num_gpu_device, 0) << "GPU usage requires at least 1 GPU"; #else - ptr = new storage::NaiveStorageManager(); -#endif // MXNET_USE_CUDA - break; - } - case Context::kGPU: { + LOG(FATAL) << "Compile with USE_CUDA=1 to enable GPU usage"; +#endif + break; + case Context::kCPUPinned: + context = "CPU_PINNED"; + break; + case Context::kCPUShared: + // We will not generate the log messages for CPUShared + // It could be as many of them as the number of "workers". 
+#if !defined(ANDROID) && !defined(__ANDROID__) + break; // For Android shared memory is not implemented +#endif + default: + LOG(FATAL) << "Unimplemented device " << dev_type; + } + + // By default, the Pooled Storage Manager will be used, if it is available + int naive_storage_manager = dmlc::GetEnv("MXNET_USE_NAIVE_STORAGE_MANAGER", 0); + if (!naive_storage_manager) { + // Because, the pooled storage managers are NOT implemented yet for + // following dev_type's, we will also use the naive storage managers + switch (dev_type) { +#if MXNET_USE_CUDA + case Context::kCPUPinned: if (num_gpu_device > 0) + break; +#endif + case Context::kCPUShared: naive_storage_manager = true; + default: break; + } + } + + StorageManager *ptr = nullptr; + std::string strategy, storage_manager_type; + if (naive_storage_manager) { + storage_manager_type = "Naive"; + switch (dev_type) { #if MXNET_USE_CUDA - CUDA_CALL(cudaGetDeviceCount(&num_gpu_device)); - CHECK_GT(num_gpu_device, 0) << "GPU usage requires at least 1 GPU"; - - const char *type = getenv("MXNET_GPU_MEM_POOL_TYPE"); - const bool default_pool = (type == nullptr); - if (default_pool) type = "Naive"; - std::string strategy = type; - - if (strategy == "Round") { - ptr = new storage::GPUPooledRoundedStorageManager(handle->ctx); - LOG(INFO) << "Using GPUPooledRoundedStorageManager."; - } else if (strategy == "Naive") { - ptr = new storage::GPUPooledStorageManager(handle->ctx); - } else if (strategy == "Unpooled") { - ptr = new storage::NaiveStorageManager(); - } else { - LOG(FATAL) << "Unknown memory pool strategy specified: " << strategy << "."; - } + case Context::kGPU: + ptr = new NaiveStorageManager(); + break; + case Context::kCPUPinned: + if (num_gpu_device > 0) { + ptr = new NaiveStorageManager(); + break; + } #else - LOG(FATAL) << "Compile with USE_CUDA=1 to enable GPU usage"; -#endif // MXNET_USE_CUDA - break; - } - default: LOG(FATAL) << "Unimplemented device " << handle->ctx.dev_type; - } - return ptr; - }); + case Context::kCPUPinned: +#endif + case Context::kCPU: + ptr = new NaiveStorageManager(); + break; +#if !defined(ANDROID) && !defined(__ANDROID__) + case Context::kCPUShared: + ptr = new CPUSharedStorageManager(); +#endif + default: break; + } + } else { + // Some Pooled Storage Manager will be used + std::string strategy; + ptr = CreateStorageManager(handle->ctx, context, num_gpu_device, &strategy); + if (ptr) { + if (strategy != "Unpooled") + storage_manager_type = "Pooled (" + strategy + ")"; + else + storage_manager_type = "Unpooled"; + } else { + LOG(FATAL) << "Unknown memory pool strategy specified: " << strategy << "."; + } + } + + if (context) + LOG(INFO) << "Using " << storage_manager_type << " StorageManager for " << context; + + return ptr; + }); manager->Alloc(handle); profiler_.OnAlloc(*handle); @@ -134,15 +209,7 @@ void StorageImpl::Free(Storage::Handle handle) { // been freed or have not been allocated memory yet. if (handle.dptr == nullptr) return; - const Context &ctx = handle.ctx; - auto&& device = storage_managers_.at(ctx.dev_type); - std::shared_ptr manager = device.Get( - ctx.real_dev_id(), []() { - LOG(FATAL) << "Cannot Free space to a device you have not allocated"; - return nullptr; - }); - - manager->Free(handle); + storage_manager(handle.ctx)->Free(handle); profiler_.OnFree(handle); } @@ -151,28 +218,10 @@ void StorageImpl::DirectFree(Storage::Handle handle) { // been freed or have not been allocated memory yet. 
if (handle.dptr == nullptr) return; - const Context &ctx = handle.ctx; - auto&& device = storage_managers_.at(ctx.dev_type); - std::shared_ptr manager = device.Get( - ctx.real_dev_id(), []() { - LOG(FATAL) << "Cannot Free space to a device you have not allocated"; - return nullptr; - }); - - manager->DirectFree(handle); + storage_manager(handle.ctx)->DirectFree(handle); profiler_.OnFree(handle); } -void StorageImpl::ReleaseAll(Context ctx) { - auto&& device = storage_managers_.at(ctx.dev_type); - std::shared_ptr manager = device.Get( - ctx.real_dev_id(), []() { - LOG(FATAL) << "Cannot Free space to a device you have not allocated"; - return nullptr; - }); - manager->ReleaseAll(); -} - void StorageImpl::SharedIncrementRefCount(Storage::Handle handle) { CHECK_EQ(handle.ctx.dev_type, Context::kCPUShared); auto&& device = storage_managers_.at(Context::kCPUShared); @@ -180,20 +229,34 @@ void StorageImpl::SharedIncrementRefCount(Storage::Handle handle) { LOG(FATAL) << "Cannot increment ref count before allocating any shared memory."; return nullptr; }); -#if defined(ANDROID) || defined(__ANDROID__) - LOG(FATAL) << "Shared memory not implemented on Android"; +#if !defined(ANDROID) && !defined(__ANDROID__) + dynamic_cast(manager.get())->IncrementRefCount(handle); #else - dynamic_cast(manager.get())->IncrementRefCount(handle); -#endif // defined(ANDROID) || defined(__ANDROID__) + LOG(FATAL) << "Shared memory not implemented on Android"; +#endif // !defined(ANDROID) && !defined(__ANDROID__) +} + +const std::string env_var_name(const char* dev_type, env_var_type type) { + static const std::array name = { + "MEM_POOL_TYPE", + "POOL_PAGE_SIZE", + "MEM_LARGE_ALLOC_ROUND_SIZE", + "MEM_POOL_ROUND_LINEAR_CUTOFF", + "MEM_POOL_RESERVE", + }; + + return std::string("MXNET_") + dev_type + "_" + name[type]; } +} // namespace storage + std::shared_ptr Storage::_GetSharedRef() { #ifdef __MXNET_JS__ // dummy code needed for emscripten code to pass // do not know why, the new will be NULLPTR static int *q = new int(); #endif - static std::shared_ptr inst(new StorageImpl()); + static std::shared_ptr inst(new storage::StorageImpl()); return inst; } diff --git a/src/storage/storage_manager.h b/src/storage/storage_manager.h index 13be16ebe70f..48077aad548f 100644 --- a/src/storage/storage_manager.h +++ b/src/storage/storage_manager.h @@ -26,8 +26,8 @@ #ifndef MXNET_STORAGE_STORAGE_MANAGER_H_ #define MXNET_STORAGE_STORAGE_MANAGER_H_ +#include "./storage_manager_helpers.h" #include -#include namespace mxnet { namespace storage { diff --git a/src/storage/storage_manager_helpers.h b/src/storage/storage_manager_helpers.h new file mode 100644 index 000000000000..e144af2ab9a3 --- /dev/null +++ b/src/storage/storage_manager_helpers.h @@ -0,0 +1,162 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifndef MXNET_STORAGE_STORAGE_MANAGER_HELPERS_H_ +#define MXNET_STORAGE_STORAGE_MANAGER_HELPERS_H_ + +#if MXNET_USE_CUDA +#include +#include "../common/cuda_utils.h" +#include "../profiler/storage_profiler.h" +typedef mxnet::common::cuda::DeviceStore CudaDeviceStore; +#endif // MXNET_USE_CUDA + +#ifndef _WIN32 +#if __APPLE__ +#include +#include +#include +#include +#else +#include +#endif // __APPLE__ +#else +#include +#include +#endif // _WIN32 + +#include + +namespace mxnet { +namespace storage { + +/*! + * \brief Abstract class, which contains context specific methods used by PooledStorageManager. + */ +class ContextHelper { + public: + virtual ~ContextHelper() {} + + inline void set_initilal_context(const Context &ctx) { initilal_context_ = ctx; } + inline const Context &initilal_context() const { return initilal_context_; } + virtual std::tuple getMemoryInfo() const = 0; + virtual int Malloc(void **ppNtr, size_t size) const = 0; + virtual void Free(void *dptr) const = 0; + inline size_t freeMemorySize() const { return std::get<0>(getMemoryInfo()); } + +#if MXNET_USE_CUDA + virtual bool contextGPU() const { return false; } + virtual const CudaDeviceStore *SetCurrentDevice(const Context &/*ctx*/) const { return nullptr; } +#endif + + private: + // context used by this Storage Manager + Context initilal_context_; +}; + +/*! + * \brief Class, which contains the CPU specific methods used by PooledStorageManager. + */ +class ContextHelperCPU : public ContextHelper { + public: + std::tuple getMemoryInfo() const override { +#if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) + MEMORYSTATUSEX status; + status.dwLength = sizeof(status); + GlobalMemoryStatusEx(&status); + return std::make_tuple(status.ullAvailPhys, status.ullTotalPhys); +#elif __APPLE__ + vm_size_t page_size; + vm_statistics64_data_t vm_stats; + + mach_port_t mach_port = mach_host_self(); + mach_msg_type_number_t count = sizeof(vm_stats) / sizeof(natural_t); + if (KERN_SUCCESS != host_page_size(mach_port, &page_size) || + KERN_SUCCESS != host_statistics64(mach_port, HOST_VM_INFO, + (host_info64_t)&vm_stats, &count)) { + LOG(FATAL) << "Cannot get memory information"; + } + + const size_t free_memory = (uint64_t)vm_stats.free_count * (uint64_t)page_size; + + const size_t used_memory = ((uint64_t)vm_stats.active_count + + (uint64_t)vm_stats.inactive_count + + (uint64_t)vm_stats.wire_count) * (uint64_t)page_size; + + return std::make_tuple(free_memory, used_memory + free_memory); +#else // Linux + struct sysinfo info = {}; + if (sysinfo(&info) < 0) + LOG(FATAL) << "Error: sysinfo failed"; + + return std::make_tuple(info.freeram, info.totalram); +#endif + } + + int Malloc(void **ppNtr, size_t size) const override { + return (*ppNtr = std::malloc(size))? 0 : -1; + } + + void Free(void *dptr) const override { std::free(dptr); } +}; + +#if MXNET_USE_CUDA +/*! + * \brief Class, which contains the GPU specific methods used by PooledStorageManager. 
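+ *        (Free/total device memory is reported via cudaMemGetInfo; Malloc/Free wrap
+ *        cudaMalloc/cudaFree, and SetCurrentDevice switches devices through a CudaDeviceStore guard.)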
+ */ +class ContextHelperGPU : public ContextHelper { + public: + std::tuple getMemoryInfo() const override { + size_t free, total; + const auto cuda_status = cudaMemGetInfo(&free, &total); + if (cudaSuccess != cuda_status) + LOG(FATAL) << "Error: cudaMemGetInfo fails " << cudaGetErrorString(cuda_status); + return std::make_tuple(free, total); + } + + bool contextGPU() const override { return true; } + int Malloc(void **ppPntr, size_t size) const override { return cudaMalloc(ppPntr, size); } + void Free(void *dptr) const override { CUDA_CALL(cudaFree(dptr)); } + + const CudaDeviceStore *SetCurrentDevice(const Context &ctx) const override { + return new CudaDeviceStore(ctx.real_dev_id(), true); + } +}; + +/*! + * \brief Class, which contains the CPU_Pinned specific methods used by PooledStorageManager. + * When MxNet is compiled for MXNET_USE_CUDA=0, this class coincides with ContextHelperCPU + */ +class ContextHelperPinned : public ContextHelperGPU { + public: + int Malloc(void **ppPntr, size_t size) const override { + // make the memory available across all devices + return cudaHostAlloc(ppPntr, size, cudaHostAllocPortable); + } + void Free(void *dptr) const override { CUDA_CALL(cudaFreeHost(dptr)); } +}; + +#else +typedef ContextHelperCPU ContextHelperPinned; +#endif + +} // namespace storage +} // namespace mxnet + +#endif // MXNET_STORAGE_STORAGE_MANAGER_HELPERS_H_ diff --git a/tests/python/unittest/test_gluon_data.py b/tests/python/unittest/test_gluon_data.py index b1571cc4ae35..b68d03f697fc 100644 --- a/tests/python/unittest/test_gluon_data.py +++ b/tests/python/unittest/test_gluon_data.py @@ -364,7 +364,7 @@ def test_multi_worker_dataloader_release_pool(): del the_iter del D - +@with_seed() def test_dataloader_context(): X = np.random.uniform(size=(10, 20)) dataset = gluon.data.ArrayDataset(X) @@ -381,11 +381,14 @@ def test_dataloader_context(): for _, x in enumerate(loader2): assert x.context == context.cpu_pinned(default_dev_id) - # use pinned memory with custom device id - loader3 = gluon.data.DataLoader(dataset, 8, pin_memory=True, - pin_device_id=custom_dev_id) - for _, x in enumerate(loader3): - assert x.context == context.cpu_pinned(custom_dev_id) + if mx.context.num_gpus() <= 1: + print('Bypassing custom_dev_id pinned mem test on system with < 2 gpus.') + else: + # use pinned memory with custom device id + loader3 = gluon.data.DataLoader(dataset, 8, pin_memory=True, + pin_device_id=custom_dev_id) + for _, x in enumerate(loader3): + assert x.context == context.cpu_pinned(custom_dev_id) def batchify(a): return a
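To round out the picture, here is a minimal standalone sketch of how the pool type could be resolved from the environment variables documented in env_var.md, loosely mirroring `env_var_name()` and the default handling in `CreateStorageManager()` from storage.cc; the helper names and structure are illustrative assumptions, not the MXNet API:

```c++
#include <cstdlib>
#include <initializer_list>
#include <iostream>
#include <string>

// Hypothetical sketch: build the per-device environment variable name, e.g. MXNET_GPU_MEM_POOL_TYPE.
static std::string env_var_name(const std::string& dev_type, const std::string& suffix) {
  return "MXNET_" + dev_type + "_" + suffix;
}

// Hypothetical sketch: read the pool type for a device, falling back to the documented default.
static std::string chosen_pool(const std::string& dev_type) {
  const char* type = std::getenv(env_var_name(dev_type, "MEM_POOL_TYPE").c_str());
  return type ? type : "Naive";  // "Naive" is the documented default pool
}

int main() {
  for (const char* dev : {"GPU", "CPU", "CPU_PINNED"})
    std::cout << dev << " pool: " << chosen_pool(dev) << '\n';
}
```

Running this in a shell where, for example, MXNET_GPU_MEM_POOL_TYPE=Round is exported would report the *Round* pool for the GPU context and the default *Naive* pool for the other two.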