Support allocating CUDA managed memory #39075

Merged
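This PR adds a CUDAManagedAllocator that serves allocations from CUDA managed (unified) memory instead of ordinary device memory, gated behind the new `FLAGS_use_cuda_managed_memory` flag. For orientation, here is a minimal standalone sketch of what managed memory provides, written against the plain CUDA runtime API (illustrative background only, not code from this PR): one pointer valid on both host and device, with pages migrated on demand by the driver.

```cpp
// Minimal CUDA managed-memory demo: no explicit cudaMemcpy anywhere.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void Scale(float* data, int n, float factor) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= factor;
}

int main() {
  const int n = 1 << 20;
  float* data = nullptr;
  // cudaMallocManaged is the primitive the new allocator ultimately relies on.
  cudaMallocManaged(&data, n * sizeof(float));
  for (int i = 0; i < n; ++i) data[i] = 1.0f;       // written on the host
  Scale<<<(n + 255) / 256, 256>>>(data, n, 2.0f);   // read/written on the device
  cudaDeviceSynchronize();
  std::printf("data[0] = %f\n", data[0]);           // read back on the host
  cudaFree(data);
  return 0;
}
```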
17 changes: 14 additions & 3 deletions paddle/fluid/memory/CMakeLists.txt
@@ -20,18 +20,29 @@ if (WITH_GPU)
  nv_test(stream_safe_cuda_alloc_test
          SRCS stream_safe_cuda_alloc_test.cu
          DEPS malloc cuda_graph_with_memory_pool)
  nv_test(cuda_managed_memory_test
          SRCS cuda_managed_memory_test.cu
          DEPS malloc gpu_info place)

  if(WITH_TESTING AND TEST stream_safe_cuda_alloc_test)
    set_tests_properties(stream_safe_cuda_alloc_test PROPERTIES
-                        ENVIRONMENT "FLAGS_use_stream_safe_cuda_allocator=true;
-                        FLAGS_allocator_strategy=auto_growth")
-  endif()
+                        ENVIRONMENT "FLAGS_use_stream_safe_cuda_allocator=true;FLAGS_allocator_strategy=auto_growth")
+  endif()
endif()

if (WITH_ROCM)
  hip_test(malloc_test
           SRCS malloc_test.cu
           DEPS device_context malloc)
  hip_test(cuda_managed_memory_test
           SRCS cuda_managed_memory_test.cu
           DEPS malloc gpu_info place)
endif()

if(WITH_TESTING AND TEST cuda_managed_memory_test)
  set_tests_properties(cuda_managed_memory_test PROPERTIES
                       ENVIRONMENT "FLAGS_use_cuda_managed_memory=true;FLAGS_allocator_strategy=auto_growth"
                       TIMEOUT 50)
endif()

if(WITH_GPU AND WITH_TESTING AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON")
4 changes: 3 additions & 1 deletion paddle/fluid/memory/allocation/CMakeLists.txt
@@ -15,6 +15,7 @@ endif()

if (WITH_GPU)
  nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
  nv_library(cuda_managed_allocator SRCS cuda_managed_allocator.cc DEPS allocator cuda_device_guard gpu_info)
  nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
  nv_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator)
  nv_library(thread_local_allocator SRCS thread_local_allocator.cc DEPS allocator)
@@ -27,6 +28,7 @@ endif()

if (WITH_ROCM)
  hip_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
  hip_library(cuda_managed_allocator SRCS cuda_managed_allocator.cc DEPS allocator cuda_device_guard gpu_info)
  hip_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
  hip_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator)
  hip_library(thread_local_allocator SRCS thread_local_allocator.cc DEPS allocator)
@@ -42,7 +44,7 @@ endif()
cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator)

if (WITH_GPU OR WITH_ROCM)
-  set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator cuda_device_guard thread_local_allocator stream_safe_cuda_allocator device_context)
+  set(AllocatorFacadeDeps gpu_info cuda_allocator cuda_managed_allocator pinned_allocator cuda_device_guard thread_local_allocator stream_safe_cuda_allocator device_context)
  if(CUDA_VERSION GREATER_EQUAL 10.2)
    list(APPEND AllocatorFacadeDeps cuda_virtual_mem_allocator)
  endif()
55 changes: 44 additions & 11 deletions paddle/fluid/memory/allocation/allocator_facade.cc
@@ -28,6 +28,7 @@
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include <shared_mutex>
#include "paddle/fluid/memory/allocation/cuda_allocator.h"
#include "paddle/fluid/memory/allocation/cuda_managed_allocator.h"
#include "paddle/fluid/memory/allocation/pinned_allocator.h"
#include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h"
#include "paddle/fluid/memory/allocation/thread_local_allocator.h"
@@ -80,6 +81,11 @@ PADDLE_DEFINE_EXPORTED_bool(use_virtual_memory_auto_growth, false,
PADDLE_DEFINE_EXPORTED_bool(use_stream_safe_cuda_allocator, false,
                            "Enable StreamSafeCUDAAllocator");

PADDLE_DEFINE_EXPORTED_bool(use_cuda_managed_memory, false,
                            "Whether to use CUDAManagedAllocator to allocate "
                            "managed memory, only available for the "
                            "auto_growth strategy");

DECLARE_string(allocator_strategy);

namespace paddle {
@@ -436,6 +442,37 @@ class AllocatorFacadePrivate {
        std::make_shared<NaiveBestFitAllocator>(platform::CUDAPinnedPlace());
  }

  void InitNaiveBestFitCUDAAllocator(platform::CUDAPlace p) {
    allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
  }

  // Create a new CUDAAllocator or CUDAManagedAllocator for the given device
  std::shared_ptr<Allocator> CreateCUDAAllocator(platform::CUDAPlace p) {
    if (FLAGS_use_cuda_managed_memory) {
      PADDLE_ENFORCE_EQ(
          strategy_, AllocatorStrategy::kAutoGrowth,
          platform::errors::InvalidArgument(
              "CUDA managed memory is only implemented for the auto_growth "
              "strategy; the %s strategy is not supported.\n"
              "Please use the auto_growth strategy with the command `export "
              "FLAGS_allocator_strategy=\"auto_growth\"`, or disable managed "
              "memory with the command `export "
              "FLAGS_use_cuda_managed_memory=false`.",
              FLAGS_allocator_strategy));

      if (!platform::IsGPUManagedMemorySupported(p.device)) {
        PADDLE_THROW(platform::errors::Unavailable(
            "Failed to create CUDAManagedAllocator on GPU %d.\n\n"
            "You have enabled CUDA managed memory, but the GPU device does "
            "not support allocating managed memory.\n"
            "If you don't actually need managed memory, please disable it "
            "with the command `export FLAGS_use_cuda_managed_memory=false`.\n"
            "Otherwise, please use a GPU device that supports managed memory.",
            p.device));
      }
      return std::make_shared<CUDAManagedAllocator>(p);
    }
    return std::make_shared<CUDAAllocator>(p);
  }
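Note the design choice here: every site that previously constructed a `CUDAAllocator` directly (the auto-growth wrappers, the stream-safe path, and the system allocators further down in this file) now goes through this single `CreateCUDAAllocator` factory, so the managed-memory switch and its validation live in exactly one place.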

  void InitStreamSafeCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) {
    PADDLE_ENFORCE_EQ(
        strategy_, AllocatorStrategy::kAutoGrowth,
@@ -452,13 +489,9 @@
    }
  }

-  void InitNaiveBestFitCUDAAllocator(platform::CUDAPlace p) {
-    allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
-  }

  void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) {
#if defined(PADDLE_WITH_HIP)
-    auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
+    auto cuda_allocator = CreateCUDAAllocator(p);
    cuda_allocators_[p][stream] = std::make_shared<AutoGrowthBestFitAllocator>(
        cuda_allocator, platform::GpuMinChunkSize(), 0, allow_free_idle_chunk_);
#endif
@@ -485,14 +518,14 @@
          std::make_shared<VirtualMemoryAutoGrowthBestFitAllocator>(
              cuda_allocator, platform::GpuMinChunkSize(), p);
    } else {
-      auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
+      auto cuda_allocator = CreateCUDAAllocator(p);
      cuda_allocators_[p][stream] =
          std::make_shared<AutoGrowthBestFitAllocator>(
              cuda_allocator, platform::GpuMinChunkSize(),
              allow_free_idle_chunk_);
    }
#else
-    auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
+    auto cuda_allocator = CreateCUDAAllocator(p);
    auto alignment = platform::GpuMinChunkSize();
    bool need_addr_align = true;
    // NOTE: sometimes, since cuda runtime can not be forked, calling any cuda
@@ -535,7 +568,7 @@ class AllocatorFacadePrivate {
  void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p,
                                   bool allow_free_idle_chunk) {
#if defined(PADDLE_WITH_HIP)
-    auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
+    auto cuda_allocator = CreateCUDAAllocator(p);
    allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
        cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk);
#endif
@@ -562,13 +595,13 @@
          std::make_shared<VirtualMemoryAutoGrowthBestFitAllocator>(
              cuda_allocator, platform::GpuMinChunkSize(), p);
    } else {
-      auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
+      auto cuda_allocator = CreateCUDAAllocator(p);
      allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
          cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk);
    }

#else
-    auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
+    auto cuda_allocator = CreateCUDAAllocator(p);
    auto alignment = platform::GpuMinChunkSize();
    bool need_addr_align = true;
    // NOTE: sometimes, since cuda runtime can not be forked, calling any cuda
@@ -690,7 +723,7 @@ class AllocatorFacadePrivate {
    int device_count = platform::GetGPUDeviceCount();
    for (int i = 0; i < device_count; ++i) {
      platform::CUDAPlace p(i);
-      system_allocators_[p] = std::make_shared<CUDAAllocator>(p);
+      system_allocators_[p] = CreateCUDAAllocator(p);
    }
#endif
#ifdef PADDLE_WITH_MLU
12 changes: 10 additions & 2 deletions paddle/fluid/memory/allocation/cuda_allocator.cc
@@ -67,16 +67,24 @@ pten::Allocation* CUDAAllocator::AllocateImpl(size_t size) {
        limit_size, limit_size);
  }

  std::string managed_memory_msg;
  if (platform::IsGPUManagedMemoryOversubscriptionSupported(place_.device)) {
    managed_memory_msg = string::Sprintf(
        "If the above ways do not solve the out of memory problem, you can "
        "try to use CUDA managed memory. The command is `export "
        "FLAGS_use_cuda_managed_memory=true`.");
  }

  PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
      "\n\nOut of memory error on GPU %d. "
      "Cannot allocate %s memory on GPU %d, %s memory has been allocated and "
      "available memory is only %s.\n\n"
      "Please check whether there is any other process using GPU %d.\n"
      "1. If yes, please stop them, or start PaddlePaddle on another GPU.\n"
-     "2. If no, please decrease the batch size of your model. %s\n\n",
+     "2. If no, please decrease the batch size of your model. %s\n%s\n",
      place_.device, string::HumanReadableSize(size), place_.device,
      string::HumanReadableSize(allocated), string::HumanReadableSize(avail),
-     place_.device, err_msg));
+     place_.device, err_msg, managed_memory_msg));
}

} // namespace allocation
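The new hint is only appended when `IsGPUManagedMemoryOversubscriptionSupported` reports support, because the suggestion only helps on devices where managed allocations can exceed physical GPU memory and page to host on demand. Below is a standalone sketch of that distinction, assuming the usual CUDA attribute check via `concurrentManagedAccess` (the Paddle helper's exact implementation is not shown in this diff):

```cpp
#include <cstdio>
#include <cuda_runtime.h>

int main() {
  int dev = 0;
  int concurrent = 0;
  // Assumption: oversubscription support tracks this device attribute.
  cudaDeviceGetAttribute(&concurrent, cudaDevAttrConcurrentManagedAccess, dev);
  size_t free_bytes = 0, total_bytes = 0;
  cudaMemGetInfo(&free_bytes, &total_bytes);
  if (concurrent) {
    // Request more than the device physically has; with oversubscription
    // support the allocation can still succeed, paging to host memory.
    void* ptr = nullptr;
    cudaError_t err = cudaMallocManaged(&ptr, total_bytes + total_bytes / 2);
    std::printf("oversubscribed alloc: %s\n", cudaGetErrorString(err));
    if (err == cudaSuccess) cudaFree(ptr);
  }
  return 0;
}
```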
86 changes: 86 additions & 0 deletions paddle/fluid/memory/allocation/cuda_managed_allocator.cc
@@ -0,0 +1,86 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/memory/allocation/cuda_managed_allocator.h"

#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
#include <cuda_runtime.h>
#endif

#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#endif

#include <string>
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/enforce.h"

namespace paddle {
namespace memory {
namespace allocation {

bool CUDAManagedAllocator::IsAllocThreadSafe() const { return true; }

void CUDAManagedAllocator::FreeImpl(pten::Allocation* allocation) {
  PADDLE_ENFORCE_EQ(
      allocation->place(), place_,
      platform::errors::PermissionDenied(
          "GPU memory is freed on an incorrect device. This may be a bug."));
  platform::RecordedGpuFree(allocation->ptr(), allocation->size(),
                            place_.device);
  delete allocation;
}

pten::Allocation* CUDAManagedAllocator::AllocateImpl(size_t size) {
  std::call_once(once_flag_, [this] { platform::SetDeviceId(place_.device); });

  int dev_id = place_.device;
  void* ptr;
  auto result = platform::RecordedGpuMalloc(&ptr, size, dev_id,
                                            /* malloc_managed_memory = */ true);
  if (LIKELY(result == gpuSuccess)) {
    return new Allocation(ptr, size, platform::Place(place_));
  }

  uint64_t limit_size = platform::RecordedGpuLimitSize(dev_id);
  uint64_t malloc_size = platform::RecordedGpuMallocSize(dev_id);
  bool is_limited =
      platform::IsGpuMallocRecorded(dev_id) && malloc_size + size > limit_size;

  std::string err_msg;
  if (UNLIKELY(is_limited)) {
    int64_t limit_size_mb = limit_size >> 20;
    err_msg = string::Sprintf(
        "Or set environment variable `FLAGS_gpu_memory_limit_mb` to a larger "
        "value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the maximum "
        "GPU memory usage is limited to %d MB.\n"
        "The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
        limit_size_mb, limit_size_mb);
  }

  PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
      "\n\nOut of memory error on GPU %d. "
      "Cannot allocate %s CUDA managed memory on GPU %d, %s memory has been "
      "allocated.\n\n"
      "Please check whether there is any other process using GPU %d.\n"
      "1. If yes, please stop them, or start PaddlePaddle on another GPU.\n"
      "2. If no, please decrease the batch size of your model. %s\n\n",
      dev_id, string::HumanReadableSize(size), dev_id,
      string::HumanReadableSize(malloc_size), dev_id, err_msg));
}

} // namespace allocation
} // namespace memory
} // namespace paddle
41 changes: 41 additions & 0 deletions paddle/fluid/memory/allocation/cuda_managed_allocator.h
@@ -0,0 +1,41 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <mutex>  // for std::once_flag

#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/place.h"

namespace paddle {
namespace memory {
namespace allocation {

class CUDAManagedAllocator : public Allocator {
 public:
  explicit CUDAManagedAllocator(const platform::CUDAPlace& place)
      : place_(place) {}

  bool IsAllocThreadSafe() const override;

 protected:
  void FreeImpl(pten::Allocation* allocation) override;
  pten::Allocation* AllocateImpl(size_t size) override;

 private:
  platform::CUDAPlace place_;
  std::once_flag once_flag_;
};

} // namespace allocation
} // namespace memory
} // namespace paddle
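For orientation, a hypothetical direct-use sketch of the new class (in Paddle the AllocatorFacade constructs it internally; this assumes the base `Allocator` exposes an `Allocate(size)` entry point that forwards to `AllocateImpl`, as elsewhere in this codebase):

```cpp
#include "paddle/fluid/memory/allocation/cuda_managed_allocator.h"

using paddle::memory::allocation::CUDAManagedAllocator;

void ManagedAllocationDemo() {
  // Hypothetical usage; in production the AllocatorFacade owns the allocator.
  CUDAManagedAllocator allocator(paddle::platform::CUDAPlace(0));
  auto allocation = allocator.Allocate(1 << 20);  // 1 MiB of managed memory
  // The returned pointer is usable from both host and device code; the
  // allocation is released through FreeImpl() when it goes out of scope.
  void* ptr = allocation->ptr();
  (void)ptr;
}
```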