Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -53,13 +53,24 @@ inline bool IsInterpretercoreFastGCEnabled() {
// When using cuda graph, fast GC must be used. Because
// `EventQuery` method in event GC cannot be used in
// cuda graph.
PADDLE_ENFORCE_EQ(memory::allocation::AllocatorFacade::Instance()
.IsStreamSafeCUDAAllocatorUsed() == true &&
memory::allocation::AllocatorFacade::Instance()
.IsCUDAMallocAsyncAllocatorUsed() == true,
false,
platform::errors::InvalidArgument(
"StreamSafeAllocator and AsyncAllocator shouldn't be "
"True together."));
PADDLE_ENFORCE_EQ(memory::allocation::AllocatorFacade::Instance()
.IsStreamSafeCUDAAllocatorUsed() == false &&
memory::allocation::AllocatorFacade::Instance()
.IsCUDAMallocAsyncAllocatorUsed() == false &&
FLAGS_new_executor_use_cuda_graph,
false,
platform::errors::InvalidArgument(
"When FLAGS_new_executor_use_cuda_graph is true, "
"IsStreamSafeCUDAAllocatorUsed must be true, but "
"Either IsStreamSafeCUDAAllocatorUsed or "
"IsCUDAMallocAsyncAllocatorUsed must be true, but "
"got false."));
return (memory::allocation::AllocatorFacade::Instance()
.IsStreamSafeCUDAAllocatorUsed() &&
Expand Down
10 changes: 10 additions & 0 deletions paddle/fluid/memory/allocation/allocator_facade.cc
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,11 @@ class AllocatorFacadePrivate {
// application, treating it separately can avoid lots of overhead of
// acquiring default stream and applying read-write lock.
if (FLAGS_use_cuda_malloc_async_allocator) {
PADDLE_ENFORCE_EQ(FLAGS_use_cuda_managed_memory,
false,
platform::errors::InvalidArgument(
"Async allocator cannot be used with CUDA "
"managed memory."));
WrapCUDAMallocAsyncAllocatorForDefault();
is_cuda_malloc_async_allocator_used_ = true;
} else {
Expand Down Expand Up @@ -871,6 +876,11 @@ class AllocatorFacadePrivate {
"the allocator strategy %d is unsupported for multi-stream",
static_cast<int>(strategy_)));
if (FLAGS_use_cuda_malloc_async_allocator) {
PADDLE_ENFORCE_EQ(
FLAGS_use_cuda_managed_memory,
false,
platform::errors::InvalidArgument(
"Async allocator cannot be used with CUDA managed memory."));
VLOG(8) << "[CUDAMallocAsyncAllocator] Init CUDA allocator for stream "
<< stream << " in place " << p;
InitCUDAMallocAsyncAllocator(p, stream);
Expand Down
5 changes: 3 additions & 2 deletions test/cpp/fluid/memory/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -116,8 +116,9 @@ if(WITH_TESTING AND TEST cuda_managed_memory_test)
cuda_managed_memory_test
PROPERTIES
ENVIRONMENT
"FLAGS_use_cuda_managed_memory=true;FLAGS_allocator_strategy=auto_growth"
TIMEOUT 50)
"FLAGS_use_cuda_managed_memory=true;FLAGS_use_cuda_malloc_async_allocator=false;FLAGS_allocator_strategy=auto_growth"
TIMEOUT
50)
endif()

if(WITH_GPU AND WITH_TESTING)
Expand Down
26 changes: 26 additions & 0 deletions test/cpp/fluid/memory/stream_safe_cuda_alloc_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,14 @@
#include <hip/hip_runtime.h>
#endif

// Skip the current gtest case early when the stream-safe CUDA allocator is
// not in use (the stream-safe alloc tests are meaningless without it).
// Wrapped in the do { ... } while (0) idiom so the macro expands to a single
// statement: the bare `{ ... }` form followed by the caller's `;` creates an
// extra empty statement and breaks when used in an if/else branch.
#define RETURN_IF_NOT_ENABLED                              \
  do {                                                     \
    if (!memory::allocation::AllocatorFacade::Instance()   \
             .IsStreamSafeCUDAAllocatorUsed()) {           \
      return;                                              \
    }                                                      \
  } while (0)

namespace paddle {
namespace memory {

Expand All @@ -54,6 +62,8 @@ void CheckMemLeak(const platform::CUDAPlace &place) {
}

TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) {
RETURN_IF_NOT_ENABLED;

platform::CUDAPlace place = platform::CUDAPlace();
size_t alloc_size = 256;

Expand Down Expand Up @@ -81,6 +91,8 @@ TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) {
}

TEST(StreamSafeCUDAAllocInterfaceTest, GetAllocatorInterfaceTest) {
RETURN_IF_NOT_ENABLED;

platform::CUDAPlace place = platform::CUDAPlace();
size_t alloc_size = 256;

Expand All @@ -104,6 +116,8 @@ TEST(StreamSafeCUDAAllocInterfaceTest, GetAllocatorInterfaceTest) {
}

TEST(StreamSafeCUDAAllocInterfaceTest, GetAllocatorWithDefaultStreamTest) {
RETURN_IF_NOT_ENABLED;

auto &instance = allocation::AllocatorFacade::Instance();
platform::CUDAPlace place = platform::CUDAPlace();
const std::shared_ptr<Allocator> allocator_implicit_stream =
Expand All @@ -118,6 +132,8 @@ TEST(StreamSafeCUDAAllocInterfaceTest, GetAllocatorWithDefaultStreamTest) {
}

TEST(StreamSafeCUDAAllocInterfaceTest, ZeroSizeRecordStreamTest) {
RETURN_IF_NOT_ENABLED;

platform::CUDAPlace place = platform::CUDAPlace();
std::shared_ptr<Allocation> zero_size_allocation = AllocShared(place, 0);
EXPECT_EQ(zero_size_allocation->ptr(), nullptr);
Expand All @@ -139,6 +155,8 @@ TEST(StreamSafeCUDAAllocInterfaceTest, ZeroSizeRecordStreamTest) {
}

TEST(StreamSafeCUDAAllocInterfaceTest, GetStreamInterfaceTest) {
RETURN_IF_NOT_ENABLED;

platform::CUDAPlace place = platform::CUDAPlace();
size_t alloc_size = 256;

Expand Down Expand Up @@ -176,6 +194,8 @@ TEST(StreamSafeCUDAAllocInterfaceTest, GetStreamInterfaceTest) {
}

TEST(StreamSafeCUDAAllocRetryTest, RetryTest) {
RETURN_IF_NOT_ENABLED;

platform::CUDAPlace place = platform::CUDAPlace();
gpuStream_t stream1, stream2;
#ifdef PADDLE_WITH_CUDA
Expand Down Expand Up @@ -403,17 +423,23 @@ class StreamSafeCUDAAllocTest : public ::testing::Test {
};

// Multi-stream allocation test: runs the fixture's multi-stream workload and
// then verifies the results. Skipped entirely when the stream-safe CUDA
// allocator is not enabled (e.g. when the async malloc allocator is active).
TEST_F(StreamSafeCUDAAllocTest, CUDAMutilStreamTest) {
RETURN_IF_NOT_ENABLED;

MultiStreamRun();
CheckResult();
}

// Multi-thread + multi-stream variant of the allocation test above; exercises
// concurrent allocation across threads before verifying results. Skipped when
// the stream-safe CUDA allocator is not enabled.
TEST_F(StreamSafeCUDAAllocTest, CUDAMutilThreadMutilStreamTest) {
RETURN_IF_NOT_ENABLED;

MultiThreadMultiStreamRun();
CheckResult();
}

#if (defined(PADDLE_WITH_CUDA) && (CUDA_VERSION >= 11000))
TEST_F(StreamSafeCUDAAllocTest, CUDAGraphTest) {
RETURN_IF_NOT_ENABLED;

MultiStreamRun();
CUDAGraphRun();
CheckResult();
Expand Down
2 changes: 2 additions & 0 deletions test/legacy_test/test_auto_growth_allocator_gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
{
'FLAGS_allocator_strategy': 'auto_growth',
'FLAGS_auto_growth_chunk_size_in_mb': 10,
# Async allocator does not support auto growth allocator.
'FLAGS_use_cuda_malloc_async_allocator': 0,
}
)

Expand Down