diff --git a/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.h b/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.h
index 9c9b40631eaa93..6208130a67ca75 100644
--- a/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.h
+++ b/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.h
@@ -53,13 +53,24 @@ inline bool IsInterpretercoreFastGCEnabled() {
   // When using cuda graph, fast GC must be used. Because
   // `EventQuery` method in event GC cannot be used in
   // cuda graph.
+  PADDLE_ENFORCE_EQ(memory::allocation::AllocatorFacade::Instance()
+                            .IsStreamSafeCUDAAllocatorUsed() == true &&
+                        memory::allocation::AllocatorFacade::Instance()
+                            .IsCUDAMallocAsyncAllocatorUsed() == true,
+                    false,
+                    platform::errors::InvalidArgument(
+                        "StreamSafeAllocator and AsyncAllocator should not "
+                        "be enabled at the same time."));
   PADDLE_ENFORCE_EQ(memory::allocation::AllocatorFacade::Instance()
                             .IsStreamSafeCUDAAllocatorUsed() == false &&
+                        memory::allocation::AllocatorFacade::Instance()
+                            .IsCUDAMallocAsyncAllocatorUsed() == false &&
                         FLAGS_new_executor_use_cuda_graph,
                     false,
                     platform::errors::InvalidArgument(
                         "When FLAGS_new_executor_use_cuda_graph is true, "
-                        "IsStreamSafeCUDAAllocatorUsed must be true, but "
+                        "either IsStreamSafeCUDAAllocatorUsed or "
+                        "IsCUDAMallocAsyncAllocatorUsed must be true, but "
                         "got false."));
   return (memory::allocation::AllocatorFacade::Instance()
               .IsStreamSafeCUDAAllocatorUsed() &&
diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index eef6c1a1e8c4ac..b81bfd0400d99f 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -264,6 +264,11 @@ class AllocatorFacadePrivate {
       // application, treating it separately can avoid lots of overhead of
       // acquiring default stream and applying read-write lock.
       if (FLAGS_use_cuda_malloc_async_allocator) {
+        PADDLE_ENFORCE_EQ(FLAGS_use_cuda_managed_memory,
+                          false,
+                          platform::errors::InvalidArgument(
+                              "Async allocator cannot be used with CUDA "
+                              "managed memory."));
         WrapCUDAMallocAsyncAllocatorForDefault();
         is_cuda_malloc_async_allocator_used_ = true;
       } else {
@@ -871,6 +876,11 @@ class AllocatorFacadePrivate {
             "the allocator strategy %d is unsupported for multi-stream",
             static_cast<int>(strategy_)));
     if (FLAGS_use_cuda_malloc_async_allocator) {
+      PADDLE_ENFORCE_EQ(
+          FLAGS_use_cuda_managed_memory,
+          false,
+          platform::errors::InvalidArgument(
+              "Async allocator cannot be used with CUDA managed memory."));
       VLOG(8) << "[CUDAMallocAsyncAllocator] Init CUDA allocator for stream "
               << stream << " in place " << p;
       InitCUDAMallocAsyncAllocator(p, stream);
diff --git a/test/cpp/fluid/memory/CMakeLists.txt b/test/cpp/fluid/memory/CMakeLists.txt
index 5bb36f73982287..a7c2e6df4411c6 100644
--- a/test/cpp/fluid/memory/CMakeLists.txt
+++ b/test/cpp/fluid/memory/CMakeLists.txt
@@ -116,8 +116,9 @@ if(WITH_TESTING AND TEST cuda_managed_memory_test)
     cuda_managed_memory_test
     PROPERTIES
     ENVIRONMENT
-    "FLAGS_use_cuda_managed_memory=true;FLAGS_allocator_strategy=auto_growth"
-    TIMEOUT 50)
+    "FLAGS_use_cuda_managed_memory=true;FLAGS_use_cuda_malloc_async_allocator=false;FLAGS_allocator_strategy=auto_growth"
+    TIMEOUT
+    50)
 endif()

 if(WITH_GPU AND WITH_TESTING)
diff --git a/test/cpp/fluid/memory/stream_safe_cuda_alloc_test.cu b/test/cpp/fluid/memory/stream_safe_cuda_alloc_test.cu
index b0bebf5202eee2..91e896c803bec0 100644
--- a/test/cpp/fluid/memory/stream_safe_cuda_alloc_test.cu
+++ b/test/cpp/fluid/memory/stream_safe_cuda_alloc_test.cu
@@ -33,6 +33,14 @@
 #include
 #endif

+#define RETURN_IF_NOT_ENABLED                            \
+  {                                                      \
+    if (!memory::allocation::AllocatorFacade::Instance() \
+             .IsStreamSafeCUDAAllocatorUsed()) {         \
+      return;                                            \
+    }                                                    \
+  }
+
 namespace paddle {
 namespace memory {

@@ -54,6 +62,8 @@ void CheckMemLeak(const platform::CUDAPlace &place) {
 }

 TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) {
+  RETURN_IF_NOT_ENABLED;
+
   platform::CUDAPlace place = platform::CUDAPlace();
   size_t alloc_size = 256;

@@ -81,6 +91,8 @@ TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) {
 }

 TEST(StreamSafeCUDAAllocInterfaceTest, GetAllocatorInterfaceTest) {
+  RETURN_IF_NOT_ENABLED;
+
   platform::CUDAPlace place = platform::CUDAPlace();
   size_t alloc_size = 256;

@@ -104,6 +116,8 @@ TEST(StreamSafeCUDAAllocInterfaceTest, GetAllocatorInterfaceTest) {
 }

 TEST(StreamSafeCUDAAllocInterfaceTest, GetAllocatorWithDefaultStreamTest) {
+  RETURN_IF_NOT_ENABLED;
+
   auto &instance = allocation::AllocatorFacade::Instance();
   platform::CUDAPlace place = platform::CUDAPlace();
   const std::shared_ptr allocator_implicit_stream =
@@ -118,6 +132,8 @@ TEST(StreamSafeCUDAAllocInterfaceTest, GetAllocatorWithDefaultStreamTest) {
 }

 TEST(StreamSafeCUDAAllocInterfaceTest, ZeroSizeRecordStreamTest) {
+  RETURN_IF_NOT_ENABLED;
+
   platform::CUDAPlace place = platform::CUDAPlace();
   std::shared_ptr zero_size_allocation = AllocShared(place, 0);
   EXPECT_EQ(zero_size_allocation->ptr(), nullptr);
@@ -139,6 +155,8 @@ TEST(StreamSafeCUDAAllocInterfaceTest, ZeroSizeRecordStreamTest) {
 }

 TEST(StreamSafeCUDAAllocInterfaceTest, GetStreamInterfaceTest) {
+  RETURN_IF_NOT_ENABLED;
+
   platform::CUDAPlace place = platform::CUDAPlace();
   size_t alloc_size = 256;

@@ -176,6 +194,8 @@ TEST(StreamSafeCUDAAllocInterfaceTest, GetStreamInterfaceTest) {
 }

 TEST(StreamSafeCUDAAllocRetryTest, RetryTest) {
+  RETURN_IF_NOT_ENABLED;
+
   platform::CUDAPlace place = platform::CUDAPlace();
   gpuStream_t stream1, stream2;
 #ifdef PADDLE_WITH_CUDA
@@ -403,17 +423,23 @@ class StreamSafeCUDAAllocTest : public ::testing::Test {
 };

 TEST_F(StreamSafeCUDAAllocTest, CUDAMutilStreamTest) {
+  RETURN_IF_NOT_ENABLED;
+
   MultiStreamRun();
   CheckResult();
 }

 TEST_F(StreamSafeCUDAAllocTest, CUDAMutilThreadMutilStreamTest) {
+  RETURN_IF_NOT_ENABLED;
+
   MultiThreadMultiStreamRun();
   CheckResult();
 }

 #if (defined(PADDLE_WITH_CUDA) && (CUDA_VERSION >= 11000))
 TEST_F(StreamSafeCUDAAllocTest, CUDAGraphTest) {
+  RETURN_IF_NOT_ENABLED;
+
   MultiStreamRun();
   CUDAGraphRun();
   CheckResult();
diff --git a/test/legacy_test/test_auto_growth_allocator_gpu.py b/test/legacy_test/test_auto_growth_allocator_gpu.py
index 3ac11c1baf86fb..c20c825032d6ac 100644
--- a/test/legacy_test/test_auto_growth_allocator_gpu.py
+++ b/test/legacy_test/test_auto_growth_allocator_gpu.py
@@ -25,6 +25,8 @@
     {
         'FLAGS_allocator_strategy': 'auto_growth',
         'FLAGS_auto_growth_chunk_size_in_mb': 10,
+        # Async allocator does not support auto growth allocator.
+        'FLAGS_use_cuda_malloc_async_allocator': 0,
     }
 )
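
Note (not part of the patch): the allocator-compatibility rules enforced by the new PADDLE_ENFORCE_EQ checks can be summarized in a small stand-alone sketch. It deliberately treats the IsStreamSafeCUDAAllocatorUsed() / IsCUDAMallocAsyncAllocatorUsed() queries as if they simply mirrored the corresponding FLAGS_* switches, which is a simplification of the real AllocatorFacade logic; the function name and parameter names below are illustrative, not Paddle APIs.

# Illustrative summary of the constraints added in this change.
def allocator_config_is_valid(
    use_cuda_malloc_async_allocator: bool,
    use_cuda_managed_memory: bool,
    stream_safe_cuda_allocator_used: bool,
    new_executor_use_cuda_graph: bool,
) -> bool:
    # allocator_facade.cc: the async allocator rejects CUDA managed memory.
    if use_cuda_malloc_async_allocator and use_cuda_managed_memory:
        return False
    # garbage_collector.h: the stream-safe and async allocators are exclusive.
    if stream_safe_cuda_allocator_used and use_cuda_malloc_async_allocator:
        return False
    # garbage_collector.h: CUDA graph capture requires fast GC, so one of the
    # two allocators above must be active.
    if new_executor_use_cuda_graph and not (
        stream_safe_cuda_allocator_used or use_cuda_malloc_async_allocator
    ):
        return False
    return True

# Example: async allocator together with managed memory is rejected.
assert not allocator_config_is_valid(True, True, False, False)
# Example: CUDA graph with the stream-safe allocator is accepted.
assert allocator_config_is_valid(False, False, True, True)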