Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -53,13 +53,24 @@ inline bool IsInterpretercoreFastGCEnabled() {
// When using cuda graph, fast GC must be used. Because
// `EventQuery` method in event GC cannot be used in
// cuda graph.
PADDLE_ENFORCE_EQ(memory::allocation::AllocatorFacade::Instance()
.IsStreamSafeCUDAAllocatorUsed() == true &&
memory::allocation::AllocatorFacade::Instance()
.IsCUDAMallocAsyncAllocatorUsed() == true,
false,
platform::errors::InvalidArgument(
"StreamSafeAllocator and AsyncAllocator shouldn't be "
"True together."));
PADDLE_ENFORCE_EQ(memory::allocation::AllocatorFacade::Instance()
.IsStreamSafeCUDAAllocatorUsed() == false &&
memory::allocation::AllocatorFacade::Instance()
.IsCUDAMallocAsyncAllocatorUsed() == false &&
FLAGS_new_executor_use_cuda_graph,
false,
platform::errors::InvalidArgument(
"When FLAGS_new_executor_use_cuda_graph is true, "
"IsStreamSafeCUDAAllocatorUsed must be true, but "
"Either IsStreamSafeCUDAAllocatorUsed or "
"IsCUDAMallocAsyncAllocatorUsed must be true, but "
"got false."));
return (memory::allocation::AllocatorFacade::Instance()
.IsStreamSafeCUDAAllocatorUsed() &&
Expand Down
10 changes: 10 additions & 0 deletions paddle/fluid/memory/allocation/allocator_facade.cc
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,11 @@ class AllocatorFacadePrivate {
// application, treating it separately can avoid lots of overhead of
// acquiring default stream and applying read-write lock.
if (FLAGS_use_cuda_malloc_async_allocator) {
PADDLE_ENFORCE_EQ(FLAGS_use_cuda_managed_memory,
false,
platform::errors::InvalidArgument(
"Async allocator cannot be used with CUDA "
"managed memory."));
WrapCUDAMallocAsyncAllocatorForDefault();
is_cuda_malloc_async_allocator_used_ = true;
} else {
Expand Down Expand Up @@ -871,6 +876,11 @@ class AllocatorFacadePrivate {
"the allocator strategy %d is unsupported for multi-stream",
static_cast<int>(strategy_)));
if (FLAGS_use_cuda_malloc_async_allocator) {
PADDLE_ENFORCE_EQ(
FLAGS_use_cuda_managed_memory,
false,
platform::errors::InvalidArgument(
"Async allocator cannot be used with CUDA managed memory."));
VLOG(8) << "[CUDAMallocAsyncAllocator] Init CUDA allocator for stream "
<< stream << " in place " << p;
InitCUDAMallocAsyncAllocator(p, stream);
Expand Down
5 changes: 3 additions & 2 deletions test/cpp/fluid/memory/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -116,8 +116,9 @@ if(WITH_TESTING AND TEST cuda_managed_memory_test)
cuda_managed_memory_test
PROPERTIES
ENVIRONMENT
"FLAGS_use_cuda_managed_memory=true;FLAGS_allocator_strategy=auto_growth"
TIMEOUT 50)
"FLAGS_use_cuda_managed_memory=true;FLAGS_use_cuda_malloc_async_allocator=false;FLAGS_allocator_strategy=auto_growth"
TIMEOUT
50)
endif()

if(WITH_GPU AND WITH_TESTING)
Expand Down
26 changes: 26 additions & 0 deletions test/cpp/fluid/memory/stream_safe_cuda_alloc_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,14 @@
#include <hip/hip_runtime.h>
#endif

// Skip the current gtest case early when the stream-safe CUDA allocator is
// not in use (the stream-safe alloc tests are meaningless without it).
// Wrapped in the do { ... } while (0) idiom so the macro expands to a single
// statement: the bare `{ ... }` form followed by the caller's `;` creates an
// extra empty statement and breaks when used in an if/else branch.
#define RETURN_IF_NOT_ENABLED                              \
  do {                                                     \
    if (!memory::allocation::AllocatorFacade::Instance()   \
             .IsStreamSafeCUDAAllocatorUsed()) {           \
      return;                                              \
    }                                                      \
  } while (0)

namespace paddle {
namespace memory {

Expand All @@ -54,6 +62,8 @@ void CheckMemLeak(const platform::CUDAPlace &place) {
}

TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) {
RETURN_IF_NOT_ENABLED;

platform::CUDAPlace place = platform::CUDAPlace();
size_t alloc_size = 256;

Expand Down Expand Up @@ -81,6 +91,8 @@ TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) {
}

TEST(StreamSafeCUDAAllocInterfaceTest, GetAllocatorInterfaceTest) {
RETURN_IF_NOT_ENABLED;

platform::CUDAPlace place = platform::CUDAPlace();
size_t alloc_size = 256;

Expand All @@ -104,6 +116,8 @@ TEST(StreamSafeCUDAAllocInterfaceTest, GetAllocatorInterfaceTest) {
}

TEST(StreamSafeCUDAAllocInterfaceTest, GetAllocatorWithDefaultStreamTest) {
RETURN_IF_NOT_ENABLED;

auto &instance = allocation::AllocatorFacade::Instance();
platform::CUDAPlace place = platform::CUDAPlace();
const std::shared_ptr<Allocator> allocator_implicit_stream =
Expand All @@ -118,6 +132,8 @@ TEST(StreamSafeCUDAAllocInterfaceTest, GetAllocatorWithDefaultStreamTest) {
}

TEST(StreamSafeCUDAAllocInterfaceTest, ZeroSizeRecordStreamTest) {
RETURN_IF_NOT_ENABLED;

platform::CUDAPlace place = platform::CUDAPlace();
std::shared_ptr<Allocation> zero_size_allocation = AllocShared(place, 0);
EXPECT_EQ(zero_size_allocation->ptr(), nullptr);
Expand All @@ -139,6 +155,8 @@ TEST(StreamSafeCUDAAllocInterfaceTest, ZeroSizeRecordStreamTest) {
}

TEST(StreamSafeCUDAAllocInterfaceTest, GetStreamInterfaceTest) {
RETURN_IF_NOT_ENABLED;

platform::CUDAPlace place = platform::CUDAPlace();
size_t alloc_size = 256;

Expand Down Expand Up @@ -176,6 +194,8 @@ TEST(StreamSafeCUDAAllocInterfaceTest, GetStreamInterfaceTest) {
}

TEST(StreamSafeCUDAAllocRetryTest, RetryTest) {
RETURN_IF_NOT_ENABLED;

platform::CUDAPlace place = platform::CUDAPlace();
gpuStream_t stream1, stream2;
#ifdef PADDLE_WITH_CUDA
Expand Down Expand Up @@ -403,17 +423,23 @@ class StreamSafeCUDAAllocTest : public ::testing::Test {
};

// Multi-stream allocation test: runs the fixture's multi-stream workload and
// then verifies the results. Skipped entirely when the stream-safe CUDA
// allocator is not enabled (e.g. when the async malloc allocator is active).
TEST_F(StreamSafeCUDAAllocTest, CUDAMutilStreamTest) {
RETURN_IF_NOT_ENABLED;

MultiStreamRun();
CheckResult();
}

// Multi-thread + multi-stream variant of the allocation test above; exercises
// concurrent allocation across threads before verifying results. Skipped when
// the stream-safe CUDA allocator is not enabled.
TEST_F(StreamSafeCUDAAllocTest, CUDAMutilThreadMutilStreamTest) {
RETURN_IF_NOT_ENABLED;

MultiThreadMultiStreamRun();
CheckResult();
}

#if (defined(PADDLE_WITH_CUDA) && (CUDA_VERSION >= 11000))
TEST_F(StreamSafeCUDAAllocTest, CUDAGraphTest) {
RETURN_IF_NOT_ENABLED;

MultiStreamRun();
CUDAGraphRun();
CheckResult();
Expand Down
2 changes: 2 additions & 0 deletions test/legacy_test/test_auto_growth_allocator_gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
{
'FLAGS_allocator_strategy': 'auto_growth',
'FLAGS_auto_growth_chunk_size_in_mb': 10,
# Async allocator does not support auto growth allocator.
'FLAGS_use_cuda_malloc_async_allocator': 0,
}
)

Expand Down