Support allocating CUDA managed memory #39075

Merged
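This PR adds a CUDAManagedAllocator that serves allocations from CUDA managed (unified) memory instead of ordinary device memory, gated behind the new `FLAGS_use_cuda_managed_memory` flag. For orientation, here is a minimal standalone sketch of what managed memory provides, written against the plain CUDA runtime API (illustrative background only, not code from this PR): one pointer valid on both host and device, with pages migrated on demand by the driver.

```cpp
// Minimal CUDA managed-memory demo: no explicit cudaMemcpy anywhere.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void Scale(float* data, int n, float factor) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= factor;
}

int main() {
  const int n = 1 << 20;
  float* data = nullptr;
  // cudaMallocManaged is the primitive the new allocator ultimately relies on.
  cudaMallocManaged(&data, n * sizeof(float));
  for (int i = 0; i < n; ++i) data[i] = 1.0f;       // written on the host
  Scale<<<(n + 255) / 256, 256>>>(data, n, 2.0f);   // read/written on the device
  cudaDeviceSynchronize();
  std::printf("data[0] = %f\n", data[0]);           // read back on the host
  cudaFree(data);
  return 0;
}
```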
17 changes: 14 additions & 3 deletions paddle/fluid/memory/CMakeLists.txt
@@ -20,18 +20,29 @@ if (WITH_GPU)
  nv_test(stream_safe_cuda_alloc_test
          SRCS stream_safe_cuda_alloc_test.cu
          DEPS malloc cuda_graph_with_memory_pool)
  nv_test(cuda_managed_memory_test
          SRCS cuda_managed_memory_test.cu
          DEPS malloc gpu_info place)

  if(WITH_TESTING AND TEST stream_safe_cuda_alloc_test)
    set_tests_properties(stream_safe_cuda_alloc_test PROPERTIES
-                        ENVIRONMENT "FLAGS_use_stream_safe_cuda_allocator=true;
-                        FLAGS_allocator_strategy=auto_growth")
-  endif()
+                        ENVIRONMENT "FLAGS_use_stream_safe_cuda_allocator=true;FLAGS_allocator_strategy=auto_growth")
+  endif()
endif()

if (WITH_ROCM)
  hip_test(malloc_test
           SRCS malloc_test.cu
           DEPS device_context malloc)
  hip_test(cuda_managed_memory_test
           SRCS cuda_managed_memory_test.cu
           DEPS malloc gpu_info place)
endif()

if(WITH_TESTING AND TEST cuda_managed_memory_test)
  set_tests_properties(cuda_managed_memory_test PROPERTIES
                       ENVIRONMENT "FLAGS_use_cuda_managed_memory=true;FLAGS_allocator_strategy=auto_growth"
                       TIMEOUT 50)
endif()

if(WITH_GPU AND WITH_TESTING AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON")
4 changes: 3 additions & 1 deletion paddle/fluid/memory/allocation/CMakeLists.txt
@@ -15,6 +15,7 @@ endif()

if (WITH_GPU)
  nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
  nv_library(cuda_managed_allocator SRCS cuda_managed_allocator.cc DEPS allocator cuda_device_guard gpu_info)
  nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
  nv_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator)
  nv_library(thread_local_allocator SRCS thread_local_allocator.cc DEPS allocator)
@@ -27,6 +28,7 @@ endif()

if (WITH_ROCM)
  hip_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
  hip_library(cuda_managed_allocator SRCS cuda_managed_allocator.cc DEPS allocator cuda_device_guard gpu_info)
  hip_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
  hip_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator)
  hip_library(thread_local_allocator SRCS thread_local_allocator.cc DEPS allocator)
@@ -42,7 +44,7 @@ endif()
cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator)

if (WITH_GPU OR WITH_ROCM)
-  set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator cuda_device_guard thread_local_allocator stream_safe_cuda_allocator device_context)
+  set(AllocatorFacadeDeps gpu_info cuda_allocator cuda_managed_allocator pinned_allocator cuda_device_guard thread_local_allocator stream_safe_cuda_allocator device_context)
  if(CUDA_VERSION GREATER_EQUAL 10.2)
    list(APPEND AllocatorFacadeDeps cuda_virtual_mem_allocator)
  endif()
55 changes: 44 additions & 11 deletions paddle/fluid/memory/allocation/allocator_facade.cc
@@ -28,6 +28,7 @@
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include <shared_mutex>
#include "paddle/fluid/memory/allocation/cuda_allocator.h"
#include "paddle/fluid/memory/allocation/cuda_managed_allocator.h"
#include "paddle/fluid/memory/allocation/pinned_allocator.h"
#include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h"
#include "paddle/fluid/memory/allocation/thread_local_allocator.h"
@@ -80,6 +81,11 @@ PADDLE_DEFINE_EXPORTED_bool(use_virtual_memory_auto_growth, false,
PADDLE_DEFINE_EXPORTED_bool(use_stream_safe_cuda_allocator, false,
                            "Enable StreamSafeCUDAAllocator");

PADDLE_DEFINE_EXPORTED_bool(use_cuda_managed_memory, false,
                            "Whether to use CUDAManagedAllocator to allocate "
                            "managed memory, only available for the "
                            "auto_growth strategy");

DECLARE_string(allocator_strategy);

namespace paddle {
@@ -436,6 +442,37 @@ class AllocatorFacadePrivate {
        std::make_shared<NaiveBestFitAllocator>(platform::CUDAPinnedPlace());
  }

  void InitNaiveBestFitCUDAAllocator(platform::CUDAPlace p) {
    allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
  }

  // Create a new CUDAAllocator or CUDAManagedAllocator for the given device
  std::shared_ptr<Allocator> CreateCUDAAllocator(platform::CUDAPlace p) {
    if (FLAGS_use_cuda_managed_memory) {
      PADDLE_ENFORCE_EQ(
          strategy_, AllocatorStrategy::kAutoGrowth,
          platform::errors::InvalidArgument(
              "CUDA managed memory is only implemented for the auto_growth "
              "strategy; the %s strategy is not supported.\n"
              "Please use the auto_growth strategy with the command `export "
              "FLAGS_allocator_strategy=\"auto_growth\"`, or disable managed "
              "memory with the command `export "
              "FLAGS_use_cuda_managed_memory=false`.",
              FLAGS_allocator_strategy));

      if (!platform::IsGPUManagedMemorySupported(p.device)) {
        PADDLE_THROW(platform::errors::Unavailable(
            "Failed to create CUDAManagedAllocator on GPU %d.\n\n"
            "You have enabled CUDA managed memory, but the GPU device does "
            "not support allocating managed memory.\n"
            "If you don't actually need managed memory, please disable it "
            "with the command `export FLAGS_use_cuda_managed_memory=false`.\n"
            "Otherwise, please use a GPU device that supports managed memory.",
            p.device));
      }
      return std::make_shared<CUDAManagedAllocator>(p);
    }
    return std::make_shared<CUDAAllocator>(p);
  }
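Note the design choice here: every site that previously constructed a `CUDAAllocator` directly (the auto-growth wrappers, the stream-safe path, and the system allocators further down in this file) now goes through this single `CreateCUDAAllocator` factory, so the managed-memory switch and its validation live in exactly one place.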

  void InitStreamSafeCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) {
    PADDLE_ENFORCE_EQ(
        strategy_, AllocatorStrategy::kAutoGrowth,
@@ -452,13 +489,9 @@
    }
  }

-  void InitNaiveBestFitCUDAAllocator(platform::CUDAPlace p) {
-    allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
-  }

  void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) {
#if defined(PADDLE_WITH_HIP)
-    auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
+    auto cuda_allocator = CreateCUDAAllocator(p);
    cuda_allocators_[p][stream] = std::make_shared<AutoGrowthBestFitAllocator>(
        cuda_allocator, platform::GpuMinChunkSize(), 0, allow_free_idle_chunk_);
#endif
@@ -485,14 +518,14 @@
          std::make_shared<VirtualMemoryAutoGrowthBestFitAllocator>(
              cuda_allocator, platform::GpuMinChunkSize(), p);
    } else {
-      auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
+      auto cuda_allocator = CreateCUDAAllocator(p);
      cuda_allocators_[p][stream] =
          std::make_shared<AutoGrowthBestFitAllocator>(
              cuda_allocator, platform::GpuMinChunkSize(),
              allow_free_idle_chunk_);
    }
#else
-    auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
+    auto cuda_allocator = CreateCUDAAllocator(p);
    auto alignment = platform::GpuMinChunkSize();
    bool need_addr_align = true;
    // NOTE: sometimes, since cuda runtime can not be forked, calling any cuda
@@ -535,7 +568,7 @@ class AllocatorFacadePrivate {
  void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p,
                                   bool allow_free_idle_chunk) {
#if defined(PADDLE_WITH_HIP)
-    auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
+    auto cuda_allocator = CreateCUDAAllocator(p);
    allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
        cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk);
#endif
@@ -562,13 +595,13 @@
          std::make_shared<VirtualMemoryAutoGrowthBestFitAllocator>(
              cuda_allocator, platform::GpuMinChunkSize(), p);
    } else {
-      auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
+      auto cuda_allocator = CreateCUDAAllocator(p);
      allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
          cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk);
    }

#else
-    auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
+    auto cuda_allocator = CreateCUDAAllocator(p);
    auto alignment = platform::GpuMinChunkSize();
    bool need_addr_align = true;
    // NOTE: sometimes, since cuda runtime can not be forked, calling any cuda
@@ -690,7 +723,7 @@ class AllocatorFacadePrivate {
    int device_count = platform::GetGPUDeviceCount();
    for (int i = 0; i < device_count; ++i) {
      platform::CUDAPlace p(i);
-      system_allocators_[p] = std::make_shared<CUDAAllocator>(p);
+      system_allocators_[p] = CreateCUDAAllocator(p);
    }
#endif
#ifdef PADDLE_WITH_MLU
12 changes: 10 additions & 2 deletions paddle/fluid/memory/allocation/cuda_allocator.cc
@@ -67,16 +67,24 @@ pten::Allocation* CUDAAllocator::AllocateImpl(size_t size) {
        limit_size, limit_size);
  }

  std::string managed_memory_msg;
  if (platform::IsGPUManagedMemoryOversubscriptionSupported(place_.device)) {
    managed_memory_msg = string::Sprintf(
        "If the above ways do not solve the out of memory problem, you can "
        "try to use CUDA managed memory. The command is `export "
        "FLAGS_use_cuda_managed_memory=true`.");
  }

  PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
      "\n\nOut of memory error on GPU %d. "
      "Cannot allocate %s memory on GPU %d, %s memory has been allocated and "
      "available memory is only %s.\n\n"
      "Please check whether there is any other process using GPU %d.\n"
      "1. If yes, please stop them, or start PaddlePaddle on another GPU.\n"
-     "2. If no, please decrease the batch size of your model. %s\n\n",
+     "2. If no, please decrease the batch size of your model. %s\n%s\n",
      place_.device, string::HumanReadableSize(size), place_.device,
      string::HumanReadableSize(allocated), string::HumanReadableSize(avail),
-     place_.device, err_msg));
+     place_.device, err_msg, managed_memory_msg));
}

} // namespace allocation
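The new hint is only appended when `IsGPUManagedMemoryOversubscriptionSupported` reports support, because the suggestion only helps on devices where managed allocations can exceed physical GPU memory and page to host on demand. Below is a standalone sketch of that distinction, assuming the usual CUDA attribute check via `concurrentManagedAccess` (the Paddle helper's exact implementation is not shown in this diff):

```cpp
#include <cstdio>
#include <cuda_runtime.h>

int main() {
  int dev = 0;
  int concurrent = 0;
  // Assumption: oversubscription support tracks this device attribute.
  cudaDeviceGetAttribute(&concurrent, cudaDevAttrConcurrentManagedAccess, dev);
  size_t free_bytes = 0, total_bytes = 0;
  cudaMemGetInfo(&free_bytes, &total_bytes);
  if (concurrent) {
    // Request more than the device physically has; with oversubscription
    // support the allocation can still succeed, paging to host memory.
    void* ptr = nullptr;
    cudaError_t err = cudaMallocManaged(&ptr, total_bytes + total_bytes / 2);
    std::printf("oversubscribed alloc: %s\n", cudaGetErrorString(err));
    if (err == cudaSuccess) cudaFree(ptr);
  }
  return 0;
}
```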
86 changes: 86 additions & 0 deletions paddle/fluid/memory/allocation/cuda_managed_allocator.cc
@@ -0,0 +1,86 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/memory/allocation/cuda_managed_allocator.h"

#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
#include <cuda_runtime.h>
#endif

#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#endif

#include <string>
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/enforce.h"

namespace paddle {
namespace memory {
namespace allocation {

bool CUDAManagedAllocator::IsAllocThreadSafe() const { return true; }

void CUDAManagedAllocator::FreeImpl(pten::Allocation* allocation) {
  PADDLE_ENFORCE_EQ(
      allocation->place(), place_,
      platform::errors::PermissionDenied(
          "GPU memory is freed on an incorrect device. This may be a bug."));
  platform::RecordedGpuFree(allocation->ptr(), allocation->size(),
                            place_.device);
  delete allocation;
}

pten::Allocation* CUDAManagedAllocator::AllocateImpl(size_t size) {
  std::call_once(once_flag_, [this] { platform::SetDeviceId(place_.device); });

  int dev_id = place_.device;
  void* ptr;
  auto result = platform::RecordedGpuMalloc(&ptr, size, dev_id,
                                            /* malloc_managed_memory = */ true);
  if (LIKELY(result == gpuSuccess)) {
    return new Allocation(ptr, size, platform::Place(place_));
  }

  uint64_t limit_size = platform::RecordedGpuLimitSize(dev_id);
  uint64_t malloc_size = platform::RecordedGpuMallocSize(dev_id);
  bool is_limited =
      platform::IsGpuMallocRecorded(dev_id) && malloc_size + size > limit_size;

  std::string err_msg;
  if (UNLIKELY(is_limited)) {
    int64_t limit_size_mb = limit_size >> 20;
    err_msg = string::Sprintf(
        "Or set environment variable `FLAGS_gpu_memory_limit_mb` to a larger "
        "value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the maximum "
        "GPU memory usage is limited to %d MB.\n"
        "The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
        limit_size_mb, limit_size_mb);
  }

  PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
      "\n\nOut of memory error on GPU %d. "
      "Cannot allocate %s CUDA managed memory on GPU %d, %s memory has been "
      "allocated.\n\n"
      "Please check whether there is any other process using GPU %d.\n"
      "1. If yes, please stop them, or start PaddlePaddle on another GPU.\n"
      "2. If no, please decrease the batch size of your model. %s\n\n",
      dev_id, string::HumanReadableSize(size), dev_id,
      string::HumanReadableSize(malloc_size), dev_id, err_msg));
}

} // namespace allocation
} // namespace memory
} // namespace paddle
41 changes: 41 additions & 0 deletions paddle/fluid/memory/allocation/cuda_managed_allocator.h
@@ -0,0 +1,41 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <mutex>  // for std::once_flag

#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/place.h"

namespace paddle {
namespace memory {
namespace allocation {

class CUDAManagedAllocator : public Allocator {
 public:
  explicit CUDAManagedAllocator(const platform::CUDAPlace& place)
      : place_(place) {}

  bool IsAllocThreadSafe() const override;

 protected:
  void FreeImpl(pten::Allocation* allocation) override;
  pten::Allocation* AllocateImpl(size_t size) override;

 private:
  platform::CUDAPlace place_;
  std::once_flag once_flag_;
};

} // namespace allocation
} // namespace memory
} // namespace paddle
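For orientation, a hypothetical direct-use sketch of the new class (in Paddle the AllocatorFacade constructs it internally; this assumes the base `Allocator` exposes an `Allocate(size)` entry point that forwards to `AllocateImpl`, as elsewhere in this codebase):

```cpp
#include "paddle/fluid/memory/allocation/cuda_managed_allocator.h"

using paddle::memory::allocation::CUDAManagedAllocator;

void ManagedAllocationDemo() {
  // Hypothetical usage; in production the AllocatorFacade owns the allocator.
  CUDAManagedAllocator allocator(paddle::platform::CUDAPlace(0));
  auto allocation = allocator.Allocate(1 << 20);  // 1 MiB of managed memory
  // The returned pointer is usable from both host and device code; the
  // allocation is released through FreeImpl() when it goes out of scope.
  void* ptr = allocation->ptr();
  (void)ptr;
}
```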