cuda_async_memory_resource built on cudaMallocAsync (rapidsai#676)

harrism · web-flow · commit afe237cf5690 · 2021-01-27T18:00:46.000Z
This PR adds a new device memory resource, `cuda_async_memory_resource`, which uses `cudaMallocAsync`. Closes rapidsai#671 Merging this also depends on CI support for CUDA 11.2 TODO: - [x] Extend tests and benchmarks to exercise the new resource - [x] Implement `get_mem_info` correctly. - [x] ~Consider a constructor which takes a CUDA memory pool handle to use (currently uses the default pool)~ Edit: leave this for a followup because pools have multiple parameters and requirements aren't clear. - [ ] Test on a system without cudaMallocAsync support to verify that compiling with CUDA 11.2 but running on an earlier version fails gracefully Authors: - Mark Harris (@harrism) Approvers: - Jake Hemstad (@jrhemstad) - Keith Kraus (@kkraus14) - Leo Fang (@leofang) - Rong Ou (@rongou) URL: rapidsai#676
diff --git a/benchmarks/random_allocations/random_allocations.cpp b/benchmarks/random_allocations/random_allocations.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,6 +18,7 @@
 
 #include <rmm/mr/device/arena_memory_resource.hpp>
 #include <rmm/mr/device/binning_memory_resource.hpp>
+#include <rmm/mr/device/cuda_async_memory_resource.hpp>
 #include <rmm/mr/device/cuda_memory_resource.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
 #include <rmm/mr/device/owning_wrapper.hpp>
@@ -157,6 +158,8 @@ void uniform_random_allocations(rmm::mr::device_memory_resource& mr,
 /// MR factory functions
 inline auto make_cuda() { return std::make_shared<rmm::mr::cuda_memory_resource>(); }
 
+inline auto make_cuda_async() { return std::make_shared<rmm::mr::cuda_async_memory_resource>(); }
+
 inline auto make_pool()
 {
   return rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(make_cuda());
@@ -235,6 +238,9 @@ void declare_benchmark(std::string name)
 {
   if (name == "cuda")
     BENCHMARK_CAPTURE(BM_RandomAllocations, cuda_mr, &make_cuda)->Apply(benchmark_range);
+  if (name == "cuda_async")
+    BENCHMARK_CAPTURE(BM_RandomAllocations, cuda_async_mr, &make_cuda_async)
+      ->Apply(benchmark_range);
   else if (name == "binning")
     BENCHMARK_CAPTURE(BM_RandomAllocations, binning_mr, &make_binning)->Apply(benchmark_range);
   else if (name == "pool")
@@ -288,6 +294,9 @@ int main(int argc, char** argv)
     std::map<std::string, MRFactoryFunc> const funcs({{"arena", &make_arena},
                                                       {"binning", &make_binning},
                                                       {"cuda", &make_cuda},
+#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
+                                                      {"cuda_async", &make_cuda_async},
+#endif
                                                       {"pool", &make_pool}});
     auto resource = args["resource"].as<std::string>();
 
@@ -309,7 +318,11 @@ int main(int argc, char** argv)
       std::string mr_name = args["resource"].as<std::string>();
       declare_benchmark(mr_name);
     } else {
+#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
+      std::array<std::string, 5> mrs{"pool", "binning", "arena", "cuda_async", "cuda"};
+#else
       std::array<std::string, 4> mrs{"pool", "binning", "arena", "cuda"};
+#endif
       std::for_each(std::cbegin(mrs), std::cend(mrs), [](auto const& s) { declare_benchmark(s); });
     }
     ::benchmark::RunSpecifiedBenchmarks();
diff --git a/include/rmm/detail/error.hpp b/include/rmm/detail/error.hpp
@@ -130,7 +130,7 @@ class out_of_range : public std::out_of_range {
 #define GET_RMM_FAIL_MACRO(_1, _2, NAME, ...) NAME
 #define RMM_FAIL_2(_what, _exception_type) \
   throw _exception_type{"RMM failure at:" __FILE__ ":" RMM_STRINGIFY(__LINE__) ": " _what};
-#define RMM_FAIL_1(_what) RMM_FAIL_2(_call, rmm::logic_error)
+#define RMM_FAIL_1(_what) RMM_FAIL_2(_what, rmm::logic_error)
 
 /**
  * @brief Error checking macro for CUDA runtime API functions.
diff --git a/include/rmm/mr/device/cuda_async_memory_resource.hpp b/include/rmm/mr/device/cuda_async_memory_resource.hpp
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <rmm/detail/error.hpp>
+#include <rmm/mr/device/device_memory_resource.hpp>
+#include "rmm/cuda_stream_view.hpp"
+
+#include <cuda_runtime_api.h>
+
+#if CUDART_VERSION >= 11020  // 11.2 introduced cudaMallocAsync
+#define RMM_CUDA_MALLOC_ASYNC_SUPPORT
+#endif
+
+namespace rmm {
+namespace mr {
+
+/**
+ * @brief `device_memory_resource` derived class that uses `cudaMallocAsync`/`cudaFreeAsync` for
+ * allocation/deallocation.
+ */
+class cuda_async_memory_resource final : public device_memory_resource {
+ public:
+  /**
+   * @brief Default constructor
+   *
+   * @throws rmm::runtime_error if the CUDA version does not support `cudaMallocAsync`
+   */
+  cuda_async_memory_resource()
+  {
+#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
+    // Check if cudaMallocAsync Memory pool supported
+    int device{0};
+    RMM_CUDA_TRY(cudaGetDevice(&device));
+    int v{0};
+    auto e = cudaDeviceGetAttribute(&v, cudaDevAttrMemoryPoolsSupported, device);
+    RMM_EXPECTS(e == cudaSuccess && v == 1,
+                "cudaMallocAsync not supported with this CUDA driver/runtime version");
+#else
+    RMM_FAIL("cudaMallocAsync not supported");
+#endif
+  }
+
+  ~cuda_async_memory_resource()                                 = default;
+  cuda_async_memory_resource(cuda_async_memory_resource const&) = default;
+  cuda_async_memory_resource(cuda_async_memory_resource&&)      = default;
+  cuda_async_memory_resource& operator=(cuda_async_memory_resource const&) = default;
+  cuda_async_memory_resource& operator=(cuda_async_memory_resource&&) = default;
+
+  /**
+   * @brief Query whether the resource supports use of non-null CUDA streams for
+   * allocation/deallocation. `cuda_memory_resource` does not support streams.
+   *
+   * @returns bool true
+   */
+  bool supports_streams() const noexcept override { return true; }
+
+  /**
+   * @brief Query whether the resource supports the get_mem_info API.
+   *
+   * @return true
+   */
+  bool supports_get_mem_info() const noexcept override { return false; }
+
+ private:
+  /**
+   * @brief Allocates memory of size at least `bytes` using cudaMalloc.
+   *
+   * The returned pointer has at least 256B alignment.
+   *
+   * @throws `rmm::bad_alloc` if the requested allocation could not be fulfilled
+   *
+   * @param bytes The size, in bytes, of the allocation
+   * @return void* Pointer to the newly allocated memory
+   */
+  void* do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override
+  {
+    void* p{nullptr};
+#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
+    if (bytes > 0) { RMM_CUDA_TRY(cudaMallocAsync(&p, bytes, stream.value()), rmm::bad_alloc); }
+#else
+    (void)bytes;
+    (void)stream;
+#endif
+    return p;
+  }
+
+  /**
+   * @brief Deallocate memory pointed to by \p p.
+   *
+   * @throws Nothing.
+   *
+   * @param p Pointer to be deallocated
+   */
+  void do_deallocate(void* p, std::size_t, rmm::cuda_stream_view stream) override
+  {
+#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
+    if (p != nullptr) { RMM_ASSERT_CUDA_SUCCESS(cudaFreeAsync(p, stream.value())); }
+#else
+    (void)p;
+    (void)stream;
+#endif
+  }
+
+  /**
+   * @brief Compare this resource to another.
+   *
+   * @throws Nothing.
+   *
+   * @param other The other resource to compare to
+   * @return true If the two resources are equivalent
+   * @return false If the two resources are not equal
+   */
+  bool do_is_equal(device_memory_resource const& other) const noexcept override
+  {
+    return dynamic_cast<cuda_async_memory_resource const*>(&other) != nullptr;
+  }
+
+  /**
+   * @brief Get free and available memory for memory resource
+   *
+   * @throws `rmm::cuda_error` if unable to retrieve memory info.
+   *
+   * @return std::pair contaiing free_size and total_size of memory
+   */
+  std::pair<size_t, size_t> do_get_mem_info(rmm::cuda_stream_view) const override
+  {
+    return std::make_pair(0, 0);
+  }
+};
+
+}  // namespace mr
+}  // namespace rmm
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
@@ -98,6 +98,10 @@ ConfigureTest(DEVICE_MR_TEST "${DEVICE_MR_TEST_SRC}")
 set(POOL_MR_TEST_SRC "${CMAKE_CURRENT_SOURCE_DIR}/mr/device/pool_mr_tests.cpp")
 ConfigureTest(POOL_MR_TEST "${POOL_MR_TEST_SRC}")
 
+# cuda_async mr tests
+set(CUDA_ASYNC_MR_TEST_SRC "${CMAKE_CURRENT_SOURCE_DIR}/mr/device/cuda_async_mr_tests.cpp")
+ConfigureTest(CUDA_ASYNC_MR_TEST "${CUDA_ASYNC_MR_TEST_SRC}")
+
 # thrust allocator tests
 
 set(THRUST_ALLOCATOR_TEST_SRC "${CMAKE_CURRENT_SOURCE_DIR}/mr/device/thrust_allocator_tests.cu")
diff --git a/tests/mr/device/cuda_async_mr_tests.cpp b/tests/mr/device/cuda_async_mr_tests.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <rmm/detail/error.hpp>
+#include <rmm/mr/device/cuda_async_memory_resource.hpp>
+
+#include <gtest/gtest.h>
+
+namespace rmm {
+namespace test {
+namespace {
+
+using cuda_async_mr = rmm::mr::cuda_async_memory_resource;
+
+TEST(PoolTest, ThrowIfNotSupported)
+{
+  auto construct_mr = []() { cuda_async_mr mr; };
+#ifndef RMM_CUDA_MALLOC_ASYNC_SUPPORT
+  EXPECT_THROW(construct_mr(), rmm::logic_error);
+#else
+  EXPECT_NO_THROW(construct_mr());
+#endif
+}
+
+}  // namespace
+}  // namespace test
+}  // namespace rmm
diff --git a/tests/mr/device/mr_test.hpp b/tests/mr/device/mr_test.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/mr/device/arena_memory_resource.hpp>
 #include <rmm/mr/device/binning_memory_resource.hpp>
+#include <rmm/mr/device/cuda_async_memory_resource.hpp>
 #include <rmm/mr/device/cuda_memory_resource.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
 #include <rmm/mr/device/fixed_size_memory_resource.hpp>
@@ -224,6 +225,8 @@ struct mr_test : public ::testing::TestWithParam<mr_factory> {
 /// MR factory functions
 inline auto make_cuda() { return std::make_shared<rmm::mr::cuda_memory_resource>(); }
 
+inline auto make_cuda_async() { return std::make_shared<rmm::mr::cuda_async_memory_resource>(); }
+
 inline auto make_managed() { return std::make_shared<rmm::mr::managed_memory_resource>(); }
 
 inline auto make_pool()
diff --git a/tests/mr/device/mr_tests.cpp b/tests/mr/device/mr_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,9 +14,10 @@
  * limitations under the License.
  */
 
-#include <rmm/mr/device/per_device_resource.hpp>
 #include "mr_test.hpp"
 
+#include <rmm/mr/device/per_device_resource.hpp>
+
 #include <gtest/gtest.h>
 
 namespace rmm {
@@ -26,6 +27,9 @@ namespace {
 INSTANTIATE_TEST_CASE_P(ResourceTests,
                         mr_test,
                         ::testing::Values(mr_factory{"CUDA", &make_cuda},
+#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
+                                          mr_factory{"CUDA_Async", &make_cuda_async},
+#endif
                                           mr_factory{"Managed", &make_managed},
                                           mr_factory{"Pool", &make_pool},
                                           mr_factory{"Arena", &make_arena},
diff --git a/tests/mr/device/pool_mr_tests.cpp b/tests/mr/device/pool_mr_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, NVIDIA CORPORATION.
+ * Copyright (c) 2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,14 +14,14 @@
  * limitations under the License.
  */
 
+#include <rmm/detail/aligned.hpp>
 #include <rmm/detail/error.hpp>
 #include <rmm/device_buffer.hpp>
 #include <rmm/mr/device/cuda_memory_resource.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
+#include <rmm/mr/device/limiting_resource_adaptor.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/mr/device/pool_memory_resource.hpp>
-#include "rmm/detail/aligned.hpp"
-#include "rmm/mr/device/limiting_resource_adaptor.hpp"
 
 #include <gtest/gtest.h>