Add a CUDA stream pool(rapidsai#659)

harrism · web-flow · commit efd4c08b4bd4 · 2020-12-16T04:37:29.000Z
Closes rapidsai#613. Adds a simple pool class which contains a circular buffer of `cuda_stream` objects with a `get_stream` method to get a view of one of the streams. Also adds tests and a simple benchmark. Benchmark results. First line is getting a stream from the stream pool. Second is using the RAII `rmm::cuda_stream()` class which calls `cudaStreamCreate()` and `cudaStreamDestroy()`. ``` --------------------------------------------------------------------------------- Benchmark Time CPU Iterations UserCounters... --------------------------------------------------------------------------------- BM_StreamPoolGetStream 0.240 us 0.240 us 2918230 items_per_second=4.16296M/s BM_CudaStreamClass 3.01 us 2.90 us 249073 items_per_second=344.521k/s ``` TODO in a follow up: - Cython/Python bindings - Stream priorities and non-blocking streams, once support is added to `rmm::cuda_stream()`. Authors: - Mark Harris <mharris@nvidia.com> Approvers: - Keith Kraus - AJ Schmidt - Rong Ou - Jake Hemstad URL: rapidsai#659
diff --git a/README.md b/README.md
@@ -213,6 +213,14 @@ RAII semantics (constructor creates the CUDA stream, destructor destroys it). An
 can never represent the CUDA default stream or per-thread default stream; it only ever represents
 a single non-default stream. `rmm::cuda_stream` cannot be copied, but can be moved.
 
+## `cuda_stream_pool`
+
+`rmm::cuda_stream_pool` provides fast access to a pool of CUDA streams. This class can be used to 
+create a set of `cuda_stream` objects whose lifetime is equal to the `cuda_stream_pool`. Using the 
+stream pool can be faster than creating the streams on the fly. The size of the pool is configurable.
+Depending on this size, multiple calls to `cuda_stream_pool::get_stream()` may return instances of 
+`rmm::cuda_stream_view` that represent identical CUDA streams.
+
 ### Thread Safety
 
 All current device memory resources are thread safe unless documented otherwise. More specifically,
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
@@ -89,3 +89,10 @@ ConfigureBench(REPLAY_BENCH "${REPLAY_BENCH_SRC}")
 set(UVECTOR_BENCH_SRC "${CMAKE_CURRENT_SOURCE_DIR}/device_uvector/device_uvector_bench.cu")
 
 ConfigureBench(UVECTOR_BENCH "${UVECTOR_BENCH_SRC}")
+
+# cuda_stream_pool benchmark
+
+set(CUDA_STREAM_POOL_BENCH_SRC
+    "${CMAKE_CURRENT_SOURCE_DIR}/cuda_stream_pool/cuda_stream_pool_bench.cpp")
+
+ConfigureBench(CUDA_STREAM_POOL_BENCH "${CUDA_STREAM_POOL_BENCH_SRC}")
diff --git a/benchmarks/cuda_stream_pool/cuda_stream_pool_bench.cpp b/benchmarks/cuda_stream_pool/cuda_stream_pool_bench.cpp
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2020, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either ex  ess or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmark/benchmark.h>
+
+#include <rmm/cuda_stream_pool.hpp>
+#include <rmm/detail/error.hpp>
+
+#include <cuda_runtime_api.h>
+
+#include <stdexcept>
+
+static void BM_StreamPoolGetStream(benchmark::State& state)
+{
+  rmm::cuda_stream_pool stream_pool{};
+
+  for (auto _ : state) {
+    auto s = stream_pool.get_stream();
+    auto e = cudaStreamQuery(s.value());
+  }
+
+  state.SetItemsProcessed(state.iterations());
+}
+BENCHMARK(BM_StreamPoolGetStream)->Unit(benchmark::kMicrosecond);
+
+static void BM_CudaStreamClass(benchmark::State& state)
+{
+  for (auto _ : state) {
+    auto s = rmm::cuda_stream{};
+    auto e = cudaStreamQuery(s.view().value());
+  }
+
+  state.SetItemsProcessed(state.iterations());
+}
+BENCHMARK(BM_CudaStreamClass)->Unit(benchmark::kMicrosecond);
+
+BENCHMARK_MAIN();
diff --git a/conda/recipes/librmm/meta.yaml b/conda/recipes/librmm/meta.yaml
@@ -36,6 +36,7 @@ test:
     - test -f $PREFIX/include/rmm/logger.hpp
     - test -f $PREFIX/include/rmm/cuda_stream.hpp
     - test -f $PREFIX/include/rmm/cuda_stream_view.hpp
+    - test -f $PREFIX/include/rmm/cuda_stream_pool.hpp
     - test -f $PREFIX/include/rmm/device_uvector.hpp
     - test -f $PREFIX/include/rmm/device_scalar.hpp
     - test -f $PREFIX/include/rmm/device_buffer.hpp
diff --git a/include/rmm/cuda_stream_pool.hpp b/include/rmm/cuda_stream_pool.hpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2020, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <rmm/cuda_stream.hpp>
+#include <rmm/cuda_stream_view.hpp>
+
+#include <atomic>
+#include <vector>
+
+namespace rmm {
+
+/**
+ * @brief A pool of CUDA streams.
+ *
+ * Provides efficient access to collection of CUDA stream objects.
+ *
+ * Successive calls may return a `cuda_stream_view` of identical streams. For example, a possible
+ * implementation is to maintain a circular buffer of `cuda_stream` objects.
+ */
+class cuda_stream_pool {
+ public:
+  static constexpr std::size_t default_size{16};  ///< Default stream pool size
+
+  /**
+   * @brief Construct a new cuda stream pool object of the given size
+   *
+   * @param pool_size The number of streams in the pool
+   */
+  explicit cuda_stream_pool(std::size_t pool_size = default_size) : streams_(pool_size) {}
+  ~cuda_stream_pool() = default;
+
+  cuda_stream_pool(cuda_stream_pool&&)      = delete;
+  cuda_stream_pool(cuda_stream_pool const&) = delete;
+  cuda_stream_pool& operator=(cuda_stream_pool&&) = delete;
+  cuda_stream_pool& operator=(cuda_stream_pool const&) = delete;
+
+  /**
+   * @brief Get a `cuda_stream_view` of a stream in the pool.
+   *
+   * This function is thread safe with respect to other calls to the same function.
+   *
+   * @return rmm::cuda_stream_view
+   */
+  rmm::cuda_stream_view get_stream() const noexcept
+  {
+    return streams_[(next_stream++) % streams_.size()].view();
+  }
+
+ private:
+  std::vector<rmm::cuda_stream> streams_;
+  mutable std::atomic_size_t next_stream{};
+};
+
+}  // namespace rmm
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
@@ -131,8 +131,8 @@ ConfigureTest(HOST_MR_TEST "${HOST_MR_TEST_SRC}")
 
 # cuda stream tests
 
-set(CUDA_STREAM_TEST_SRC "${CMAKE_CURRENT_SOURCE_DIR}/cuda_stream_tests.cpp")
-
+set(CUDA_STREAM_TEST_SRC "${CMAKE_CURRENT_SOURCE_DIR}/cuda_stream_tests.cpp"
+                         "${CMAKE_CURRENT_SOURCE_DIR}/cuda_stream_pool_tests.cpp")
 ConfigureTest(CUDA_STREAM_TEST "${CUDA_STREAM_TEST_SRC}")
 
 # device buffer tests
diff --git a/tests/cuda_stream_pool_tests.cpp b/tests/cuda_stream_pool_tests.cpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2020, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <rmm/cuda_stream_pool.hpp>
+#include <rmm/detail/error.hpp>
+#include <rmm/device_uvector.hpp>
+
+#include <gtest/gtest.h>
+
+#include <cuda_runtime_api.h>
+
+struct CudaStreamPoolTest : public ::testing::Test {
+  rmm::cuda_stream_pool pool{};
+};
+
+TEST_F(CudaStreamPoolTest, Unequal)
+{
+  auto const stream_a = this->pool.get_stream();
+  auto const stream_b = this->pool.get_stream();
+
+  EXPECT_NE(stream_a, stream_b);
+}
+
+TEST_F(CudaStreamPoolTest, Nondefault)
+{
+  auto const stream_a = this->pool.get_stream();
+  auto const stream_b = this->pool.get_stream();
+
+  // pool streams are explicit, non-default streams
+  EXPECT_FALSE(stream_a.is_default());
+  EXPECT_FALSE(stream_a.is_per_thread_default());
+}
+
+TEST_F(CudaStreamPoolTest, ValidStreams)
+{
+  auto const stream_a = this->pool.get_stream();
+  auto const stream_b = this->pool.get_stream();
+
+  // Operations on the streams should work correctly and without throwing exceptions
+  auto v = rmm::device_uvector<std::uint8_t>{100, stream_a};
+  RMM_CUDA_TRY(cudaMemsetAsync(v.data(), 0xcc, 100, stream_a.value()));
+  stream_a.synchronize();
+
+  auto v2 = rmm::device_uvector<uint8_t>{v, stream_b};
+  auto x  = v2.front_element(stream_b);
+  EXPECT_EQ(x, 0xcc);
+}
diff --git a/tests/cuda_stream_tests.cpp b/tests/cuda_stream_tests.cpp
@@ -14,14 +14,14 @@
  * limitations under the License.
  */
 
-#include "gtest/gtest.h"
-
 #include <rmm/cuda_stream.hpp>
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
 
 #include <cuda_runtime_api.h>
 
+#include <gtest/gtest.h>
+
 struct CudaStreamTest : public ::testing::Test {
 };