Skip to content

Commit efd4c08

Browse files
authored
Add a CUDA stream pool(rapidsai#659)
Closes rapidsai#613. Adds a simple pool class which contains a circular buffer of `cuda_stream` objects with a `get_stream` method to get a view of one of the streams. Also adds tests and a simple benchmark. Benchmark results. First line is getting a stream from the stream pool. Second is using the RAII `rmm::cuda_stream()` class which calls `cudaStreamCreate()` and `cudaStreamDestroy()`. ``` --------------------------------------------------------------------------------- Benchmark Time CPU Iterations UserCounters... --------------------------------------------------------------------------------- BM_StreamPoolGetStream 0.240 us 0.240 us 2918230 items_per_second=4.16296M/s BM_CudaStreamClass 3.01 us 2.90 us 249073 items_per_second=344.521k/s ``` TODO in a follow up: - Cython/Python bindings - Stream priorities and non-blocking streams, once support is added to `rmm::cuda_stream()`. Authors: - Mark Harris <[email protected]> Approvers: - Keith Kraus - AJ Schmidt - Rong Ou - Jake Hemstad URL: rapidsai#659
1 parent c306e60 commit efd4c08

File tree

8 files changed

+199
-4
lines changed

8 files changed

+199
-4
lines changed

README.md

+8
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,14 @@ RAII semantics (constructor creates the CUDA stream, destructor destroys it). An
213213
can never represent the CUDA default stream or per-thread default stream; it only ever represents
214214
a single non-default stream. `rmm::cuda_stream` cannot be copied, but can be moved.
215215

216+
## `cuda_stream_pool`
217+
218+
`rmm::cuda_stream_pool` provides fast access to a pool of CUDA streams. This class can be used to
219+
create a set of `cuda_stream` objects whose lifetime is equal to the `cuda_stream_pool`. Using the
220+
stream pool can be faster than creating the streams on the fly. The size of the pool is configurable.
221+
Depending on this size, multiple calls to `cuda_stream_pool::get_stream()` may return instances of
222+
`rmm::cuda_stream_view` that represent identical CUDA streams.
223+
216224
### Thread Safety
217225

218226
All current device memory resources are thread safe unless documented otherwise. More specifically,

benchmarks/CMakeLists.txt

+7
Original file line numberDiff line numberDiff line change
@@ -89,3 +89,10 @@ ConfigureBench(REPLAY_BENCH "${REPLAY_BENCH_SRC}")
8989
set(UVECTOR_BENCH_SRC "${CMAKE_CURRENT_SOURCE_DIR}/device_uvector/device_uvector_bench.cu")
9090

9191
ConfigureBench(UVECTOR_BENCH "${UVECTOR_BENCH_SRC}")
92+
93+
# cuda_stream_pool benchmark
94+
95+
set(CUDA_STREAM_POOL_BENCH_SRC
96+
"${CMAKE_CURRENT_SOURCE_DIR}/cuda_stream_pool/cuda_stream_pool_bench.cpp")
97+
98+
ConfigureBench(CUDA_STREAM_POOL_BENCH "${CUDA_STREAM_POOL_BENCH_SRC}")
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
/*
2+
* Copyright (c) 2020, NVIDIA CORPORATION.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either ex ess or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#include <benchmark/benchmark.h>
18+
19+
#include <rmm/cuda_stream_pool.hpp>
20+
#include <rmm/detail/error.hpp>
21+
22+
#include <cuda_runtime_api.h>
23+
24+
#include <stdexcept>
25+
26+
static void BM_StreamPoolGetStream(benchmark::State& state)
27+
{
28+
rmm::cuda_stream_pool stream_pool{};
29+
30+
for (auto _ : state) {
31+
auto s = stream_pool.get_stream();
32+
auto e = cudaStreamQuery(s.value());
33+
}
34+
35+
state.SetItemsProcessed(state.iterations());
36+
}
37+
BENCHMARK(BM_StreamPoolGetStream)->Unit(benchmark::kMicrosecond);
38+
39+
static void BM_CudaStreamClass(benchmark::State& state)
40+
{
41+
for (auto _ : state) {
42+
auto s = rmm::cuda_stream{};
43+
auto e = cudaStreamQuery(s.view().value());
44+
}
45+
46+
state.SetItemsProcessed(state.iterations());
47+
}
48+
BENCHMARK(BM_CudaStreamClass)->Unit(benchmark::kMicrosecond);
49+
50+
BENCHMARK_MAIN();

conda/recipes/librmm/meta.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ test:
3636
- test -f $PREFIX/include/rmm/logger.hpp
3737
- test -f $PREFIX/include/rmm/cuda_stream.hpp
3838
- test -f $PREFIX/include/rmm/cuda_stream_view.hpp
39+
- test -f $PREFIX/include/rmm/cuda_stream_pool.hpp
3940
- test -f $PREFIX/include/rmm/device_uvector.hpp
4041
- test -f $PREFIX/include/rmm/device_scalar.hpp
4142
- test -f $PREFIX/include/rmm/device_buffer.hpp

include/rmm/cuda_stream_pool.hpp

+69
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
/*
2+
* Copyright (c) 2020, NVIDIA CORPORATION.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#pragma once
18+
19+
#include <rmm/cuda_stream.hpp>
20+
#include <rmm/cuda_stream_view.hpp>
21+
22+
#include <atomic>
23+
#include <vector>
24+
25+
namespace rmm {
26+
27+
/**
28+
* @brief A pool of CUDA streams.
29+
*
30+
* Provides efficient access to collection of CUDA stream objects.
31+
*
32+
* Successive calls may return a `cuda_stream_view` of identical streams. For example, a possible
33+
* implementation is to maintain a circular buffer of `cuda_stream` objects.
34+
*/
35+
class cuda_stream_pool {
36+
public:
37+
static constexpr std::size_t default_size{16}; ///< Default stream pool size
38+
39+
/**
40+
* @brief Construct a new cuda stream pool object of the given size
41+
*
42+
* @param pool_size The number of streams in the pool
43+
*/
44+
explicit cuda_stream_pool(std::size_t pool_size = default_size) : streams_(pool_size) {}
45+
~cuda_stream_pool() = default;
46+
47+
cuda_stream_pool(cuda_stream_pool&&) = delete;
48+
cuda_stream_pool(cuda_stream_pool const&) = delete;
49+
cuda_stream_pool& operator=(cuda_stream_pool&&) = delete;
50+
cuda_stream_pool& operator=(cuda_stream_pool const&) = delete;
51+
52+
/**
53+
* @brief Get a `cuda_stream_view` of a stream in the pool.
54+
*
55+
* This function is thread safe with respect to other calls to the same function.
56+
*
57+
* @return rmm::cuda_stream_view
58+
*/
59+
rmm::cuda_stream_view get_stream() const noexcept
60+
{
61+
return streams_[(next_stream++) % streams_.size()].view();
62+
}
63+
64+
private:
65+
std::vector<rmm::cuda_stream> streams_;
66+
mutable std::atomic_size_t next_stream{};
67+
};
68+
69+
} // namespace rmm

tests/CMakeLists.txt

+2-2
Original file line numberDiff line numberDiff line change
@@ -131,8 +131,8 @@ ConfigureTest(HOST_MR_TEST "${HOST_MR_TEST_SRC}")
131131

132132
# cuda stream tests
133133

134-
set(CUDA_STREAM_TEST_SRC "${CMAKE_CURRENT_SOURCE_DIR}/cuda_stream_tests.cpp")
135-
134+
set(CUDA_STREAM_TEST_SRC "${CMAKE_CURRENT_SOURCE_DIR}/cuda_stream_tests.cpp"
135+
"${CMAKE_CURRENT_SOURCE_DIR}/cuda_stream_pool_tests.cpp")
136136
ConfigureTest(CUDA_STREAM_TEST "${CUDA_STREAM_TEST_SRC}")
137137

138138
# device buffer tests

tests/cuda_stream_pool_tests.cpp

+60
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
/*
2+
* Copyright (c) 2020, NVIDIA CORPORATION.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#include <rmm/cuda_stream_pool.hpp>
18+
#include <rmm/detail/error.hpp>
19+
#include <rmm/device_uvector.hpp>
20+
21+
#include <gtest/gtest.h>
22+
23+
#include <cuda_runtime_api.h>
24+
25+
struct CudaStreamPoolTest : public ::testing::Test {
26+
rmm::cuda_stream_pool pool{};
27+
};
28+
29+
TEST_F(CudaStreamPoolTest, Unequal)
30+
{
31+
auto const stream_a = this->pool.get_stream();
32+
auto const stream_b = this->pool.get_stream();
33+
34+
EXPECT_NE(stream_a, stream_b);
35+
}
36+
37+
TEST_F(CudaStreamPoolTest, Nondefault)
38+
{
39+
auto const stream_a = this->pool.get_stream();
40+
auto const stream_b = this->pool.get_stream();
41+
42+
// pool streams are explicit, non-default streams
43+
EXPECT_FALSE(stream_a.is_default());
44+
EXPECT_FALSE(stream_a.is_per_thread_default());
45+
}
46+
47+
TEST_F(CudaStreamPoolTest, ValidStreams)
48+
{
49+
auto const stream_a = this->pool.get_stream();
50+
auto const stream_b = this->pool.get_stream();
51+
52+
// Operations on the streams should work correctly and without throwing exceptions
53+
auto v = rmm::device_uvector<std::uint8_t>{100, stream_a};
54+
RMM_CUDA_TRY(cudaMemsetAsync(v.data(), 0xcc, 100, stream_a.value()));
55+
stream_a.synchronize();
56+
57+
auto v2 = rmm::device_uvector<uint8_t>{v, stream_b};
58+
auto x = v2.front_element(stream_b);
59+
EXPECT_EQ(x, 0xcc);
60+
}

tests/cuda_stream_tests.cpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -14,14 +14,14 @@
1414
* limitations under the License.
1515
*/
1616

17-
#include "gtest/gtest.h"
18-
1917
#include <rmm/cuda_stream.hpp>
2018
#include <rmm/cuda_stream_view.hpp>
2119
#include <rmm/device_buffer.hpp>
2220

2321
#include <cuda_runtime_api.h>
2422

23+
#include <gtest/gtest.h>
24+
2525
struct CudaStreamTest : public ::testing::Test {
2626
};
2727

0 commit comments

Comments
 (0)