From 006a6bc053bcbc0f2f56b33a9c9b9d5cdc525ccb Mon Sep 17 00:00:00 2001 From: Nikoli Dryden Date: Fri, 26 Feb 2021 06:12:05 -0800 Subject: [PATCH 1/2] Store internal stream in request objects. --- include/aluminum/ht_impl.hpp | 11 ++++++++--- include/aluminum/nccl_impl.hpp | 11 ++++++++--- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/include/aluminum/ht_impl.hpp b/include/aluminum/ht_impl.hpp index c09f2926..a08c164f 100644 --- a/include/aluminum/ht_impl.hpp +++ b/include/aluminum/ht_impl.hpp @@ -86,14 +86,18 @@ void finalize(); /** Represents a request for the host-transfer backend. */ struct HostTransferRequest { - HostTransferRequest(cudaEvent_t op_event_, cudaStream_t orig_stream_) : - op_event(op_event_), orig_stream(orig_stream_) {} + HostTransferRequest(cudaEvent_t op_event_, cudaStream_t orig_stream_, + cudaStream_t internal_stream_) : + op_event(op_event_), orig_stream(orig_stream_), + internal_stream(internal_stream_) {} // Note: Not thread safe! ~HostTransferRequest() { cuda::release_cuda_event(op_event); } /** Event pending on completion of the operation. */ cudaEvent_t op_event; /** Original stream associated with the operation. */ cudaStream_t orig_stream; + /** Internal stream the operation is running on. */ + cudaStream_t internal_stream; }; } // namespace ht @@ -799,7 +803,8 @@ class HostTransferBackend { cudaStream_t internal_stream, comm_type& comm, req_type& req) { cudaEvent_t event = internal::cuda::get_cuda_event(); AL_CHECK_CUDA(cudaEventRecord(event, internal_stream)); - req = std::make_shared(event, comm.get_stream()); + req = std::make_shared( + event, comm.get_stream(), internal_stream); } /** Run a host-transfer allreduce. */ diff --git a/include/aluminum/nccl_impl.hpp b/include/aluminum/nccl_impl.hpp index bb8f460f..97106501 100644 --- a/include/aluminum/nccl_impl.hpp +++ b/include/aluminum/nccl_impl.hpp @@ -147,14 +147,18 @@ template <> inline ncclDataType_t TypeMap() { return ncclDouble; } /** Represents a request for the NCCL backend. */ struct NCCLRequest { - NCCLRequest(cudaEvent_t op_event_, cudaStream_t orig_stream_) : - op_event(op_event_), orig_stream(orig_stream_) {} + NCCLRequest(cudaEvent_t op_event_, cudaStream_t orig_stream_, + cudaStream_t internal_stream_) : + op_event(op_event_), orig_stream(orig_stream_), + internal_stream(internal_stream_) {} // Note: Not thread safe! ~NCCLRequest() { cuda::release_cuda_event(op_event); } /** Event pending on completion of the operation. */ cudaEvent_t op_event; /** Original stream associated with the operation. */ cudaStream_t orig_stream; + /** Internal stream the operation is running on. */ + cudaStream_t internal_stream; }; /** @@ -789,7 +793,8 @@ class NCCLBackend { cudaStream_t internal_stream, comm_type& comm, req_type& req) { cudaEvent_t event = internal::cuda::get_cuda_event(); AL_CHECK_CUDA(cudaEventRecord(event, internal_stream)); - req = std::make_shared(event, comm.get_stream()); + req = std::make_shared( + event, comm.get_stream(), internal_stream); } // These are thin wrappers around the actual NCCL calls. From 94e9dbe87e11f2bc5075a5cbff5ae7961bbaa57a Mon Sep 17 00:00:00 2001 From: Nikoli Dryden Date: Thu, 4 Mar 2021 12:27:57 -0800 Subject: [PATCH 2/2] Support replacing the internal stream pool. --- include/aluminum/cuda.hpp | 11 +++++++++++ src/cuda.cpp | 21 +++++++++++++++++++-- 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/include/aluminum/cuda.hpp b/include/aluminum/cuda.hpp index e6323b34..2bab5ed3 100644 --- a/include/aluminum/cuda.hpp +++ b/include/aluminum/cuda.hpp @@ -31,6 +31,7 @@ #include #include +#include #include #include #include @@ -149,6 +150,16 @@ void release_cuda_event(cudaEvent_t event); cudaStream_t get_internal_stream(); /** Get a specific internal stream. */ cudaStream_t get_internal_stream(size_t id); +/** + * Replace the internal stream pool with user-provided streams. + * + * stream_getter may be called an arbitrary number of times and should + * return the streams to use in the pool. + * + * This is meant to help interface with external applications that + * need Aluminum to use their streams for everything. + */ +void replace_internal_streams(std::function stream_getter); /** Return whether stream memory operations are supported. */ bool stream_memory_operations_supported(); diff --git a/src/cuda.cpp b/src/cuda.cpp index 9329fdce..f3cdec10 100644 --- a/src/cuda.cpp +++ b/src/cuda.cpp @@ -46,6 +46,8 @@ constexpr int num_internal_streams = 5; cudaStream_t internal_streams[num_internal_streams]; // Whether stream memory operations are supported. bool stream_mem_ops_supported = false; +// Whether we're using external streams (these are not freed). +bool using_external_streams = false; } void init(int&, char**&) { @@ -89,8 +91,10 @@ void finalize() { for (auto&& event : cuda_events) { AL_CHECK_CUDA(cudaEventDestroy(event)); } - for (int i = 0; i < num_internal_streams; ++i) { - AL_CHECK_CUDA(cudaStreamDestroy(internal_streams[i])); + if (!using_external_streams) { + for (int i = 0; i < num_internal_streams; ++i) { + AL_CHECK_CUDA(cudaStreamDestroy(internal_streams[i])); + } } } @@ -120,6 +124,19 @@ cudaStream_t get_internal_stream(size_t id) { return internal_streams[id]; } +void replace_internal_streams(std::function stream_getter) { + // Clean up our streams if needed. + if (!using_external_streams) { + for (int i = 0; i < num_internal_streams; ++i) { + AL_CHECK_CUDA(cudaStreamDestroy(internal_streams[i])); + } + } + for (int i = 0; i < num_internal_streams; ++i) { + internal_streams[i] = stream_getter(); + } + using_external_streams = true; +} + bool stream_memory_operations_supported() { return stream_mem_ops_supported; }