Skip to content

Commit

Permalink
Merge pull request #120 from LLNL/req-streams
Browse files Browse the repository at this point in the history
Associate internal streams with requests
  • Loading branch information
ndryden authored Mar 5, 2021
2 parents fac4809 + 94e9dbe commit 7ae563c
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 8 deletions.
11 changes: 11 additions & 0 deletions include/aluminum/cuda.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@

#include <utility>
#include <sstream>
#include <functional>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
Expand Down Expand Up @@ -149,6 +150,16 @@ void release_cuda_event(cudaEvent_t event);
cudaStream_t get_internal_stream();
/** Get a specific internal stream. */
cudaStream_t get_internal_stream(size_t id);
/**
* Replace the internal stream pool with user-provided streams.
*
* stream_getter may be called an arbitrary number of times and should
* return the streams to use in the pool.
*
* This is meant to help interface with external applications that
* need Aluminum to use their streams for everything.
*/
void replace_internal_streams(std::function<cudaStream_t()> stream_getter);

/** Return whether stream memory operations are supported. */
bool stream_memory_operations_supported();
Expand Down
11 changes: 8 additions & 3 deletions include/aluminum/ht_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,14 +86,18 @@ void finalize();

/** Represents a request for the host-transfer backend. */
struct HostTransferRequest {
HostTransferRequest(cudaEvent_t op_event_, cudaStream_t orig_stream_) :
op_event(op_event_), orig_stream(orig_stream_) {}
HostTransferRequest(cudaEvent_t op_event_, cudaStream_t orig_stream_,
cudaStream_t internal_stream_) :
op_event(op_event_), orig_stream(orig_stream_),
internal_stream(internal_stream_) {}
// Note: Not thread safe!
~HostTransferRequest() { cuda::release_cuda_event(op_event); }
/** Event pending on completion of the operation. */
cudaEvent_t op_event;
/** Original stream associated with the operation. */
cudaStream_t orig_stream;
/** Internal stream the operation is running on. */
cudaStream_t internal_stream;
};

} // namespace ht
Expand Down Expand Up @@ -799,7 +803,8 @@ class HostTransferBackend {
cudaStream_t internal_stream, comm_type& comm, req_type& req) {
cudaEvent_t event = internal::cuda::get_cuda_event();
AL_CHECK_CUDA(cudaEventRecord(event, internal_stream));
req = std::make_shared<internal::ht::HostTransferRequest>(event, comm.get_stream());
req = std::make_shared<internal::ht::HostTransferRequest>(
event, comm.get_stream(), internal_stream);
}

/** Run a host-transfer allreduce. */
Expand Down
11 changes: 8 additions & 3 deletions include/aluminum/nccl_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -147,14 +147,18 @@ template <> inline ncclDataType_t TypeMap<double>() { return ncclDouble; }

/** Represents a request for the NCCL backend. */
struct NCCLRequest {
NCCLRequest(cudaEvent_t op_event_, cudaStream_t orig_stream_) :
op_event(op_event_), orig_stream(orig_stream_) {}
NCCLRequest(cudaEvent_t op_event_, cudaStream_t orig_stream_,
cudaStream_t internal_stream_) :
op_event(op_event_), orig_stream(orig_stream_),
internal_stream(internal_stream_) {}
// Note: Not thread safe!
~NCCLRequest() { cuda::release_cuda_event(op_event); }
/** Event pending on completion of the operation. */
cudaEvent_t op_event;
/** Original stream associated with the operation. */
cudaStream_t orig_stream;
/** Internal stream the operation is running on. */
cudaStream_t internal_stream;
};

/**
Expand Down Expand Up @@ -789,7 +793,8 @@ class NCCLBackend {
cudaStream_t internal_stream, comm_type& comm, req_type& req) {
cudaEvent_t event = internal::cuda::get_cuda_event();
AL_CHECK_CUDA(cudaEventRecord(event, internal_stream));
req = std::make_shared<internal::nccl::NCCLRequest>(event, comm.get_stream());
req = std::make_shared<internal::nccl::NCCLRequest>(
event, comm.get_stream(), internal_stream);
}

// These are thin wrappers around the actual NCCL calls.
Expand Down
21 changes: 19 additions & 2 deletions src/cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ constexpr int num_internal_streams = 5;
cudaStream_t internal_streams[num_internal_streams];
// Whether stream memory operations are supported.
bool stream_mem_ops_supported = false;
// Whether we're using external streams (these are not freed).
bool using_external_streams = false;
}

void init(int&, char**&) {
Expand Down Expand Up @@ -89,8 +91,10 @@ void finalize() {
for (auto&& event : cuda_events) {
AL_CHECK_CUDA(cudaEventDestroy(event));
}
for (int i = 0; i < num_internal_streams; ++i) {
AL_CHECK_CUDA(cudaStreamDestroy(internal_streams[i]));
if (!using_external_streams) {
for (int i = 0; i < num_internal_streams; ++i) {
AL_CHECK_CUDA(cudaStreamDestroy(internal_streams[i]));
}
}
}

Expand Down Expand Up @@ -120,6 +124,19 @@ cudaStream_t get_internal_stream(size_t id) {
return internal_streams[id];
}

void replace_internal_streams(std::function<cudaStream_t()> stream_getter) {
// Clean up our streams if needed.
if (!using_external_streams) {
for (int i = 0; i < num_internal_streams; ++i) {
AL_CHECK_CUDA(cudaStreamDestroy(internal_streams[i]));
}
}
for (int i = 0; i < num_internal_streams; ++i) {
internal_streams[i] = stream_getter();
}
using_external_streams = true;
}

bool stream_memory_operations_supported() {
return stream_mem_ops_supported;
}
Expand Down

0 comments on commit 7ae563c

Please sign in to comment.