Merge pull request #120 from LLNL/req-streams

Associate internal streams with requests
LLNL · Mar 5, 2021 · 7ae563c · 7ae563c
2 parents fac4809 + 94e9dbe
commit 7ae563c
Show file tree

Hide file tree

Showing 4 changed files with 46 additions and 8 deletions.
diff --git a/include/aluminum/cuda.hpp b/include/aluminum/cuda.hpp
@@ -31,6 +31,7 @@
 
 #include <utility>
 #include <sstream>
+#include <functional>
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <cuda_fp16.h>
@@ -149,6 +150,16 @@ void release_cuda_event(cudaEvent_t event);
 cudaStream_t get_internal_stream();
 /** Get a specific internal stream. */
 cudaStream_t get_internal_stream(size_t id);
+/**
+ * Replace the internal stream pool with user-provided streams.
+ *
+ * stream_getter may be called an arbitrary number of times and should
+ * return the streams to use in the pool.
+ *
+ * This is meant to help interface with external applications that
+ * need Aluminum to use their streams for everything.
+ */
+void replace_internal_streams(std::function<cudaStream_t()> stream_getter);
 
 /** Return whether stream memory operations are supported. */
 bool stream_memory_operations_supported();

diff --git a/include/aluminum/ht_impl.hpp b/include/aluminum/ht_impl.hpp
@@ -86,14 +86,18 @@ void finalize();
 
 /** Represents a request for the host-transfer backend. */
 struct HostTransferRequest {
-  HostTransferRequest(cudaEvent_t op_event_, cudaStream_t orig_stream_) :
-    op_event(op_event_), orig_stream(orig_stream_) {}
+  HostTransferRequest(cudaEvent_t op_event_, cudaStream_t orig_stream_,
+                      cudaStream_t internal_stream_) :
+    op_event(op_event_), orig_stream(orig_stream_),
+    internal_stream(internal_stream_) {}
   // Note: Not thread safe!
   ~HostTransferRequest() { cuda::release_cuda_event(op_event); }
   /** Event pending on completion of the operation. */
   cudaEvent_t op_event;
   /** Original stream associated with the operation. */
   cudaStream_t orig_stream;
+  /** Internal stream the operation is running on. */
+  cudaStream_t internal_stream;
 };
 
 }  // namespace ht
@@ -799,7 +803,8 @@ class HostTransferBackend {
     cudaStream_t internal_stream, comm_type& comm, req_type& req) {
     cudaEvent_t event = internal::cuda::get_cuda_event();
     AL_CHECK_CUDA(cudaEventRecord(event, internal_stream));
-    req = std::make_shared<internal::ht::HostTransferRequest>(event, comm.get_stream());
+    req = std::make_shared<internal::ht::HostTransferRequest>(
+      event, comm.get_stream(), internal_stream);
   }
 
   /** Run a host-transfer allreduce. */

diff --git a/include/aluminum/nccl_impl.hpp b/include/aluminum/nccl_impl.hpp
@@ -147,14 +147,18 @@ template <> inline ncclDataType_t TypeMap<double>() { return ncclDouble; }
 
 /** Represents a request for the NCCL backend. */
 struct NCCLRequest {
-  NCCLRequest(cudaEvent_t op_event_, cudaStream_t orig_stream_) :
-    op_event(op_event_), orig_stream(orig_stream_) {}
+  NCCLRequest(cudaEvent_t op_event_, cudaStream_t orig_stream_,
+              cudaStream_t internal_stream_) :
+    op_event(op_event_), orig_stream(orig_stream_),
+    internal_stream(internal_stream_) {}
   // Note: Not thread safe!
   ~NCCLRequest() { cuda::release_cuda_event(op_event); }
   /** Event pending on completion of the operation. */
   cudaEvent_t op_event;
   /** Original stream associated with the operation. */
   cudaStream_t orig_stream;
+  /** Internal stream the operation is running on. */
+  cudaStream_t internal_stream;
 };
 
 /**
@@ -789,7 +793,8 @@ class NCCLBackend {
     cudaStream_t internal_stream, comm_type& comm, req_type& req) {
     cudaEvent_t event = internal::cuda::get_cuda_event();
     AL_CHECK_CUDA(cudaEventRecord(event, internal_stream));
-    req = std::make_shared<internal::nccl::NCCLRequest>(event, comm.get_stream());
+    req = std::make_shared<internal::nccl::NCCLRequest>(
+      event, comm.get_stream(), internal_stream);
   }
 
   // These are thin wrappers around the actual NCCL calls.

diff --git a/src/cuda.cpp b/src/cuda.cpp
@@ -46,6 +46,8 @@ constexpr int num_internal_streams = 5;
 cudaStream_t internal_streams[num_internal_streams];
 // Whether stream memory operations are supported.
 bool stream_mem_ops_supported = false;
+// Whether we're using external streams (these are not freed).
+bool using_external_streams = false;
 }
 
 void init(int&, char**&) {
@@ -89,8 +91,10 @@ void finalize() {
   for (auto&& event : cuda_events) {
     AL_CHECK_CUDA(cudaEventDestroy(event));
   }
-  for (int i = 0; i < num_internal_streams; ++i) {
-    AL_CHECK_CUDA(cudaStreamDestroy(internal_streams[i]));
+  if (!using_external_streams) {
+    for (int i = 0; i < num_internal_streams; ++i) {
+      AL_CHECK_CUDA(cudaStreamDestroy(internal_streams[i]));
+    }
   }
 }
 
@@ -120,6 +124,19 @@ cudaStream_t get_internal_stream(size_t id) {
   return internal_streams[id];
 }
 
+void replace_internal_streams(std::function<cudaStream_t()> stream_getter) {
+  // Clean up our streams if needed.
+  if (!using_external_streams) {
+    for (int i = 0; i < num_internal_streams; ++i) {
+      AL_CHECK_CUDA(cudaStreamDestroy(internal_streams[i]));
+    }
+  }
+  for (int i = 0; i < num_internal_streams; ++i) {
+    internal_streams[i] = stream_getter();
+  }
+  using_external_streams = true;
+}
+
 bool stream_memory_operations_supported() {
   return stream_mem_ops_supported;
 }