From 006a6bc053bcbc0f2f56b33a9c9b9d5cdc525ccb Mon Sep 17 00:00:00 2001
From: Nikoli Dryden <ndryden@users.noreply.github.com>
Date: Fri, 26 Feb 2021 06:12:05 -0800
Subject: [PATCH 1/2] Store internal stream in request objects.

---
 include/aluminum/ht_impl.hpp   | 11 ++++++++---
 include/aluminum/nccl_impl.hpp | 11 ++++++++---
 2 files changed, 16 insertions(+), 6 deletions(-)
diff --git a/include/aluminum/ht_impl.hpp b/include/aluminum/ht_impl.hpp
index c09f2926..a08c164f 100644
--- a/include/aluminum/ht_impl.hpp
+++ b/include/aluminum/ht_impl.hpp
@@ -86,14 +86,18 @@ void finalize();
 
 /** Represents a request for the host-transfer backend. */
 struct HostTransferRequest {
-  HostTransferRequest(cudaEvent_t op_event_, cudaStream_t orig_stream_) :
-    op_event(op_event_), orig_stream(orig_stream_) {}
+  HostTransferRequest(cudaEvent_t op_event_, cudaStream_t orig_stream_,
+                      cudaStream_t internal_stream_) :
+    op_event(op_event_), orig_stream(orig_stream_),
+    internal_stream(internal_stream_) {}
   // Note: Not thread safe!
   ~HostTransferRequest() { cuda::release_cuda_event(op_event); }
   /** Event pending on completion of the operation. */
   cudaEvent_t op_event;
   /** Original stream associated with the operation. */
   cudaStream_t orig_stream;
+  /** Internal stream the operation is running on. */
+  cudaStream_t internal_stream;
 };
 
 }  // namespace ht
@@ -799,7 +803,8 @@ class HostTransferBackend {
     cudaStream_t internal_stream, comm_type& comm, req_type& req) {
     cudaEvent_t event = internal::cuda::get_cuda_event();
     AL_CHECK_CUDA(cudaEventRecord(event, internal_stream));
-    req = std::make_shared<internal::ht::HostTransferRequest>(event, comm.get_stream());
+    req = std::make_shared<internal::ht::HostTransferRequest>(
+      event, comm.get_stream(), internal_stream);
   }
 
   /** Run a host-transfer allreduce. */
diff --git a/include/aluminum/nccl_impl.hpp b/include/aluminum/nccl_impl.hpp
index bb8f460f..97106501 100644
--- a/include/aluminum/nccl_impl.hpp
+++ b/include/aluminum/nccl_impl.hpp
@@ -147,14 +147,18 @@ template <> inline ncclDataType_t TypeMap<double>() { return ncclDouble; }
 
 /** Represents a request for the NCCL backend. */
 struct NCCLRequest {
-  NCCLRequest(cudaEvent_t op_event_, cudaStream_t orig_stream_) :
-    op_event(op_event_), orig_stream(orig_stream_) {}
+  NCCLRequest(cudaEvent_t op_event_, cudaStream_t orig_stream_,
+              cudaStream_t internal_stream_) :
+    op_event(op_event_), orig_stream(orig_stream_),
+    internal_stream(internal_stream_) {}
   // Note: Not thread safe!
   ~NCCLRequest() { cuda::release_cuda_event(op_event); }
   /** Event pending on completion of the operation. */
   cudaEvent_t op_event;
   /** Original stream associated with the operation. */
   cudaStream_t orig_stream;
+  /** Internal stream the operation is running on. */
+  cudaStream_t internal_stream;
 };
 
 /**
@@ -789,7 +793,8 @@ class NCCLBackend {
     cudaStream_t internal_stream, comm_type& comm, req_type& req) {
     cudaEvent_t event = internal::cuda::get_cuda_event();
     AL_CHECK_CUDA(cudaEventRecord(event, internal_stream));
-    req = std::make_shared<internal::nccl::NCCLRequest>(event, comm.get_stream());
+    req = std::make_shared<internal::nccl::NCCLRequest>(
+      event, comm.get_stream(), internal_stream);
   }
 
   // These are thin wrappers around the actual NCCL calls.

From 94e9dbe87e11f2bc5075a5cbff5ae7961bbaa57a Mon Sep 17 00:00:00 2001
From: Nikoli Dryden <ndryden@users.noreply.github.com>
Date: Thu, 4 Mar 2021 12:27:57 -0800
Subject: [PATCH 2/2] Support replacing the internal stream pool.

---
 include/aluminum/cuda.hpp | 11 +++++++++++
 src/cuda.cpp              | 21 +++++++++++++++++++--
 2 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/include/aluminum/cuda.hpp b/include/aluminum/cuda.hpp
index e6323b34..2bab5ed3 100644
--- a/include/aluminum/cuda.hpp
+++ b/include/aluminum/cuda.hpp
@@ -31,6 +31,7 @@
 
 #include <utility>
 #include <sstream>
+#include <functional>
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <cuda_fp16.h>
@@ -149,6 +150,16 @@ void release_cuda_event(cudaEvent_t event);
 cudaStream_t get_internal_stream();
 /** Get a specific internal stream. */
 cudaStream_t get_internal_stream(size_t id);
+/**
+ * Replace the internal stream pool with user-provided streams.
+ *
+ * stream_getter may be called an arbitrary number of times and should
+ * return the streams to use in the pool.
+ *
+ * This is meant to help interface with external applications that
+ * need Aluminum to use their streams for everything.
+ */
+void replace_internal_streams(std::function<cudaStream_t()> stream_getter);
 
 /** Return whether stream memory operations are supported. */
 bool stream_memory_operations_supported();
diff --git a/src/cuda.cpp b/src/cuda.cpp
index 9329fdce..f3cdec10 100644
--- a/src/cuda.cpp
+++ b/src/cuda.cpp
@@ -46,6 +46,8 @@ constexpr int num_internal_streams = 5;
 cudaStream_t internal_streams[num_internal_streams];
 // Whether stream memory operations are supported.
 bool stream_mem_ops_supported = false;
+// Whether we're using external streams (these are not freed).
+bool using_external_streams = false;
 }
 
 void init(int&, char**&) {
@@ -89,8 +91,10 @@ void finalize() {
   for (auto&& event : cuda_events) {
     AL_CHECK_CUDA(cudaEventDestroy(event));
   }
-  for (int i = 0; i < num_internal_streams; ++i) {
-    AL_CHECK_CUDA(cudaStreamDestroy(internal_streams[i]));
+  if (!using_external_streams) {
+    for (int i = 0; i < num_internal_streams; ++i) {
+      AL_CHECK_CUDA(cudaStreamDestroy(internal_streams[i]));
+    }
   }
 }
 
@@ -120,6 +124,19 @@ cudaStream_t get_internal_stream(size_t id) {
   return internal_streams[id];
 }
 
+void replace_internal_streams(std::function<cudaStream_t()> stream_getter) {
+  // Clean up our streams if needed.
+  if (!using_external_streams) {
+    for (int i = 0; i < num_internal_streams; ++i) {
+      AL_CHECK_CUDA(cudaStreamDestroy(internal_streams[i]));
+    }
+  }
+  for (int i = 0; i < num_internal_streams; ++i) {
+    internal_streams[i] = stream_getter();
+  }
+  using_external_streams = true;
+}
+
 bool stream_memory_operations_supported() {
   return stream_mem_ops_supported;
 }