From 57ca7e8879e2d1ce73d57232d81af883722eea85 Mon Sep 17 00:00:00 2001
From: Gerlof Fokkema <gerlof.fokkema@gmail.com>
Date: Fri, 14 Jan 2022 17:38:43 +0100
Subject: [PATCH 1/3] Add support for CUDA Graphs.

---
 src/cpp/cuda.hpp             | 61 ++++++++++++++++++++++++++++++++++++
 src/wrapper/wrap_cudadrv.cpp | 35 +++++++++++++++++++++
 2 files changed, 96 insertions(+)
diff --git a/src/cpp/cuda.hpp b/src/cpp/cuda.hpp
index 21cb219c..147f9c1a 100644
--- a/src/cpp/cuda.hpp
+++ b/src/cpp/cuda.hpp
@@ -990,6 +990,7 @@ namespace pycuda
 
   // {{{ stream
   class event;
+  class graph;
 
   class stream : public boost::noncopyable, public context_dependent
   {
@@ -1022,6 +1023,10 @@ namespace pycuda
 #if CUDAPP_CUDA_VERSION >= 3020
       void wait_for_event(const event &evt);
 #endif
+#if CUDAPP_CUDA_VERSION >= 10000
+      void begin_capture(CUstreamCaptureMode mode);
+      graph *end_capture();
+#endif
 
       bool is_done() const
       {
@@ -1042,6 +1047,62 @@ namespace pycuda
 
   // }}}
 
+  // {{{ graph
+#if CUDAPP_CUDA_VERSION >= 10000
+  class graph_exec : public boost::noncopyable, public context_dependent
+  {
+    private:
+      CUgraphExec m_exec;
+
+    public:
+      graph_exec(CUgraphExec exec)
+        : m_exec(exec)
+      { }
+
+      void launch(py::object stream_py)
+      {
+        PYCUDA_PARSE_STREAM_PY;
+        CUDAPP_CALL_GUARDED(cuGraphLaunch, (m_exec, s_handle))
+      }
+  };
+
+  class graph : public boost::noncopyable, public context_dependent
+  {
+    private:
+      CUgraph m_graph;
+
+    public:
+      graph(CUgraph graph)
+        : m_graph(graph)
+      { }
+
+      graph_exec *instance()
+      {
+        CUgraphExec instance;
+        CUDAPP_CALL_GUARDED(cuGraphInstantiate, (&instance, m_graph, NULL, NULL, 0))
+        return new graph_exec(instance);
+      }
+
+      void debug_dot_print(std::string path)
+      {
+        CUDAPP_CALL_GUARDED(cuGraphDebugDotPrint, (m_graph, path.c_str(), 0))
+      }
+  };
+
+  inline void stream::begin_capture(CUstreamCaptureMode mode = CU_STREAM_CAPTURE_MODE_GLOBAL)
+  {
+    CUDAPP_CALL_GUARDED(cuStreamBeginCapture, (m_stream, mode));
+  }
+
+  inline graph *stream::end_capture()
+  {
+    CUgraph result;
+    CUDAPP_CALL_GUARDED(cuStreamEndCapture, (m_stream, &result))
+    return new graph(result);
+  }
+#endif
+  // }}}
+
   // {{{ array
   class array : public boost::noncopyable, public context_dependent
   {
diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp
index 6d95edad..054f85a9 100644
--- a/src/wrapper/wrap_cudadrv.cpp
+++ b/src/wrapper/wrap_cudadrv.cpp
@@ -1196,6 +1196,13 @@ BOOST_PYTHON_MODULE(_driver)
   // }}}
 
   // {{{ stream
+#if CUDAPP_CUDA_VERSION >= 10000
+  py::enum_<CUstreamCaptureMode>("capture_mode")
+    .value("GLOBAL", CU_STREAM_CAPTURE_MODE_GLOBAL)
+    .value("THREAD_LOCAL", CU_STREAM_CAPTURE_MODE_THREAD_LOCAL)
+    .value("RELAXED", CU_STREAM_CAPTURE_MODE_RELAXED)
+    ;
+#endif
   {
     typedef stream cl;
     py::class_<cl, boost::noncopyable, shared_ptr<cl> >
@@ -1204,12 +1211,40 @@ BOOST_PYTHON_MODULE(_driver)
       .DEF_SIMPLE_METHOD(is_done)
 #if CUDAPP_CUDA_VERSION >= 3020
       .DEF_SIMPLE_METHOD(wait_for_event)
+#endif
+#if CUDAPP_CUDA_VERSION >= 10000
+      .def("begin_capture", &cl::begin_capture,
+        py::arg("capture_mode") = CU_STREAM_CAPTURE_MODE_GLOBAL)
+      .def("end_capture", &cl::end_capture,
+        py::return_value_policy<py::manage_new_object>())
 #endif
       .add_property("handle", &cl::handle_int)
       ;
   }
   // }}}
 
+  // {{{ graph
+#if CUDAPP_CUDA_VERSION >= 10000
+    ;
+  {
+    typedef graph_exec cl;
+    py::class_<cl, boost::noncopyable>("GraphExec", py::no_init)
+      .def("launch", &cl::launch,
+        py::arg("stream")=py::object())
+        ;
+  }
+
+  {
+    typedef graph cl;
+    py::class_<cl, boost::noncopyable>("Graph", py::no_init)
+      .def("instance", &cl::instance,
+          py::return_value_policy<py::manage_new_object>())
+      .DEF_SIMPLE_METHOD(debug_dot_print)
+      ;
+  }
+#endif
+  // }}}
+
   // {{{ module
   {
     typedef module cl;

From 0d9515a212ff4f7a8d1f888c045435dcf8199b65 Mon Sep 17 00:00:00 2001
From: Gerlof Fokkema <gerlof.fokkema@gmail.com>
Date: Sat, 15 Jan 2022 13:15:57 +0100
Subject: [PATCH 2/3] Add example using new pycuda graph api.

---
 examples/demo_graph.py | 66 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)
 create mode 100644 examples/demo_graph.py

diff --git a/examples/demo_graph.py b/examples/demo_graph.py
new file mode 100644
index 00000000..03284dc5
--- /dev/null
+++ b/examples/demo_graph.py
@@ -0,0 +1,66 @@
+# Sample source code from the Tutorial Introduction in the documentation.
+import pycuda.driver as cuda
+import pycuda.autoinit  # noqa
+from pycuda.compiler import SourceModule
+
+mod = SourceModule("""
+    __global__ void plus(float *a, int num)
+    {
+      int idx = threadIdx.x + threadIdx.y*4;
+      a[idx] += num;
+    }
+
+    __global__ void times(float *a, float *b)
+    {
+      int idx = threadIdx.x + threadIdx.y*4;
+      a[idx] *= b[idx];
+    }
+    """)
+func_plus = mod.get_function("plus")
+func_times = mod.get_function("times")
+
+import numpy
+a = numpy.zeros((4, 4)).astype(numpy.float32)
+a_gpu = cuda.mem_alloc_like(a)
+b = numpy.zeros((4, 4)).astype(numpy.float32)
+b_gpu = cuda.mem_alloc_like(b)
+result = numpy.zeros_like(b)
+
+# begin graph capture, pull stream_2 into it as a dependency
+#  https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#cross-stream-dependencies
+stream_1 = cuda.Stream()
+stream_2 = cuda.Stream()
+stream_1.begin_capture()
+event_init = cuda.Event()
+event_a = cuda.Event()
+event_b = cuda.Event()
+
+event_init.record(stream_1)
+stream_2.wait_for_event(event_init)
+
+cuda.memcpy_htod_async(a_gpu, a, stream_1)
+func_plus(a_gpu, numpy.int32(2), block=(4, 4, 1), stream=stream_1)
+event_a.record(stream_1)
+
+cuda.memcpy_htod_async(b_gpu, b, stream_2)
+func_plus(b_gpu, numpy.int32(3), block=(4, 4, 1), stream=stream_2)
+event_b.record(stream_2)
+
+stream_1.wait_for_event(event_a)
+stream_1.wait_for_event(event_b)
+func_times(a_gpu, b_gpu, block=(4, 4, 1), stream=stream_1)
+cuda.memcpy_dtoh_async(result, a_gpu, stream_1)
+
+graph = stream_1.end_capture()
+graph.debug_dot_print("test.dot")  # print dotfile of graph
+instance = graph.instance()
+
+# using a separate graph stream to launch, this is not strictly necessary
+stream_graph = cuda.Stream()
+instance.launch(stream_graph)
+
+print("original arrays:")
+print(a)
+print(b)
+print("(0+2)x(0+3) = 6, using a kernel graph of 3 kernels:")
+print(result)

From 6f9297d3ab7bc9b6c3e0672e526cd7ea546fc1df Mon Sep 17 00:00:00 2001
From: Gerlof Fokkema <gerlof.fokkema@gmail.com>
Date: Sat, 15 Jan 2022 15:07:13 +0100
Subject: [PATCH 3/3] Catch exceptions by const ref per the cpp guidelines.

Reduces the amount of compiler warnings significantly.
---
 src/cpp/cuda.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/cpp/cuda.hpp b/src/cpp/cuda.hpp
index 147f9c1a..7ef18746 100644
--- a/src/cpp/cuda.hpp
+++ b/src/cpp/cuda.hpp
@@ -159,9 +159,9 @@ typedef Py_ssize_t PYCUDA_BUFFER_SIZE_T;
         << std::endl; \
   }
 #define CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(TYPE) \
-  catch (pycuda::cannot_activate_out_of_thread_context) \
+  catch (pycuda::cannot_activate_out_of_thread_context const&) \
   { } \
-  catch (pycuda::cannot_activate_dead_context) \
+  catch (pycuda::cannot_activate_dead_context const&) \
   { \
     /* PyErr_Warn( \
         PyExc_UserWarning, #TYPE " in dead context was implicitly cleaned up");*/ \