From 57ca7e8879e2d1ce73d57232d81af883722eea85 Mon Sep 17 00:00:00 2001 From: Gerlof Fokkema Date: Fri, 14 Jan 2022 17:38:43 +0100 Subject: [PATCH 1/3] Add support for CUDA Graphs. --- src/cpp/cuda.hpp | 61 ++++++++++++++++++++++++++++++++++++ src/wrapper/wrap_cudadrv.cpp | 35 +++++++++++++++++++++ 2 files changed, 96 insertions(+) diff --git a/src/cpp/cuda.hpp b/src/cpp/cuda.hpp index 21cb219c..147f9c1a 100644 --- a/src/cpp/cuda.hpp +++ b/src/cpp/cuda.hpp @@ -990,6 +990,7 @@ namespace pycuda // {{{ stream class event; + class graph; class stream : public boost::noncopyable, public context_dependent { @@ -1022,6 +1023,10 @@ namespace pycuda #if CUDAPP_CUDA_VERSION >= 3020 void wait_for_event(const event &evt); #endif +#if CUDAPP_CUDA_VERSION >= 10000 + void begin_capture(CUstreamCaptureMode mode); + graph *end_capture(); +#endif bool is_done() const { @@ -1042,6 +1047,62 @@ namespace pycuda // }}} + // {{{ graph +#if CUDAPP_CUDA_VERSION >= 10000 + class graph_exec : public boost::noncopyable, public context_dependent + { + private: + CUgraphExec m_exec; + + public: + graph_exec(CUgraphExec exec) + : m_exec(exec) + { } + + void launch(py::object stream_py) + { + PYCUDA_PARSE_STREAM_PY; + CUDAPP_CALL_GUARDED(cuGraphLaunch, (m_exec, s_handle)) + } + }; + + class graph : public boost::noncopyable, public context_dependent + { + private: + CUgraph m_graph; + + public: + graph(CUgraph graph) + : m_graph(graph) + { } + + graph_exec *instance() + { + CUgraphExec instance; + CUDAPP_CALL_GUARDED(cuGraphInstantiate, (&instance, m_graph, NULL, NULL, 0)) + return new graph_exec(instance); + } + + void debug_dot_print(std::string path) + { + CUDAPP_CALL_GUARDED(cuGraphDebugDotPrint, (m_graph, path.c_str(), 0)) + } + }; + + inline void stream::begin_capture(CUstreamCaptureMode mode = CU_STREAM_CAPTURE_MODE_GLOBAL) + { + CUDAPP_CALL_GUARDED(cuStreamBeginCapture, (m_stream, mode)); + } + + inline graph *stream::end_capture() + { + CUgraph result; + CUDAPP_CALL_GUARDED(cuStreamEndCapture, (m_stream, &result)) + return new graph(result); + } +#endif + // }}} + // {{{ array class array : public boost::noncopyable, public context_dependent { diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp index 6d95edad..054f85a9 100644 --- a/src/wrapper/wrap_cudadrv.cpp +++ b/src/wrapper/wrap_cudadrv.cpp @@ -1196,6 +1196,13 @@ BOOST_PYTHON_MODULE(_driver) // }}} // {{{ stream +#if CUDAPP_CUDA_VERSION >= 10000 + py::enum_("capture_mode") + .value("GLOBAL", CU_STREAM_CAPTURE_MODE_GLOBAL) + .value("THREAD_LOCAL", CU_STREAM_CAPTURE_MODE_THREAD_LOCAL) + .value("RELAXED", CU_STREAM_CAPTURE_MODE_RELAXED) + ; +#endif { typedef stream cl; py::class_ > @@ -1204,12 +1211,40 @@ BOOST_PYTHON_MODULE(_driver) .DEF_SIMPLE_METHOD(is_done) #if CUDAPP_CUDA_VERSION >= 3020 .DEF_SIMPLE_METHOD(wait_for_event) +#endif +#if CUDAPP_CUDA_VERSION >= 10000 + .def("begin_capture", &cl::begin_capture, + py::arg("capture_mode") = CU_STREAM_CAPTURE_MODE_GLOBAL) + .def("end_capture", &cl::end_capture, + py::return_value_policy()) #endif .add_property("handle", &cl::handle_int) ; } // }}} + // {{{ graph +#if CUDAPP_CUDA_VERSION >= 10000 + ; + { + typedef graph_exec cl; + py::class_("GraphExec", py::no_init) + .def("launch", &cl::launch, + py::arg("stream")=py::object()) + ; + } + + { + typedef graph cl; + py::class_("Graph", py::no_init) + .def("instance", &cl::instance, + py::return_value_policy()) + .DEF_SIMPLE_METHOD(debug_dot_print) + ; + } +#endif + // }}} + // {{{ module { typedef module cl; From 0d9515a212ff4f7a8d1f888c045435dcf8199b65 Mon Sep 17 00:00:00 2001 From: Gerlof Fokkema Date: Sat, 15 Jan 2022 13:15:57 +0100 Subject: [PATCH 2/3] Add example using new pycuda graph api. --- examples/demo_graph.py | 66 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 examples/demo_graph.py diff --git a/examples/demo_graph.py b/examples/demo_graph.py new file mode 100644 index 00000000..03284dc5 --- /dev/null +++ b/examples/demo_graph.py @@ -0,0 +1,66 @@ +# Sample source code from the Tutorial Introduction in the documentation. +import pycuda.driver as cuda +import pycuda.autoinit # noqa +from pycuda.compiler import SourceModule + +mod = SourceModule(""" + __global__ void plus(float *a, int num) + { + int idx = threadIdx.x + threadIdx.y*4; + a[idx] += num; + } + + __global__ void times(float *a, float *b) + { + int idx = threadIdx.x + threadIdx.y*4; + a[idx] *= b[idx]; + } + """) +func_plus = mod.get_function("plus") +func_times = mod.get_function("times") + +import numpy +a = numpy.zeros((4, 4)).astype(numpy.float32) +a_gpu = cuda.mem_alloc_like(a) +b = numpy.zeros((4, 4)).astype(numpy.float32) +b_gpu = cuda.mem_alloc_like(b) +result = numpy.zeros_like(b) + +# begin graph capture, pull stream_2 into it as a dependency +# https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#cross-stream-dependencies +stream_1 = cuda.Stream() +stream_2 = cuda.Stream() +stream_1.begin_capture() +event_init = cuda.Event() +event_a = cuda.Event() +event_b = cuda.Event() + +event_init.record(stream_1) +stream_2.wait_for_event(event_init) + +cuda.memcpy_htod_async(a_gpu, a, stream_1) +func_plus(a_gpu, numpy.int32(2), block=(4, 4, 1), stream=stream_1) +event_a.record(stream_1) + +cuda.memcpy_htod_async(b_gpu, b, stream_2) +func_plus(b_gpu, numpy.int32(3), block=(4, 4, 1), stream=stream_2) +event_b.record(stream_2) + +stream_1.wait_for_event(event_a) +stream_1.wait_for_event(event_b) +func_times(a_gpu, b_gpu, block=(4, 4, 1), stream=stream_1) +cuda.memcpy_dtoh_async(result, a_gpu, stream_1) + +graph = stream_1.end_capture() +graph.debug_dot_print("test.dot") # print dotfile of graph +instance = graph.instance() + +# using a separate graph stream to launch, this is not strictly necessary +stream_graph = cuda.Stream() +instance.launch(stream_graph) + +print("original arrays:") +print(a) +print(b) +print("(0+2)x(0+3) = 6, using a kernel graph of 3 kernels:") +print(result) From 6f9297d3ab7bc9b6c3e0672e526cd7ea546fc1df Mon Sep 17 00:00:00 2001 From: Gerlof Fokkema Date: Sat, 15 Jan 2022 15:07:13 +0100 Subject: [PATCH 3/3] Catch exceptions by const ref per the cpp guidelines. Reduces the amount of compiler warnings significantly. --- src/cpp/cuda.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cpp/cuda.hpp b/src/cpp/cuda.hpp index 147f9c1a..7ef18746 100644 --- a/src/cpp/cuda.hpp +++ b/src/cpp/cuda.hpp @@ -159,9 +159,9 @@ typedef Py_ssize_t PYCUDA_BUFFER_SIZE_T; << std::endl; \ } #define CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(TYPE) \ - catch (pycuda::cannot_activate_out_of_thread_context) \ + catch (pycuda::cannot_activate_out_of_thread_context const&) \ { } \ - catch (pycuda::cannot_activate_dead_context) \ + catch (pycuda::cannot_activate_dead_context const&) \ { \ /* PyErr_Warn( \ PyExc_UserWarning, #TYPE " in dead context was implicitly cleaned up");*/ \