Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement stream priority feature #321

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions doc/driver.rst
Original file line number Diff line number Diff line change
Expand Up @@ -331,7 +331,7 @@ Constants
CUDA 6.0 and above.

.. versionadded:: 2014.1

.. attribute :: HOST_NATIVE_ATOMIC_SUPPORTED
SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO
PAGEABLE_MEMORY_ACCESS
Expand Down Expand Up @@ -644,6 +644,10 @@ Devices and Contexts

See also :mod:`pycuda.autoinit`.

.. function:: get_stream_priority_range()

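Return a tuple ``(least_priority, greatest_priority)`` holding the numerical values of the least and greatest stream priorities supported by the current context. Lower numbers denote greater priorities. Only available on CUDA 7.5 and above.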

.. class:: Device(number)
Device(pci_bus_id)

Expand Down Expand Up @@ -813,7 +817,7 @@ Devices and Contexts
Concurrency and Streams
-----------------------

.. class:: Stream(flags=0)
.. class:: Stream(flags=0, priority=0)

A handle for a queue of operations that will be carried out in order.

Expand Down
26 changes: 23 additions & 3 deletions src/cpp/cuda.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -531,7 +531,6 @@ namespace pycuda
* to push contexts that are already active at a deeper stack level, so we
* maintain all contexts floating other than the top one.
*/

// for friend decl
namespace gl {
boost::shared_ptr<context>
Expand Down Expand Up @@ -862,6 +861,18 @@ namespace pycuda
return result;
}

#if CUDAPP_CUDA_VERSION >= 7500
  // Queries cuCtxGetStreamPriorityRange and returns the two values it
  // reports as a Python tuple, ordered (least priority, greatest priority).
  inline py::tuple get_stream_priority_range()
  {
    int least = 0;
    int greatest = 0;
    CUDAPP_CALL_GUARDED(cuCtxGetStreamPriorityRange, (&least, &greatest));
    return py::make_tuple(least, greatest);
  }
#endif



#if CUDAPP_CUDA_VERSION >= 7000
inline boost::shared_ptr<context> device::retain_primary_context()
Expand Down Expand Up @@ -997,8 +1008,17 @@ namespace pycuda
CUstream m_stream;

public:
// Creates the underlying CUDA stream through cuStreamCreate, forwarding
// `flags` unchanged and storing the resulting handle in m_stream.
stream(unsigned int flags=0)
{ CUDAPP_CALL_GUARDED(cuStreamCreate, (&m_stream, flags)); }
dmenig marked this conversation as resolved.
Show resolved Hide resolved
dmenig marked this conversation as resolved.
Show resolved Hide resolved
dmenig marked this conversation as resolved.
Show resolved Hide resolved

#if CUDAPP_CUDA_VERSION >= 7500
stream(unsigned int flags=0, int priority=0)
{ CUDAPP_CALL_GUARDED(cuStreamCreateWithPriority, (&m_stream, flags, priority)); }
#else
if (priority != 0)
throw pycuda::error("stream", CUDA_ERROR_INVALID_HANDLE,
"priority!=0 setting isn't supported for your CUDA version");
stream(unsigned int flags=0)
{ CUDAPP_CALL_GUARDED(cuStreamCreate, (&m_stream, flags)); }
#endif

~stream()
{
Expand Down
5 changes: 4 additions & 1 deletion src/wrapper/wrap_cudadrv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1193,13 +1193,16 @@ BOOST_PYTHON_MODULE(_driver)
.add_property("handle", &cl::handle_int)
;
}

#if CUDAPP_CUDA_VERSION >= 7500
  // get_stream_priority_range is only defined for CUDA 7.5 and above, so
  // its Python binding must sit behind the same version guard.
  DEF_SIMPLE_FUNCTION(get_stream_priority_range);
#endif

// }}}

// {{{ stream
{
typedef stream cl;
py::class_<cl, boost::noncopyable, shared_ptr<cl> >
("Stream", py::init<unsigned int>(py::arg("flags")=0))
("Stream", py::init<unsigned int, int>(py::arg("flags")=0, py::arg("priority")=0))
.DEF_SIMPLE_METHOD(synchronize)
.DEF_SIMPLE_METHOD(is_done)
#if CUDAPP_CUDA_VERSION >= 3020
Expand Down
23 changes: 23 additions & 0 deletions test/test_driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -935,6 +935,29 @@ def test_register_host_memory(self):
drv.memcpy_htod_async(gpu_ary, a_pin, stream)
drv.Context.synchronize()

@mark_cuda_test
def test_stream_priority_setting(self):
    """Create a stream at every supported priority and run an async copy on each."""
    if drv.get_version() < (7, 5):
        from py.test import skip

        skip("stream priorities only exist on CUDA 7.5 and later")

    import sys

    if sys.platform == "darwin":
        from py.test import skip

        skip("register_host_memory is not supported on OS X")

    a = drv.aligned_empty((2 ** 20,), np.float64)
    a_pin = drv.register_host_memory(a)

    gpu_ary = drv.mem_alloc_like(a)
    least_priority, greatest_priority = drv.get_stream_priority_range()

    # CUDA uses lower numbers for greater priorities, so the "greatest"
    # priority is normally the numerically smallest value. Normalize the
    # bounds and walk the whole inclusive range deterministically, passing
    # plain Python ints to the binding.
    lowest_value = min(least_priority, greatest_priority)
    highest_value = max(least_priority, greatest_priority)
    for priority in range(lowest_value, highest_value + 1):
        stream = drv.Stream(priority=priority)
        drv.memcpy_htod_async(gpu_ary, a_pin, stream)

    drv.Context.synchronize()

@mark_cuda_test
# https://github.com/inducer/pycuda/issues/45
def test_recursive_launch(self):
Expand Down