Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

UCT/CUDA_COPY: Fix ep_flush logic; Remove sync_streams usage #10493

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions src/uct/cuda/cuda_copy/cuda_copy_ep.c
Original file line number Diff line number Diff line change
Expand Up @@ -158,9 +158,6 @@ uct_cuda_copy_post_cuda_async_copy(uct_ep_h tl_ep, void *dst, void *src,
ucs_queue_push(event_q, &cuda_event->queue);
cuda_event->comp = comp;

UCS_STATIC_BITMAP_SET(&iface->streams_to_sync,
uct_cuda_copy_flush_bitmap_idx(src_type, dst_type));

ucs_trace("cuda async issued: %p dst:%p[%s], src:%p[%s] len:%ld",
cuda_event, dst, ucs_memory_type_names[dst_type], src,
ucs_memory_type_names[src_type], length);
Expand Down
46 changes: 7 additions & 39 deletions src/uct/cuda/cuda_copy/cuda_copy_iface.c
Original file line number Diff line number Diff line change
Expand Up @@ -141,51 +141,17 @@ static ucs_status_t uct_cuda_copy_iface_query(uct_iface_h tl_iface,
return UCS_OK;
}

/**
 * Synchronize every CUDA stream that is marked in iface->streams_to_sync.
 *
 * Each set bit is decoded into a (source, destination) memory type pair which
 * selects an entry of iface->queue_desc; the stream of that entry is passed to
 * cuStreamSynchronize(). The bit is cleared only after its stream was
 * synchronized successfully.
 *
 * @param [in] iface  Interface whose marked streams are synchronized.
 *
 * @return UCS_OK if no synchronization failed, otherwise the status produced
 *         for the first failing cuStreamSynchronize() call.
 */
static ucs_status_t uct_cuda_copy_sync_streams(uct_cuda_copy_iface_t *iface)
{
    ucs_memory_type_t src_type, dst_type;
    ucs_status_t status;
    uint32_t bit_idx;
    CUstream stream;

    UCS_STATIC_BITMAP_FOR_EACH_BIT(bit_idx, &iface->streams_to_sync) {
        /* Decode the flat bit index: row is the source, column the dest */
        src_type = bit_idx / UCS_MEMORY_TYPE_LAST;
        dst_type = bit_idx % UCS_MEMORY_TYPE_LAST;

        /* A row beyond the known memory types has no queue descriptor */
        if (src_type >= UCS_MEMORY_TYPE_LAST) {
            break;
        }

        stream = iface->queue_desc[src_type][dst_type].stream;
        status = UCT_CUDADRV_FUNC_LOG_ERR(cuStreamSynchronize(stream));
        if (status != UCS_OK) {
            /* Leave the bit set, since this stream was not synchronized */
            return status;
        }

        UCS_STATIC_BITMAP_RESET(&iface->streams_to_sync,
                                uct_cuda_copy_flush_bitmap_idx(src_type,
                                                               dst_type));
    }

    return UCS_OK;
}

static ucs_status_t uct_cuda_copy_iface_flush(uct_iface_h tl_iface, unsigned flags,
uct_completion_t *comp)
{
uct_cuda_copy_iface_t *iface = ucs_derived_of(tl_iface, uct_cuda_copy_iface_t);
uct_cuda_copy_queue_desc_t *q_desc;
ucs_queue_iter_t iter;
ucs_status_t status;

if (comp != NULL) {
return UCS_ERR_UNSUPPORTED;
}

status = uct_cuda_copy_sync_streams(iface);
if (status != UCS_OK) {
return status;
}

ucs_queue_for_each_safe(q_desc, iter, &iface->active_queue, queue) {
if (!ucs_queue_is_empty(&q_desc->event_queue)) {
UCT_TL_IFACE_STAT_FLUSH_WAIT(ucs_derived_of(tl_iface,
Expand Down Expand Up @@ -318,11 +284,14 @@ uct_cuda_copy_ep_flush(uct_ep_h tl_ep, unsigned flags, uct_completion_t *comp)
{
uct_cuda_copy_iface_t *iface = ucs_derived_of(tl_ep->iface,
uct_cuda_copy_iface_t);
ucs_status_t status;
uct_cuda_copy_queue_desc_t *q_desc;
ucs_queue_iter_t iter;

status = uct_cuda_copy_sync_streams(iface);
if (status != UCS_OK) {
return status;
ucs_queue_for_each_safe(q_desc, iter, &iface->active_queue, queue) {
if (!ucs_queue_is_empty(&q_desc->event_queue)) {
UCT_TL_EP_STAT_FLUSH_WAIT(ucs_derived_of(tl_ep, uct_base_ep_t));
return UCS_INPROGRESS;
}
}

return uct_base_ep_flush(tl_ep, flags, comp);
Expand Down Expand Up @@ -492,7 +461,6 @@ static UCS_CLASS_INIT_FUNC(uct_cuda_copy_iface_t, uct_md_h md, uct_worker_h work
self->config.max_poll = config->max_poll;
self->config.max_cuda_events = config->max_cuda_events;
self->config.bandwidth = config->bandwidth;
UCS_STATIC_BITMAP_RESET_ALL(&self->streams_to_sync);

ucs_mpool_params_reset(&mp_params);
mp_params.elem_size = sizeof(uct_cuda_copy_event_desc_t);
Expand Down
31 changes: 0 additions & 31 deletions src/uct/cuda/cuda_copy/cuda_copy_iface.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,25 +20,6 @@
typedef uint64_t uct_cuda_copy_iface_addr_t;


/*
uct_cu_stream_bitmap_t will be treated as a 2D bitmap, in which
each bit represents a CUstream from the queue_desc attr:
row index is source mem_type and column index is the dest mem_type.

For example:
H - Host, C - Cuda, R - ROCm, I - Infiniband (RDMA)

H C R I
H 0 0 0 0
C 0 0 0 0
R 0 0 0 0
I 0 0 0 0

Bits are set using:
UCS_STATIC_BITMAP_SET(&bitmap, uct_cuda_copy_flush_bitmap_idx(src_mem_type, dst_mem_type))
*/
typedef ucs_static_bitmap_s(UCT_CUDA_MEMORY_TYPES_MAP) uct_cu_stream_bitmap_t;

typedef struct uct_cuda_copy_queue_desc {
/* stream on which asynchronous memcpy operations are enqueued */
CUstream stream;
Expand Down Expand Up @@ -76,10 +57,6 @@ typedef struct uct_cuda_copy_iface {
void *event_arg;
uct_async_event_cb_t event_cb;
} async;

/* 2D bitmap representing which streams in queue_desc matrix
should sync during flush */
uct_cu_stream_bitmap_t streams_to_sync;
} uct_cuda_copy_iface_t;


Expand All @@ -97,12 +74,4 @@ typedef struct uct_cuda_copy_event_desc {
ucs_queue_elem_t queue;
} uct_cuda_copy_event_desc_t;


/**
 * Convert a (source, destination) memory type pair into a flat bit index.
 *
 * The index is laid out row by row: the source memory type selects the row
 * (of UCS_MEMORY_TYPE_LAST entries) and the destination memory type selects
 * the column within that row.
 *
 * @param [in] src_mem_type  Source memory type (row).
 * @param [in] dst_mem_type  Destination memory type (column).
 *
 * @return src_mem_type * UCS_MEMORY_TYPE_LAST + dst_mem_type.
 */
static UCS_F_ALWAYS_INLINE unsigned
uct_cuda_copy_flush_bitmap_idx(ucs_memory_type_t src_mem_type,
                               ucs_memory_type_t dst_mem_type)
{
    unsigned row_start = src_mem_type * UCS_MEMORY_TYPE_LAST;

    return row_start + dst_mem_type;
}

#endif
Loading