-
Notifications
You must be signed in to change notification settings - Fork 493
UCP/DEVICE: Make memh and local_addr optional for counter elements #10945
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 9 commits
06c380a
a494c52
60a1131
6a3eb78
205b01e
589b0b4
4573621
b0eff73
10646cd
534057b
dc11fd6
c718cec
2288f1b
b1f2ad0
5c3b023
06a662e
777d06a
62e6732
8219c47
025a1a0
e46a302
c70739c
af75f5c
c967767
c444251
a07231c
1f0d285
0d30594
e9c1382
2bbfa15
438346f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -31,13 +31,17 @@ BEGIN_C_DECLS | |
| * The enumeration allows specifying which fields in @ref | ||
| * ucp_device_mem_list_elem are present. | ||
| * | ||
| * @note Counter elements can omit the @a UCP_DEVICE_MEM_LIST_ELEM_FIELD_MEMH | ||
| * and @a UCP_DEVICE_MEM_LIST_ELEM_FIELD_LOCAL_ADDR fields. | ||
| * Data elements must have @a UCP_DEVICE_MEM_LIST_ELEM_FIELD_MEMH field. | ||
michal-shalev marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| * | ||
| * It is used to enable backward compatibility support. | ||
| */ | ||
| enum ucp_device_mem_list_elem_field { | ||
| UCP_DEVICE_MEM_LIST_ELEM_FIELD_MEMH = UCS_BIT(0), /**< Source memory handle */ | ||
| UCP_DEVICE_MEM_LIST_ELEM_FIELD_RKEY = UCS_BIT(1), /**< Unpacked remote memory key */ | ||
| UCP_DEVICE_MEM_LIST_ELEM_FIELD_RKEY = UCS_BIT(1), /**< Unpacked remote memory key (always required) */ | ||
| UCP_DEVICE_MEM_LIST_ELEM_FIELD_LOCAL_ADDR = UCS_BIT(2), /**< Local address */ | ||
| UCP_DEVICE_MEM_LIST_ELEM_FIELD_REMOTE_ADDR = UCS_BIT(3), /**< Remote address */ | ||
| UCP_DEVICE_MEM_LIST_ELEM_FIELD_REMOTE_ADDR = UCS_BIT(3), /**< Remote address */ | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. By the way, the remote address is also always required, right?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Not always: check remote_offset in partial. Users can pass NULL as the remote address and then pass the address as remote_offset. |
||
| UCP_DEVICE_MEM_LIST_ELEM_FIELD_LENGTH = UCS_BIT(4) /**< Length of the local buffer in bytes */ | ||
| }; | ||
|
|
||
|
|
@@ -48,6 +52,9 @@ enum ucp_device_mem_list_elem_field { | |
| * | ||
| * This describes a pair of local and remote memory for which a memory operation | ||
| * can later be performed multiple times, possibly with varying memory offsets. | ||
| * | ||
| * @note Counter elements can omit the @a memh and @a local_addr fields. | ||
| * Data elements must have @a memh field. | ||
michal-shalev marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| */ | ||
| typedef struct ucp_device_mem_list_elem { | ||
| /** | ||
|
|
@@ -80,6 +87,7 @@ typedef struct ucp_device_mem_list_elem { | |
|
|
||
| /** | ||
| * Unpacked memory key for the remote memory endpoint. | ||
| * Always required. | ||
| */ | ||
| ucp_rkey_h rkey; | ||
| } ucp_device_mem_list_elem_t; | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -32,6 +32,9 @@ KHASH_IMPL(ucp_device_handle_allocs, ucp_device_mem_list_handle_h, | |
| static khash_t(ucp_device_handle_allocs) ucp_device_handle_hash; | ||
| static ucs_spinlock_t ucp_device_handle_hash_lock; | ||
|
|
||
| /* Size of temporary allocation for local sys_dev detection */ | ||
| #define UCP_DEVICE_LOCAL_SYS_DEV_DETECT_SIZE 64 | ||
|
|
||
|
|
||
| void ucp_device_init(void) | ||
| { | ||
|
|
@@ -87,7 +90,65 @@ ucp_device_mem_handle_hash_remove(ucp_device_mem_list_handle_h handle) | |
| } | ||
|
|
||
| static ucs_status_t | ||
| ucp_device_mem_list_params_check(const ucp_device_mem_list_params_t *params, | ||
| ucp_device_detect_local_sys_dev(ucp_context_h context, | ||
| ucs_sys_device_t *local_sys_dev_p) | ||
| { | ||
| ucs_memory_info_t mem_info; | ||
| uct_allocated_memory_t detect_mem; | ||
| ucs_status_t status; | ||
|
|
||
| status = ucp_mem_do_alloc(context, NULL, | ||
| UCP_DEVICE_LOCAL_SYS_DEV_DETECT_SIZE, | ||
| UCT_MD_MEM_ACCESS_LOCAL_READ | | ||
| UCT_MD_MEM_ACCESS_LOCAL_WRITE, | ||
| UCS_MEMORY_TYPE_CUDA, UCS_SYS_DEVICE_ID_UNKNOWN, | ||
| "local_sys_dev_detect", &detect_mem); | ||
| if (status != UCS_OK) { | ||
| ucs_error("failed to allocate memory for sys_dev detection: %s", | ||
| ucs_status_string(status)); | ||
| return status; | ||
| } | ||
|
|
||
| ucp_memory_detect_internal(context, detect_mem.address, detect_mem.length, | ||
| &mem_info); | ||
| *local_sys_dev_p = mem_info.sys_dev; | ||
rakhmets marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| uct_mem_free(&detect_mem); | ||
|
|
||
| if (*local_sys_dev_p == UCS_SYS_DEVICE_ID_UNKNOWN) { | ||
| ucs_error("detected unknown local_sys_dev"); | ||
| return UCS_ERR_UNSUPPORTED; | ||
| } | ||
|
|
||
| ucs_trace("detected local_sys_dev=%u", *local_sys_dev_p); | ||
| return UCS_OK; | ||
| } | ||
|
|
||
| static ucp_md_map_t | ||
| ucp_device_detect_local_md_map(ucp_context_h context, | ||
rakhmets marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| ucs_sys_device_t local_sys_dev) | ||
| { | ||
| ucp_md_map_t local_md_map = 0; | ||
| ucp_md_index_t md_index; | ||
|
|
||
| /* Build MD map from MDs that can access the local_sys_dev */ | ||
| for (md_index = 0; md_index < context->num_mds; md_index++) { | ||
| ucp_sys_dev_map_t sys_dev_map = context->tl_mds[md_index].sys_dev_map; | ||
|
|
||
| if (sys_dev_map & UCS_BIT(local_sys_dev)) { | ||
| local_md_map |= UCS_BIT(md_index); | ||
| } | ||
| } | ||
|
|
||
| ucs_trace("detected local_md_map=0x%" PRIx64 " for local_sys_dev=%u", | ||
| local_md_map, local_sys_dev); | ||
| return local_md_map; | ||
| } | ||
|
|
||
|
|
||
| static ucs_status_t | ||
| ucp_device_mem_list_params_check(ucp_context_h context, | ||
| const ucp_device_mem_list_params_t *params, | ||
| ucp_worker_cfg_index_t *rkey_cfg_index, | ||
| ucs_sys_device_t *local_sys_dev, | ||
| ucp_md_map_t *local_md_map, | ||
|
|
@@ -97,6 +158,7 @@ ucp_device_mem_list_params_check(const ucp_device_mem_list_params_t *params, | |
| ucp_mem_h memh; | ||
| size_t i, num_elements, element_size; | ||
| const ucp_device_mem_list_elem_t *elements, *element; | ||
| ucs_status_t status; | ||
|
|
||
| if (params == NULL) { | ||
| return UCS_ERR_INVALID_PARAM; | ||
|
|
@@ -121,38 +183,61 @@ ucp_device_mem_list_params_check(const ucp_device_mem_list_params_t *params, | |
| RKEY, NULL); | ||
|
|
||
| /* TODO: Delegate most of checks below to proto selection */ | ||
| if ((rkey == NULL) || (memh == NULL)) { | ||
| ucs_error("element[%lu] rkey=%p, memh=%p", i, rkey, memh); | ||
| if (rkey == NULL) { | ||
| ucs_error("element[%lu] rkey is NULL", i); | ||
| return UCS_ERR_INVALID_PARAM; | ||
| } | ||
|
|
||
| if (i == 0) { | ||
| *local_sys_dev = memh->sys_dev; | ||
| *local_md_map = memh->md_map; | ||
| *mem_type = memh->mem_type; | ||
| if (memh != NULL) { | ||
| *local_sys_dev = memh->sys_dev; | ||
| *local_md_map = memh->md_map; | ||
| *mem_type = memh->mem_type; | ||
rakhmets marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| } | ||
|
|
||
| *rkey_cfg_index = rkey->cfg_index; | ||
| if (*rkey_cfg_index == UCP_WORKER_CFG_INDEX_NULL) { | ||
| ucs_debug("invalid first rkey: cfg_index=%d", *rkey_cfg_index); | ||
| return UCS_ERR_INVALID_PARAM; | ||
| } | ||
| } else { | ||
| *local_md_map &= memh->md_map; | ||
| if (rkey->cfg_index != *rkey_cfg_index) { | ||
| ucs_debug("mismatched rkey config index: " | ||
| "ucp_rkey[%lu]->cfg_index=%u cfg_index=%u", | ||
| i, rkey->cfg_index, *rkey_cfg_index); | ||
| return UCS_ERR_UNSUPPORTED; | ||
| } | ||
|
|
||
| if (memh->sys_dev != *local_sys_dev) { | ||
| ucs_debug("mismatched local sys_dev: ucp_memh[%zu].sys_dev=%u " | ||
| "first_sys_dev=%u", | ||
| i, memh->sys_dev, *local_sys_dev); | ||
| return UCS_ERR_UNSUPPORTED; | ||
| if (memh != NULL) { | ||
| if (*local_sys_dev == UCS_SYS_DEVICE_ID_UNKNOWN) { | ||
| *local_sys_dev = memh->sys_dev; | ||
| *local_md_map = memh->md_map; | ||
ofirfarjun7 marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| *mem_type = memh->mem_type; | ||
| } else { | ||
| *local_md_map &= memh->md_map; | ||
| if (memh->sys_dev != *local_sys_dev) { | ||
ofirfarjun7 marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| ucs_debug("mismatched local sys_dev: ucp_memh[%zu].sys_dev=%u " | ||
| "first_sys_dev=%u", | ||
| i, memh->sys_dev, *local_sys_dev); | ||
| return UCS_ERR_UNSUPPORTED; | ||
| } | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| /* No memh provided */ | ||
| if (*local_sys_dev == UCS_SYS_DEVICE_ID_UNKNOWN) { | ||
| status = ucp_device_detect_local_sys_dev(context, local_sys_dev); | ||
| if (status != UCS_OK) { | ||
| return status; | ||
| } | ||
|
|
||
| *local_md_map = ucp_device_detect_local_md_map(context, | ||
ofirfarjun7 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| *local_sys_dev); | ||
| *mem_type = UCS_MEMORY_TYPE_CUDA; | ||
ofirfarjun7 marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| } | ||
|
|
||
| return UCS_OK; | ||
| } | ||
|
|
||
|
|
@@ -361,13 +446,17 @@ static ucs_status_t ucp_device_mem_list_create_handle( | |
| ucp_ep_get_rsc_index(ep, lanes[i])); | ||
| ucp_element = params->elements; | ||
| for (j = 0; j < params->num_elements; j++) { | ||
| /* Local registration */ | ||
| uct_memh = ucp_element->memh->uct[local_md_index]; | ||
| ucs_assertv((ucp_element->memh->md_map & UCS_BIT(local_md_index)) != | ||
| 0, | ||
| "uct_memh=%p md_map=0x%lx local_md_index=%u", uct_memh, | ||
| ucp_element->memh->md_map, local_md_index); | ||
| ucs_assert(uct_memh != UCT_MEM_HANDLE_NULL); | ||
| if (ucp_element->memh != NULL) { | ||
|
||
| /* Local registration */ | ||
| uct_memh = ucp_element->memh->uct[local_md_index]; | ||
| ucs_assertv( | ||
| (ucp_element->memh->md_map & UCS_BIT(local_md_index)) != 0, | ||
| "uct_memh=%p md_map=0x%lx local_md_index=%u", uct_memh, | ||
| ucp_element->memh->md_map, local_md_index); | ||
| ucs_assert(uct_memh != UCT_MEM_HANDLE_NULL); | ||
| } else { | ||
| uct_memh = UCT_MEM_HANDLE_NULL; | ||
| } | ||
|
|
||
| /* Remote registration */ | ||
| rkey_index = | ||
|
|
@@ -423,9 +512,9 @@ ucp_device_mem_list_create(ucp_ep_h ep, | |
| } | ||
|
|
||
| /* Parameter sanity checks and extraction */ | ||
| status = ucp_device_mem_list_params_check(params, &rkey_cfg_index, | ||
| &local_sys_dev, &local_md_map, | ||
| &mem_type); | ||
| status = ucp_device_mem_list_params_check(ep->worker->context, params, | ||
| &rkey_cfg_index, &local_sys_dev, | ||
| &local_md_map, &mem_type); | ||
| if (status != UCS_OK) { | ||
| return status; | ||
| } | ||
|
|
@@ -524,3 +613,4 @@ uint64_t ucp_device_counter_read(ucp_worker_h worker, | |
| sizeof(counter_value), mem_type); | ||
| return counter_value; | ||
| } | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.