Skip to content
Merged
Show file tree
Hide file tree
Changes from 23 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
06c380a
UCP/DEVICE: Make memh and local_addr optional for counter operations
michal-shalev Oct 11, 2025
a494c52
Merge branch 'master' into optional-memh-localaddr-counter-ops
michal-shalev Oct 13, 2025
60a1131
UCP/DEVICE: PR fixes
michal-shalev Oct 13, 2025
6a3eb78
UCP/DEVICE: PR fixes 2.0
michal-shalev Oct 13, 2025
205b01e
UCP/DEVICE: PR fixes 3.0
michal-shalev Oct 13, 2025
589b0b4
UCP/DEVICE: PR fixes 4.0
michal-shalev Oct 13, 2025
4573621
UCP/DEVICE: PR fixes 5.0
michal-shalev Oct 13, 2025
b0eff73
UCP/DEVICE: Update perftest
michal-shalev Oct 13, 2025
10646cd
UCP/DEVICE: Update perftest 2.0
michal-shalev Oct 13, 2025
534057b
UCP/DEVICE: PR fixes 6.0
michal-shalev Oct 13, 2025
dc11fd6
UCP/DEVICE: CI fix
michal-shalev Oct 13, 2025
c718cec
UCP/DEVICE: PR fixes 7.0
michal-shalev Oct 13, 2025
2288f1b
UCP/DEVICE: PR fixes 8.0
michal-shalev Oct 15, 2025
b1f2ad0
UCP/DEVICE: PR fixes 9.0
michal-shalev Oct 15, 2025
5c3b023
UCP/DEVICE: PR fixes 10.0
michal-shalev Oct 16, 2025
06a662e
UCP/DEVICE: PR fixes 11.0
michal-shalev Oct 20, 2025
777d06a
UCP/DEVICE: PR fixes 12.0
michal-shalev Oct 20, 2025
62e6732
UCP/DEVICE: PR fixes 13.0
michal-shalev Oct 20, 2025
8219c47
UCP/DEVICE: Fix code style
michal-shalev Oct 20, 2025
025a1a0
UCP/DEVICE: PR fixes 14.0
michal-shalev Oct 20, 2025
e46a302
UCP/DEVICE: Fix documentation
michal-shalev Oct 20, 2025
c70739c
UCP/DEVICE: Add ucp_device_detect_uct_memh
michal-shalev Oct 20, 2025
af75f5c
UCP/DEVICE: Add tests
michal-shalev Oct 20, 2025
c967767
UCP/DEVICE: PR fixes 15.0
michal-shalev Oct 22, 2025
c444251
UCP/DEVICE: PR fixes 16.0
michal-shalev Oct 22, 2025
a07231c
Merge branch 'master' into optional-memh-localaddr-counter-ops
michal-shalev Oct 22, 2025
1f0d285
UCP/DEVICE: PR fixes 17.0
michal-shalev Oct 25, 2025
0d30594
UCP/DEVICE: PR fixes 18.0
michal-shalev Oct 25, 2025
e9c1382
UCP/DEVICE: PR fixes 19.0
michal-shalev Oct 26, 2025
2bbfa15
UCP/DEVICE: PR fixes 20.0
michal-shalev Oct 26, 2025
438346f
UCP/DEVICE: PR fixes 21.0
michal-shalev Oct 26, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 36 additions & 21 deletions src/tools/perf/cuda/ucp_cuda_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -109,28 +109,41 @@ public:
const ucp_perf_cuda_params &get_params() const { return m_params; }

private:
/**
 * Tell whether the configured perf command uses a counter element.
 *
 * Every command except UCX_PERF_CMD_PUT_SINGLE is treated as having a
 * counter; callers in this class reserve one extra trailing element for it
 * when this returns true.
 *
 * @param [in] perf  Perf context whose params.command is inspected.
 *
 * @return true if the command is not UCX_PERF_CMD_PUT_SINGLE, false otherwise.
 */
static bool has_counter(const ucx_perf_context_t &perf)
{
    const bool is_put_single =
            (perf.params.command == UCX_PERF_CMD_PUT_SINGLE);

    return !is_put_single;
}

void init_mem_list(const ucx_perf_context_t &perf)
{
/* +1 for the counter */
size_t count = perf.params.msg_size_cnt + 1;
size_t offset = 0;
size_t data_count = perf.params.msg_size_cnt;
size_t count = data_count + (has_counter(perf) ? 1 : 0);
size_t offset = 0;
ucp_device_mem_list_elem_t elems[count];

for (size_t i = 0; i < count; ++i) {
elems[i].field_mask = UCP_DEVICE_MEM_LIST_ELEM_FIELD_MEMH |
UCP_DEVICE_MEM_LIST_ELEM_FIELD_RKEY |
UCP_DEVICE_MEM_LIST_ELEM_FIELD_LOCAL_ADDR |
UCP_DEVICE_MEM_LIST_ELEM_FIELD_REMOTE_ADDR |
UCP_DEVICE_MEM_LIST_ELEM_FIELD_LENGTH;
elems[i].memh = perf.ucp.send_memh;
elems[i].rkey = perf.ucp.rkey;
elems[i].local_addr = UCS_PTR_BYTE_OFFSET(perf.send_buffer, offset);
for (size_t i = 0; i < data_count; ++i) {
elems[i].field_mask = UCP_DEVICE_MEM_LIST_ELEM_FIELD_MEMH |
UCP_DEVICE_MEM_LIST_ELEM_FIELD_RKEY |
UCP_DEVICE_MEM_LIST_ELEM_FIELD_LOCAL_ADDR |
UCP_DEVICE_MEM_LIST_ELEM_FIELD_REMOTE_ADDR |
UCP_DEVICE_MEM_LIST_ELEM_FIELD_LENGTH;
elems[i].memh = perf.ucp.send_memh;
elems[i].rkey = perf.ucp.rkey;
elems[i].local_addr = UCS_PTR_BYTE_OFFSET(perf.send_buffer, offset);
elems[i].remote_addr = perf.ucp.remote_addr + offset;
elems[i].length = (i == count - 1) ? ONESIDED_SIGNAL_SIZE :
perf.params.msg_size_list[i];
elems[i].length = perf.params.msg_size_list[i];
offset += elems[i].length;
}

if (has_counter(perf)) {
elems[data_count].field_mask = UCP_DEVICE_MEM_LIST_ELEM_FIELD_RKEY |
UCP_DEVICE_MEM_LIST_ELEM_FIELD_REMOTE_ADDR |
UCP_DEVICE_MEM_LIST_ELEM_FIELD_LENGTH;
elems[data_count].rkey = perf.ucp.rkey;
elems[data_count].remote_addr = perf.ucp.remote_addr + offset;
elems[data_count].length = ONESIDED_SIGNAL_SIZE;
}

ucp_device_mem_list_params_t params;
params.field_mask = UCP_DEVICE_MEM_LIST_PARAMS_FIELD_ELEMENTS |
UCP_DEVICE_MEM_LIST_PARAMS_FIELD_ELEMENT_SIZE |
Expand All @@ -148,20 +161,22 @@ private:

void init_elements(const ucx_perf_context_t &perf)
{
/* +1 for the counter */
size_t count = perf.params.msg_size_cnt + 1;
size_t offset = 0;
size_t data_count = perf.params.msg_size_cnt;
size_t count = data_count + (has_counter(perf) ? 1 : 0);

std::vector<unsigned> indices(count);
std::vector<size_t> local_offsets(count, 0);
std::vector<size_t> remote_offsets(count, 0);
std::vector<size_t> lengths(count);

for (unsigned i = 0; i < count; ++i) {
for (unsigned i = 0; i < data_count; ++i) {
indices[i] = i;
lengths[i] = (i == count - 1) ? ONESIDED_SIGNAL_SIZE :
perf.params.msg_size_list[i];
offset += lengths[i];
lengths[i] = perf.params.msg_size_list[i];
}

if (has_counter(perf)) {
indices[data_count] = data_count;
lengths[data_count] = ONESIDED_SIGNAL_SIZE;
}

device_clone(&m_params.indices, indices.data(), count);
Expand Down
10 changes: 6 additions & 4 deletions src/ucp/api/device/ucp_device_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -224,9 +224,11 @@ UCS_F_DEVICE ucs_status_t ucp_device_counter_inc(
* This operation can be polled on the receiver to detect completion of all the
* operations of the batch, started during the same routine call.
*
* The last entry in the descriptor list contains
* the remote memory registration descriptors to be used for the increment
* operation.
* All elements except the last one are data elements; each of them must set
* every field listed in @ref ucp_device_mem_list_elem_field of its
* @ref ucp_device_mem_list_elem_t.
*
* The last entry in the descriptor list contains the remote memory
* registration descriptors to be used for the increment operation.
*
* The routine returns a request that can be progressed and checked for
* completion with @ref ucp_device_progress_req.
Expand Down Expand Up @@ -410,7 +412,7 @@ UCS_F_DEVICE void ucp_device_counter_write(void *counter_ptr, uint64_t value)
*
* @tparam level Level of cooperation of the transfer.
* @param [in] req Request containing operations in progress and channel to progress.
*
*
* @return UCS_OK - The request has completed, no more operations are
* in progress.
* @return UCS_INPROGRESS - One or more operations in the request batch
Expand Down
13 changes: 11 additions & 2 deletions src/ucp/api/device/ucp_host.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,13 +31,18 @@ BEGIN_C_DECLS
* The enumeration allows specifying which fields in @ref
* ucp_device_mem_list_elem are present.
*
* @note Counter elements can omit the @a UCP_DEVICE_MEM_LIST_ELEM_FIELD_MEMH
* and @a UCP_DEVICE_MEM_LIST_ELEM_FIELD_LOCAL_ADDR fields.
* Data elements must have either @a UCP_DEVICE_MEM_LIST_ELEM_FIELD_MEMH
* or @a UCP_DEVICE_MEM_LIST_ELEM_FIELD_LOCAL_ADDR field.
*
* It is used to enable backward compatibility support.
*/
enum ucp_device_mem_list_elem_field {
UCP_DEVICE_MEM_LIST_ELEM_FIELD_MEMH = UCS_BIT(0), /**< Source memory handle */
UCP_DEVICE_MEM_LIST_ELEM_FIELD_RKEY = UCS_BIT(1), /**< Unpacked remote memory key */
UCP_DEVICE_MEM_LIST_ELEM_FIELD_RKEY = UCS_BIT(1), /**< Unpacked remote memory key (always required) */
UCP_DEVICE_MEM_LIST_ELEM_FIELD_LOCAL_ADDR = UCS_BIT(2), /**< Local address */
UCP_DEVICE_MEM_LIST_ELEM_FIELD_REMOTE_ADDR = UCS_BIT(3), /**< Remote address */
UCP_DEVICE_MEM_LIST_ELEM_FIELD_REMOTE_ADDR = UCS_BIT(3), /**< Remote address */
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

btw, it is also always required, right?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not always: check remote_offset in the partial API. Users can pass NULL as the remote address and then supply the address as remote_offset.

UCP_DEVICE_MEM_LIST_ELEM_FIELD_LENGTH = UCS_BIT(4) /**< Length of the local buffer in bytes */
};

Expand All @@ -48,6 +53,9 @@ enum ucp_device_mem_list_elem_field {
*
* This describes a pair of local and remote memory for which a memory operation
* can later be performed multiple times, possibly with varying memory offsets.
*
* @note Counter elements can omit the @a memh and @a local_addr fields.
* Data elements must have either @a memh or @a local_addr field.
*/
typedef struct ucp_device_mem_list_elem {
/**
Expand Down Expand Up @@ -80,6 +88,7 @@ typedef struct ucp_device_mem_list_elem {

/**
* Unpacked memory key for the remote memory endpoint.
* Always required.
*/
ucp_rkey_h rkey;
} ucp_device_mem_list_elem_t;
Expand Down
Loading
Loading