From 2622a8e9cc3a427f000faf1719246564b35fefbb Mon Sep 17 00:00:00 2001 From: Adit Ranadive Date: Mon, 6 Oct 2025 21:19:23 -0700 Subject: [PATCH 01/17] chore: Include tests and examples in release builds Let's build these for release builds. In release builds there is some optimization added that might help with running tests and seeing performance numbers. The asserts are removed in release builds so a new macro was added to check for return statuses of the API calls. Signed-off-by: Adit Ranadive --- examples/cpp/nixl_etcd_example.cpp | 34 ++++- examples/cpp/nixl_example.cpp | 46 +++++- meson.build | 7 +- test/nixl/agent_example.cpp | 136 +++++++++++++++--- test/nixl/nixl_test.cpp | 19 ++- test/unit/plugins/ucx/ucx_backend_multi.cpp | 11 ++ test/unit/plugins/ucx/ucx_backend_test.cpp | 50 +++++-- .../plugins/ucx_mo/ucx_mo_backend_test.cpp | 53 +++++-- test/unit/utils/common/map_perf.cpp | 17 +++ 9 files changed, 327 insertions(+), 46 deletions(-) diff --git a/examples/cpp/nixl_etcd_example.cpp b/examples/cpp/nixl_etcd_example.cpp index 1403a30c7..13f96c0dc 100644 --- a/examples/cpp/nixl_etcd_example.cpp +++ b/examples/cpp/nixl_etcd_example.cpp @@ -22,6 +22,15 @@ #include "nixl.h" +#define CHECK_NIXL_ERROR(result, message, agent) \ + do { \ + if (0 != result) { \ + std::cerr << "NIXL: " << message << " for agent " << agent \ + << " (Error code: " << result << ")" << std::endl; \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + // Change these values to match your etcd setup const std::string ETCD_ENDPOINT = "http://localhost:2379"; const std::string AGENT1_NAME = "EtcdAgent1"; @@ -132,6 +141,7 @@ int main() { ret1 = A1.getAvailPlugins(plugins); assert (ret1 == NIXL_SUCCESS); + CHECK_NIXL_ERROR(ret1, "Failed to get available plugins", AGENT1_NAME); std::cout << "Available plugins:\n"; for (nixl_backend_t b: plugins) @@ -143,6 +153,9 @@ int main() { assert (ret1 == NIXL_SUCCESS); assert (ret2 == NIXL_SUCCESS); + CHECK_NIXL_ERROR(ret1, "Failed to get plugin params for UCX", 
AGENT1_NAME); + CHECK_NIXL_ERROR(ret2, "Failed to get plugin params for UCX", AGENT2_NAME); + std::cout << "Params before init:\n"; printParams(init1, mems1); printParams(init2, mems2); @@ -155,12 +168,18 @@ int main() { assert (ret1 == NIXL_SUCCESS); assert (ret2 == NIXL_SUCCESS); + CHECK_NIXL_ERROR(ret1, "Failed to create UCX backend", AGENT1_NAME); + CHECK_NIXL_ERROR(ret2, "Failed to create UCX backend", AGENT2_NAME); + ret1 = A1.getBackendParams(ucx1, mems1, init1); ret2 = A2.getBackendParams(ucx2, mems2, init2); assert (ret1 == NIXL_SUCCESS); assert (ret2 == NIXL_SUCCESS); + CHECK_NIXL_ERROR(ret1, "Failed to get UCX backend params", AGENT1_NAME); + CHECK_NIXL_ERROR(ret2, "Failed to get UCX backend params", AGENT2_NAME); + std::cout << "Params after init:\n"; printParams(init1, mems1); printParams(init2, mems2); @@ -168,8 +187,10 @@ int main() { // Register memory with both agents status = registerMemory(&addr1, &A1, &dlist1, &extra_params1, ucx1, 0xaa); assert(status == NIXL_SUCCESS); + CHECK_NIXL_ERROR(status, "Failed to register memory", AGENT1_NAME); status = registerMemory(&addr2, &A2, &dlist2, &extra_params2, ucx2, 0xbb); assert(status == NIXL_SUCCESS); + CHECK_NIXL_ERROR(ret2, "Failed to register memory", AGENT2_NAME); std::cout << "\nEtcd Metadata Exchange Demo\n"; std::cout << "==========================\n"; @@ -180,9 +201,11 @@ int main() { // Both agents send their metadata to etcd status = A1.sendLocalMD(); assert(status == NIXL_SUCCESS); + CHECK_NIXL_ERROR(status, "Failed to send local MD", AGENT1_NAME); status = A2.sendLocalMD(); assert(status == NIXL_SUCCESS); + CHECK_NIXL_ERROR(status, "Failed to send local MD", AGENT2_NAME); // Give etcd time to process std::this_thread::sleep_for(std::chrono::seconds(1)); @@ -193,10 +216,12 @@ int main() { // Agent1 fetches metadata for Agent2 status = A1.fetchRemoteMD(AGENT2_NAME); assert(status == NIXL_SUCCESS); + CHECK_NIXL_ERROR(ret1, "Failed to fetch remote MD", AGENT1_NAME); // Agent2 fetches metadata 
for Agent1 status = A2.fetchRemoteMD(AGENT1_NAME); assert(status == NIXL_SUCCESS); + CHECK_NIXL_ERROR(ret1, "Failed to fetch remote MD", AGENT2_NAME); // Do transfer from Agent 1 to Agent 2 size_t req_size = 8; @@ -229,6 +254,8 @@ int main() { extra_params1.hasNotif = true; ret1 = A1.createXferReq(NIXL_WRITE, req_src_descs, req_dst_descs, AGENT2_NAME, req_handle, &extra_params1); std::cout << "Xfer request created, status: " << nixlEnumStrings::statusStr(ret1) << std::endl; + assert(ret1 == NIXL_SUCCESS); + CHECK_NIXL_ERROR(ret1, "Failed to create Xfer Req", AGENT1_NAME); status = A1.postXferReq(req_handle); @@ -242,6 +269,8 @@ int main() { if (n_notifs == 0) ret2 = A2.getNotifs(notif_map); assert (status >= 0); assert (ret2 == NIXL_SUCCESS); + CHECK_NIXL_ERROR((status < 0), "Failed to post Xfer Req", AGENT1_NAME); + CHECK_NIXL_ERROR(ret2, "Failed to get notifs", AGENT2_NAME); n_notifs = notif_map.size(); } @@ -250,11 +279,14 @@ int main() { ret1 = A1.releaseXferReq(req_handle); assert (ret1 == NIXL_SUCCESS); + CHECK_NIXL_ERROR(ret1, "Failed to release Xfer Req", AGENT1_NAME); ret1 = A1.deregisterMem(dlist1, &extra_params1); ret2 = A2.deregisterMem(dlist2, &extra_params2); assert (ret1 == NIXL_SUCCESS); assert (ret2 == NIXL_SUCCESS); + CHECK_NIXL_ERROR(ret1, "Failed to deregister memory", AGENT1_NAME); + CHECK_NIXL_ERROR(ret2, "Failed to deregister memory", AGENT2_NAME); // 3. Partial Metadata Exchange std::cout << "\n3. Sending partial metadata to etcd...\n"; @@ -301,7 +333,7 @@ int main() { // 4. Invalidate Metadata std::cout << "\n4. 
Invalidating metadata in etcd...\n"; - // Invalidate agent1's metadata + // Invalidate AGENT1_NAME's metadata status = A1.invalidateLocalMD(); assert(status == NIXL_SUCCESS); diff --git a/examples/cpp/nixl_example.cpp b/examples/cpp/nixl_example.cpp index e7eaf1073..9d93a4461 100644 --- a/examples/cpp/nixl_example.cpp +++ b/examples/cpp/nixl_example.cpp @@ -22,6 +22,15 @@ #include "nixl.h" +#define CHECK_NIXL_ERROR(result, message, agent) \ + do { \ + if (0 != result) { \ + std::cerr << "NIXL: " << message << " for agent " << agent \ + << " (Error code: " << result << ")" << std::endl; \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + std::string agent1("Agent001"); std::string agent2("Agent002"); @@ -30,6 +39,10 @@ void check_buf(void* buf, size_t len) { // Do some checks on the data. for(size_t i = 0; i= 0); assert (ret2 == NIXL_SUCCESS); + CHECK_NIXL_ERROR((status < 0), "Failed to post Xfer Req", agent1); + CHECK_NIXL_ERROR(ret2, "Failed to get notifs", agent2); n_notifs = notif_map.size(); } std::vector agent1_notifs = notif_map[agent1]; assert (agent1_notifs.size() == 1); assert (agent1_notifs.front() == "notification"); + + CHECK_NIXL_ERROR((agent1_notifs.size() != 1), "Incorrect notif size", agent1); + CHECK_NIXL_ERROR((agent1_notifs.front() != "notification"), "Incorrect notification", agent1); notif_map[agent1].clear(); // Redundant, for testing notif_map.clear(); n_notifs = 0; @@ -233,15 +270,22 @@ main(int argc, char **argv) { ret1 = A1.releaseXferReq(req_handle); assert (ret1 == NIXL_SUCCESS); + CHECK_NIXL_ERROR(ret1, "Failed to release Xfer Req", agent1); + ret1 = A1.deregisterMem(dlist1, &extra_params1); ret2 = A2.deregisterMem(dlist2, &extra_params2); assert (ret1 == NIXL_SUCCESS); assert (ret2 == NIXL_SUCCESS); + CHECK_NIXL_ERROR(ret1, "Failed to deregister memory", agent1); + CHECK_NIXL_ERROR(ret2, "Failed to deregister memory", agent2); + //only initiator should call invalidate ret1 = A1.invalidateRemoteMD(agent2); assert (ret1 == NIXL_SUCCESS); + 
CHECK_NIXL_ERROR(ret1, "Failed to invalidate remote MD", agent1); + free(addr1); free(addr2); diff --git a/meson.build b/meson.build index 8a7be09c9..658098ea3 100644 --- a/meson.build +++ b/meson.build @@ -235,11 +235,8 @@ plugins_inc_dirs = include_directories('src/plugins') utils_inc_dirs = include_directories('src/utils') subdir('src') - -if get_option('buildtype') != 'release' - subdir('test') - subdir('examples') -endif +subdir('test') +subdir('examples') if get_option('install_headers') install_headers('src/api/cpp/nixl.h', install_dir: prefix_inc) diff --git a/test/nixl/agent_example.cpp b/test/nixl/agent_example.cpp index 5c4c0d206..774f139fa 100644 --- a/test/nixl/agent_example.cpp +++ b/test/nixl/agent_example.cpp @@ -22,6 +22,15 @@ #include "nixl.h" +#define CHECK_NIXL_ERROR(result, message, agent) \ + do { \ + if (0 != result) { \ + std::cerr << "NIXL: " << message << " for agent " << agent \ + << " (Error code: " << result << ")" << std::endl; \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + std::string agent1("Agent001"); std::string agent2("Agent002"); @@ -30,6 +39,10 @@ void check_buf(void* buf, size_t len) { // Do some checks on the data. 
for(size_t i = 0; iregisterMem(mem_list1, &extra_params1); assert (status == NIXL_SUCCESS); + CHECK_NIXL_ERROR(status, "Failed to register memory", agent1); status = A2->registerMem(mem_list2, &extra_params2); assert (status == NIXL_SUCCESS); + CHECK_NIXL_ERROR(status, "Failed to register memory", agent2); + std::string meta2; status = A2->getLocalMD(meta2); assert (status == NIXL_SUCCESS); assert (meta2.size() > 0); + CHECK_NIXL_ERROR(status, "Failed to get local MD", agent2); std::string remote_name; status = A1->loadRemoteMD(meta2, remote_name); assert (status == NIXL_SUCCESS); assert (remote_name == agent2); + CHECK_NIXL_ERROR(status, "Failed to local remote MD", agent1); + CHECK_NIXL_ERROR((remote_name != agent2), "Incorrect remote MD received", agent1); + std::cout << "perf setup done\n"; gettimeofday(&start_time, NULL); @@ -110,9 +130,10 @@ void test_side_perf(nixlAgent* A1, nixlAgent* A2, nixlBackendH* backend, nixlBac for(int i = 0; iprepXferDlist(agent2, dst_list, dst_side[i], &extra_params1); assert (status == NIXL_SUCCESS); - + CHECK_NIXL_ERROR(status, "Failed to prep Xfer Dlist for dest", agent1); status = A1->prepXferDlist(NIXL_INIT_AGENT, src_list, src_side[i], &extra_params1); assert (status == NIXL_SUCCESS); + CHECK_NIXL_ERROR(status, "Failed to pre Xfer Dlist for src", agent1); } gettimeofday(&end_time, NULL); @@ -138,6 +159,7 @@ void test_side_perf(nixlAgent* A1, nixlAgent* A2, nixlBackendH* backend, nixlBac extra_params1.hasNotif = true; status = A1->makeXferReq(NIXL_WRITE, src_side[0], indices, dst_side[0], indices, reqh1, &extra_params1); assert (status == NIXL_SUCCESS); + CHECK_NIXL_ERROR(status, "Failed to make Xfer Req", agent1); indices.clear(); for(int i = 0; i<(n_mems*descs_per_mem); i+=2) @@ -146,21 +168,26 @@ void test_side_perf(nixlAgent* A1, nixlAgent* A2, nixlBackendH* backend, nixlBac //should print (n_mems*descs_per_mem/2) number of final descriptors status = A1->makeXferReq(NIXL_WRITE, src_side[0], indices, dst_side[0], indices, 
reqh2, &extra_params1); assert (status == NIXL_SUCCESS); + CHECK_NIXL_ERROR(status, "Failed to make Xfer Req", agent1); status = A1->releaseXferReq(reqh1); assert (status == NIXL_SUCCESS); + CHECK_NIXL_ERROR(status, "Failed to release Xfer Req", agent1); status = A1->releaseXferReq(reqh2); assert (status == NIXL_SUCCESS); + CHECK_NIXL_ERROR(status, "Failed to release Xfer Req2", agent1); // Commented out to test auto deregistration // status = A1->deregisterMem(mem_list1, &extra_params1); // assert (status == NIXL_SUCCESS); status = A2->deregisterMem(mem_list2, &extra_params2); assert (status == NIXL_SUCCESS); + CHECK_NIXL_ERROR(status, "Failed to deregister memory", agent2); for(int i = 0; ireleasedDlistH(src_side[i]); assert (status == NIXL_SUCCESS); + CHECK_NIXL_ERROR(status, "Failed to release Dlist handle", agent1); status = A1->releasedDlistH(dst_side[i]); assert (status == NIXL_SUCCESS); } @@ -211,8 +238,10 @@ nixl_status_t partialMdTest(nixlAgent* A1, nixlAgent* A2, nixlBackendH* backend1 status = A1->registerMem(src_mem_lists[update], &extra_params1); assert(status == NIXL_SUCCESS); + CHECK_NIXL_ERROR(status, "Failed to register memory", agent1); status = A2->registerMem(dst_mem_lists[update], &extra_params2); assert(status == NIXL_SUCCESS); + CHECK_NIXL_ERROR(status, "Failed to register memory", agent2); } // Test metadata update with only backends and empty descriptor list @@ -220,18 +249,23 @@ nixl_status_t partialMdTest(nixlAgent* A1, nixlAgent* A2, nixlBackendH* backend1 // Agent2 might have already been previously loaded. // Invalidate it just in case but don't care either way. 
- A1->invalidateRemoteMD(agent2); + status = A1->invalidateRemoteMD(agent2); + CHECK_NIXL_ERROR(status, "Failed to invalidate remote MD", agent1); nixl_reg_dlist_t empty_dlist(DRAM_SEG); std::string partial_meta; status = A2->getLocalPartialMD(empty_dlist, partial_meta, NULL); assert(status == NIXL_SUCCESS); assert(partial_meta.size() > 0); + CHECK_NIXL_ERROR(status, "Failed to get local partial MD", agent2); + CHECK_NIXL_ERROR((partial_meta.size() <= 0), "Incorrect local partial MD", agent2); std::string remote_name; status = A1->loadRemoteMD(partial_meta, remote_name); assert(status == NIXL_SUCCESS); assert(remote_name == agent2); + CHECK_NIXL_ERROR(status, "Failed to get load remote MD", agent1); + CHECK_NIXL_ERROR((remote_name != agent2), "Incorrect remote MD", agent1); // Make sure unregistered descriptors are not updated for (int update = 0; update < NUM_UPDATES; update++) { @@ -239,11 +273,14 @@ nixl_status_t partialMdTest(nixlAgent* A1, nixlAgent* A2, nixlBackendH* backend1 status = A1->prepXferDlist(agent2, dst_mem_lists[update].trim(), dst_side, &extra_params1); assert(status != NIXL_SUCCESS); assert(dst_side == nullptr); + CHECK_NIXL_ERROR(status, "Prep xfer dlist should not be successful", agent1); + CHECK_NIXL_ERROR((dst_side != nullptr), "Dst side is not null", agent1); } // Invalidate remote agent metadata to make sure we received connection info status = A1->invalidateRemoteMD(agent2); assert(status == NIXL_SUCCESS); + CHECK_NIXL_ERROR(status, "Failed to get invalidate remote MD", agent1); std::cout << "Metadata update - backends only completed\n"; // Main test loop - update metadata multiple times @@ -258,24 +295,31 @@ nixl_status_t partialMdTest(nixlAgent* A1, nixlAgent* A2, nixlBackendH* backend1 status = A2->getLocalPartialMD(dst_mem_lists[update], partial_meta, &extra_params2); assert(status == NIXL_SUCCESS); assert(partial_meta.size() > 0); + CHECK_NIXL_ERROR(status, "Failed to get local partial MD", agent2); + 
CHECK_NIXL_ERROR((partial_meta.size() <= 0), "Incorrect local partial MD", agent2); // Load the partial metadata into A1 std::string remote_name; status = A1->loadRemoteMD(partial_meta, remote_name); assert(status == NIXL_SUCCESS); assert(remote_name == agent2); + CHECK_NIXL_ERROR(status, "Failed to load remote MD", agent1); + CHECK_NIXL_ERROR((remote_name != agent2), "Incorrect remote MD", agent1); // Make sure loaded descriptors are updated nixlDlistH *dst_side; status = A1->prepXferDlist(agent2, dst_mem_lists[update].trim(), dst_side, &extra_params1); assert(status == NIXL_SUCCESS); assert(dst_side != nullptr); - + CHECK_NIXL_ERROR(status, "Failed to prep xfer dlist", agent1); + CHECK_NIXL_ERROR((dst_side == nullptr), "Dst side is null", agent1); // Make sure not-loaded descriptors are not updated for (int invalid_idx = update + 1; invalid_idx < NUM_UPDATES; invalid_idx++) { status = A1->prepXferDlist(agent2, dst_mem_lists[invalid_idx].trim(), dst_side, &extra_params1); assert(status != NIXL_SUCCESS); assert(dst_side == nullptr); + CHECK_NIXL_ERROR(status, "Prep xfer dlist should not be successful", agent1); + CHECK_NIXL_ERROR((dst_side != nullptr), "Dst side is not null", agent1); } std::cout << "Metadata update #" << update << " completed\n"; } @@ -298,9 +342,12 @@ nixl_status_t partialMdTest(nixlAgent* A1, nixlAgent* A2, nixlBackendH* backend1 status = A1->prepXferDlist(NIXL_INIT_AGENT, src_xfer_list, src_side, &extra_params1); assert(status == NIXL_SUCCESS); - + CHECK_NIXL_ERROR(status, "Failed to prep xfer dlist", agent1); + CHECK_NIXL_ERROR((src_side == nullptr), "Src side is null", agent1); status = A1->prepXferDlist(agent2, dst_xfer_list, dst_side, &extra_params1); assert(status == NIXL_SUCCESS); + CHECK_NIXL_ERROR(status, "Failed to prep xfer dlist", agent1); + CHECK_NIXL_ERROR((dst_side == nullptr), "Dst side is null", agent1); std::cout << "Transfer preparation completed\n"; @@ -317,13 +364,14 @@ nixl_status_t partialMdTest(nixlAgent* A1, nixlAgent* 
A2, nixlBackendH* backend1 // Create and post the transfer request status = A1->makeXferReq(NIXL_WRITE, src_side, indices, dst_side, indices, req, &extra_params1); assert(status == NIXL_SUCCESS); - + CHECK_NIXL_ERROR(status, "Failed to make xfer req", agent1); nixl_status_t xfer_status = A1->postXferReq(req); // Wait for transfer completion while (xfer_status != NIXL_SUCCESS) { if (xfer_status != NIXL_SUCCESS) xfer_status = A1->getXferStatus(req); assert (xfer_status >= 0); + CHECK_NIXL_ERROR(xfer_status, "Failed to get xfer status", agent1); } // Verify transfer results @@ -338,20 +386,21 @@ nixl_status_t partialMdTest(nixlAgent* A1, nixlAgent* A2, nixlBackendH* backend1 // Cleanup status = A1->releaseXferReq(req); assert(status == NIXL_SUCCESS); - + CHECK_NIXL_ERROR(status, "Failed to release xfer req", agent1); status = A1->releasedDlistH(src_side); assert(status == NIXL_SUCCESS); - + CHECK_NIXL_ERROR(status, "Failed to release xfer dlist", agent1); status = A1->releasedDlistH(dst_side); assert(status == NIXL_SUCCESS); - + CHECK_NIXL_ERROR(status, "Failed to release xfer dlist", agent1); // Deregister memory for (int update = 0; update < NUM_UPDATES; update++) { status = A1->deregisterMem(src_mem_lists[update], &extra_params1); assert(status == NIXL_SUCCESS); - + CHECK_NIXL_ERROR(status, "Failed to deregister memory", agent1); status = A2->deregisterMem(dst_mem_lists[update], &extra_params2); assert(status == NIXL_SUCCESS); + CHECK_NIXL_ERROR(status, "Failed to deregister memory", agent2); } // Free allocated memory @@ -379,6 +428,9 @@ nixl_status_t sideXferTest(nixlAgent* A1, nixlAgent* A2, nixlXferReqH* src_handl assert (status == NIXL_SUCCESS); assert (src_backend); + CHECK_NIXL_ERROR(status, "Failed to query xfer backend", agent1); + CHECK_NIXL_ERROR((src_backend == nullptr), "Incorrect src backend handle", agent1); + std::cout << "Got backend\n"; test_side_perf(A1, A2, src_backend, dst_backend); @@ -413,30 +465,34 @@ nixl_status_t sideXferTest(nixlAgent* 
A1, nixlAgent* A2, nixlXferReqH* src_handl status = A1->registerMem(mem_list1, &extra_params1); assert (status == NIXL_SUCCESS); - + CHECK_NIXL_ERROR(status, "Failed to register memory", agent1); status = A2->registerMem(mem_list2, &extra_params2); assert (status == NIXL_SUCCESS); - + CHECK_NIXL_ERROR(status, "Failed to register memory", agent2); std::string meta2; status = A2->getLocalMD(meta2); assert (status == NIXL_SUCCESS); assert (meta2.size() > 0); - + CHECK_NIXL_ERROR(status, "Failed to get local MD", agent2); + CHECK_NIXL_ERROR((meta2.size() <= 0), "Incorrect local MD", agent2); std::string remote_name; status = A1->loadRemoteMD(meta2, remote_name); assert (status == NIXL_SUCCESS); assert (remote_name == agent2); - + CHECK_NIXL_ERROR(status, "Failed to load remote MD", agent1); + CHECK_NIXL_ERROR((remote_name != agent2), "Incorrect remote MD", agent1); std::cout << "Ready to prepare side\n"; nixlDlistH *src_side, *dst_side; status = A1->prepXferDlist(NIXL_INIT_AGENT, src_list, src_side, &extra_params1); assert (status == NIXL_SUCCESS); - + CHECK_NIXL_ERROR(status, "Failed to prep xfer dlist", agent1); + CHECK_NIXL_ERROR((src_side == nullptr), "Src side is null", agent1); status = A1->prepXferDlist(remote_name, dst_list, dst_side, &extra_params1); assert (status == NIXL_SUCCESS); - + CHECK_NIXL_ERROR(status, "Failed to prep xfer dlist", agent1); + CHECK_NIXL_ERROR((dst_side == nullptr), "Dst side is null", agent1); std::cout << "prep done, starting transfers\n"; std::vector indices1, indices2; @@ -454,12 +510,13 @@ nixl_status_t sideXferTest(nixlAgent* A1, nixlAgent* A2, nixlXferReqH* src_handl //write first half of src_bufs to dst_bufs status = A1->makeXferReq(NIXL_WRITE, src_side, indices1, dst_side, indices1, req1, &extra_params1); assert (status == NIXL_SUCCESS); - + CHECK_NIXL_ERROR(status, "Failed to make xfer req", agent1); nixl_status_t xfer_status = A1->postXferReq(req1); while (xfer_status != NIXL_SUCCESS) { if (xfer_status != NIXL_SUCCESS) 
xfer_status = A1->getXferStatus(req1); assert (xfer_status >= 0); + CHECK_NIXL_ERROR(xfer_status, "Failed to get xfer status", agent1); } for(int i = 0; i<(n_bufs/2); i++) @@ -470,12 +527,13 @@ nixl_status_t sideXferTest(nixlAgent* A1, nixlAgent* A2, nixlXferReqH* src_handl //read first half of dst_bufs back to second half of src_bufs status = A1->makeXferReq(NIXL_READ, src_side, indices2, dst_side, indices1, req2, &extra_params1); assert (status == NIXL_SUCCESS); - + CHECK_NIXL_ERROR(status, "Failed to make xfer req", agent1); xfer_status = A1->postXferReq(req2); while (xfer_status != NIXL_SUCCESS) { if (xfer_status != NIXL_SUCCESS) xfer_status = A1->getXferStatus(req2); assert (xfer_status >= 0); + CHECK_NIXL_ERROR(xfer_status, "Failed to get xfer status", agent1); } for(int i = (n_bufs/2); imakeXferReq(NIXL_WRITE, src_side, indices2, dst_side, indices2, req3, &extra_params1); assert (status == NIXL_SUCCESS); - + CHECK_NIXL_ERROR(status, "Failed to make xfer req", agent1); xfer_status = A1->postXferReq(req3); while (xfer_status != NIXL_SUCCESS) { if (xfer_status != NIXL_SUCCESS) xfer_status = A1->getXferStatus(req3); assert (xfer_status >= 0); + CHECK_NIXL_ERROR(xfer_status, "Failed to get xfer status", agent1); } for(int i = (n_bufs/2); ireleaseXferReq(req1); assert (status == NIXL_SUCCESS); + CHECK_NIXL_ERROR(status, "Failed to release xfer req", agent1); status = A1->releaseXferReq(req2); assert (status == NIXL_SUCCESS); + CHECK_NIXL_ERROR(status, "Failed to release xfer req2", agent1); status = A1->releaseXferReq(req3); assert (status == NIXL_SUCCESS); + CHECK_NIXL_ERROR(status, "Failed to release xfer req3", agent1); + assert(status == NIXL_SUCCESS); // Commented out to test auto deregistration // status = A1->deregisterMem(mem_list1, &extra_params1); @@ -514,8 +577,10 @@ nixl_status_t sideXferTest(nixlAgent* A1, nixlAgent* A2, nixlXferReqH* src_handl status = A1->releasedDlistH(src_side); assert (status == NIXL_SUCCESS); + CHECK_NIXL_ERROR(status, "Failed 
to release xfer src dlist", agent1); status = A1->releasedDlistH(dst_side); assert (status == NIXL_SUCCESS); + CHECK_NIXL_ERROR(status, "Failed to release xfer dst dlist", agent1); for(int i = 0; i= 0); assert (ret2 == NIXL_SUCCESS); + CHECK_NIXL_ERROR((status < 0), "Failed to post Xfer Req", agent1); + CHECK_NIXL_ERROR(ret2, "Failed to get notifs", agent2); n_notifs = notif_map.size(); } std::vector agent1_notifs = notif_map[agent1]; assert (agent1_notifs.size() == 1); assert (agent1_notifs.front() == "notification"); + + CHECK_NIXL_ERROR((agent1_notifs.size() != 1), "Incorrect notif size", agent1); + CHECK_NIXL_ERROR((agent1_notifs.front() != "notification"), "Incorrect notification", agent1); notif_map[agent1].clear(); // Redundant, for testing notif_map.clear(); n_notifs = 0; @@ -730,10 +814,12 @@ main(int argc, char **argv) { std::cout << "performing partialMdTest with backends " << bknd1 << " " << bknd2 << "\n"; ret1 = partialMdTest(&A1, &A2, bknd1, bknd2); assert (ret1 == NIXL_SUCCESS); + CHECK_NIXL_ERROR(ret1, "Fail to run partialMDTest", agent1); std::cout << "performing sideXferTest with backends " << bknd1 << " " << bknd2 << "\n"; ret1 = sideXferTest(&A1, &A2, req_handle, bknd2); assert (ret1 == NIXL_SUCCESS); + CHECK_NIXL_ERROR(ret1, "Fail to run sideXferTest", agent1); std::cout << "Performing local test\n"; extra_params1.notifMsg = "local_notif"; @@ -741,6 +827,8 @@ main(int argc, char **argv) { ret2 = A1.createXferReq(NIXL_WRITE, req_src_descs, req_ldst_descs, agent1, req_handle2, &extra_params1); assert (ret2 == NIXL_SUCCESS); + CHECK_NIXL_ERROR(ret1, "Failed to create Xfer Req", agent1); + status = A1.postXferReq(req_handle2); std::cout << "Local transfer was posted\n"; @@ -749,6 +837,8 @@ main(int argc, char **argv) { if (n_notifs == 0) ret2 = A1.getNotifs(notif_map); assert (status >= 0); assert (ret2 == NIXL_SUCCESS); + CHECK_NIXL_ERROR((status < 0), "Failed to post Xfer Req", agent1); + CHECK_NIXL_ERROR(ret2, "Failed to get notifs", agent2); 
n_notifs = notif_map.size(); } @@ -757,24 +847,34 @@ main(int argc, char **argv) { assert (agent1_notifs.front() == "local_notif"); assert (equal_buf((void*) req_src.addr, (void*) req_ldst.addr, req_size) == true); + CHECK_NIXL_ERROR((agent1_notifs.size() != 1), "Incorrect notif size", agent1); + CHECK_NIXL_ERROR((agent1_notifs.front() != "local_notif"), "Incorrect notification", agent1); + CHECK_NIXL_ERROR((!equal_buf((void *)req_src.addr, (void *)req_ldst.addr, req_size) == true), + "Buffer mismatch after transfer", + agent1); ret1 = A1.releaseXferReq(req_handle); ret2 = A1.releaseXferReq(req_handle2); assert (ret1 == NIXL_SUCCESS); assert (ret2 == NIXL_SUCCESS); + CHECK_NIXL_ERROR(ret1, "Failed to release Xfer Req", agent1); + CHECK_NIXL_ERROR(ret2, "Failed to release Xfer Req2", agent1); ret1 = A1.deregisterMem(dlist1, &extra_params1); ret2 = A2.deregisterMem(dlist2, &extra_params2); assert (ret1 == NIXL_SUCCESS); assert (ret2 == NIXL_SUCCESS); + CHECK_NIXL_ERROR(ret1, "Failed to deregister memory", agent1); + CHECK_NIXL_ERROR(ret2, "Failed to deregister memory", agent2); //only initiator should call invalidate ret1 = A1.invalidateRemoteMD(agent2); //A2.invalidateRemoteMD(agent1); assert (ret1 == NIXL_SUCCESS); + CHECK_NIXL_ERROR(ret1, "Failed to invalidate remote MD", agent1); free(addr1); free(addr2); free(addr3); diff --git a/test/nixl/nixl_test.cpp b/test/nixl/nixl_test.cpp index 03095c2cf..adb99393a 100644 --- a/test/nixl/nixl_test.cpp +++ b/test/nixl/nixl_test.cpp @@ -83,7 +83,13 @@ static void targetThread(nixlAgent &agent, nixl_opt_args_t *extra_params, int th /** Only send desc list */ nixlSerDes serdes; - assert(dram_for_ucx.trim().serialize(&serdes) == NIXL_SUCCESS); + nixl_status_t st = dram_for_ucx.trim().serialize(&serdes); + assert(st == NIXL_SUCCESS); + + if (st != NIXL_SUCCESS) { + std::cerr << "Failed to serialize registry dlist " << st << std::endl; + exit(EXIT_FAILURE); + } std::cout << "Thread " << thread_id << " Wait for initiator and then 
send xfer descs\n"; std::string message = serdes.exportStr(); @@ -145,6 +151,11 @@ static void initiatorThread(nixlAgent &agent, nixl_opt_args_t *extra_params, nixl_status_t ret = agent.getNotifs(notifs, extra_params); assert(ret >= 0); + if (ret < 0) { + std::cerr << "Failed to get notifs, status: " << ret << std::endl; + exit(EXIT_FAILURE); + } + if (notifs.size() > 0) { std::lock_guard lock(shared_state.mtx); for (const auto ¬if : notifs[target]) { @@ -195,6 +206,12 @@ static void initiatorThread(nixlAgent &agent, nixl_opt_args_t *extra_params, ret = agent.getXferStatus(treq); assert(ret >= 0); } + + if (ret != NIXL_SUCCESS) { + std::cerr << "Thread " << thread_id << " Error getting transfer status " << ret << "\n"; + exit(-1); + } + std::cout << "Thread " << thread_id << " Completed Sending Data using UCX backend\n"; agent.releaseXferReq(treq); agent.invalidateLocalMD(&md_extra_params); diff --git a/test/unit/plugins/ucx/ucx_backend_multi.cpp b/test/unit/plugins/ucx/ucx_backend_multi.cpp index cf3cdab0f..e00bc047b 100644 --- a/test/unit/plugins/ucx/ucx_backend_multi.cpp +++ b/test/unit/plugins/ucx/ucx_backend_multi.cpp @@ -20,6 +20,15 @@ #include "ucx_backend.h" +#define CHECK_NIXL_ERROR(result, message, agent) \ + do { \ + if (0 != result) { \ + std::cerr << "NIXL: " << message << " for agent " << agent \ + << " (Error code: " << result << ")" << std::endl; \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + // Temporarily while fixing CI/CD pipeline #define USE_PTHREAD false @@ -61,12 +70,14 @@ void test_thread(int id) ret = ucx->loadRemoteConnInfo(other, conn_info[!id]); assert(ret == NIXL_SUCCESS); + CHECK_NIXL_ERROR(ret, "Failed to load remote conn info", my_name); //one-sided connect if(!id) ret = ucx->connect(other); assert(ret == NIXL_SUCCESS); + CHECK_NIXL_ERROR(ret, "Failed to connect", my_name); done[id] = true; while(!done[!id]) diff --git a/test/unit/plugins/ucx/ucx_backend_test.cpp b/test/unit/plugins/ucx/ucx_backend_test.cpp index 
6f893b800..509096107 100644 --- a/test/unit/plugins/ucx/ucx_backend_test.cpp +++ b/test/unit/plugins/ucx/ucx_backend_test.cpp @@ -23,6 +23,15 @@ using namespace std; +#define CHECK_NIXL_ERROR(result, message) \ + do { \ + if (0 != result) { \ + std::cerr << "NIXL: " << message << " (Error code: " << result << ")" << std::endl; \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + + #ifdef HAVE_CUDA #include @@ -144,8 +153,8 @@ std::string memType2Str(nixl_mem_t mem_type) case FILE_SEG: return std::string("FILE"); default: - std::cout << "Unsupported memory type!" << std::endl; assert(0); + CHECK_NIXL_ERROR(1, "Unsupported memory type!"); } } @@ -203,10 +212,11 @@ void allocateBuffer(nixl_mem_t mem_type, int dev_id, size_t len, void* &addr) } #endif default: - std::cout << "Unsupported memory type!" << std::endl; assert(0); + CHECK_NIXL_ERROR(1, "Unsupported memory type!"); } assert(addr); + CHECK_NIXL_ERROR((addr == nullptr), "Failed to allocate buffer"); } void releaseBuffer(nixl_mem_t mem_type, int dev_id, void* &addr) @@ -222,8 +232,8 @@ void releaseBuffer(nixl_mem_t mem_type, int dev_id, void* &addr) break; #endif default: - std::cout << "Unsupported memory type!" << std::endl; assert(0); + CHECK_NIXL_ERROR(1, "Unsupported memory type!"); } } @@ -240,8 +250,8 @@ void doMemset(nixl_mem_t mem_type, int dev_id, void *addr, char byte, size_t len break; #endif default: - std::cout << "Unsupported memory type!" << std::endl; assert(0); + CHECK_NIXL_ERROR(1, "Unsupported memory type!"); } } @@ -259,8 +269,8 @@ void *getValidationPtr(nixl_mem_t mem_type, void *addr, size_t len) } #endif default: - std::cout << "Unsupported memory type!" << std::endl; assert(0); + CHECK_NIXL_ERROR(1, "Unsupported memory type!"); } } @@ -275,15 +285,15 @@ void *releaseValidationPtr(nixl_mem_t mem_type, void *addr) break; #endif default: - std::cout << "Unsupported memory type!" 
<< std::endl; assert(0); + CHECK_NIXL_ERROR(1, "Unsupported memory type!"); } return NULL; } void allocateWrongGPUTest(nixlUcxEngine *ucx, int dev_id) { - nixlBlobDesc desc; + nixlBlobDesc desc = {0}; nixlBackendMD* md; void* buf; @@ -295,6 +305,7 @@ allocateWrongGPUTest(nixlUcxEngine *ucx, int dev_id) { int ret = ucx->registerMem(desc, VRAM_SEG, md); assert(ret == NIXL_ERR_NOT_SUPPORTED); + CHECK_NIXL_ERROR((ret != NIXL_ERR_NOT_SUPPORTED), "Failed to register memory"); releaseBuffer(VRAM_SEG, dev_id, buf); } @@ -317,6 +328,7 @@ allocateAndRegister(nixlUcxEngine *ucx, int ret = ucx->registerMem(desc, mem_type, md); assert(ret == NIXL_SUCCESS); + CHECK_NIXL_ERROR(ret, "Failed to register memory"); } void @@ -345,10 +357,12 @@ loadRemote(nixlUcxEngine *ucx, ucx->getPublicData(lmd, info.metaInfo); assert(info.metaInfo.size() > 0); + CHECK_NIXL_ERROR((info.metaInfo.size() == 0), "Failed to get public data"); // We get the data from the cetnral location and populate the backend, and receive remote_meta int ret = ucx->loadRemoteMD (info, mem_type, agent, rmd); assert(NIXL_SUCCESS == ret); + CHECK_NIXL_ERROR(ret, "Failed to load remote MD"); } void populateDescs(nixl_meta_dlist_t &descs, int dev_id, void *addr, int desc_cnt, size_t desc_size, nixlBackendMD* &md) @@ -412,12 +426,14 @@ performTransfer(nixlUcxEngine *ucx1, nixlBackendReqH *new_handle = nullptr; ret3 = ucx1->prepXfer(op, req_src_descs, req_dst_descs, remote_agent, new_handle, &opt_args); assert(ret3 == NIXL_SUCCESS); + CHECK_NIXL_ERROR(ret3, "Failed to prep xfer"); hiter.setHandle(new_handle); } nixlBackendReqH *&handle = hiter.getHandle(); ret3 = ucx1->postXfer(op, req_src_descs, req_dst_descs, remote_agent, handle, &opt_args); assert( ret3 == NIXL_SUCCESS || ret3 == NIXL_IN_PROG); - + CHECK_NIXL_ERROR(ret3, "Failed to post xfer"); + CHECK_NIXL_ERROR(!((ret3 == NIXL_SUCCESS) || (ret3 == NIXL_IN_PROG)), "Failed to post xfer"); if (ret3 == NIXL_SUCCESS) { cout << "\t\tWARNING: Tansfer request completed 
immediately - no testing non-inline path" << endl; @@ -430,6 +446,8 @@ performTransfer(nixlUcxEngine *ucx1, ucx2->progress(); } assert( ret3 == NIXL_SUCCESS || ret3 == NIXL_IN_PROG); + CHECK_NIXL_ERROR(!((ret3 == NIXL_SUCCESS) || (ret3 == NIXL_IN_PROG)), + "Failed to check xfer"); } } @@ -452,12 +470,16 @@ performTransfer(nixlUcxEngine *ucx1, ucx1->progress(); } assert(ret3 == NIXL_SUCCESS); + CHECK_NIXL_ERROR(ret3, "Failed to get notifs"); } assert(ret2 == 1); assert(target_notifs.front().first == "Agent1"); assert(target_notifs.front().second == test_str); + CHECK_NIXL_ERROR((target_notifs.front().first != "Agent1"), "Incorrect front notif source"); + CHECK_NIXL_ERROR((target_notifs.front().second != test_str), + "Incorrect front notif message"); cout << "OK" << endl; } @@ -470,6 +492,7 @@ performTransfer(nixlUcxEngine *ucx1, // Perform correctness check. for(size_t i = 0; i < len; i++){ assert( ((uint8_t*) chkptr1)[i] == ((uint8_t*) chkptr2)[i]); + CHECK_NIXL_ERROR((((uint8_t *)chkptr1)[i] != ((uint8_t *)chkptr2)[i]), "Data mismatch"); } releaseValidationPtr(req_src_descs.getType(), chkptr1); @@ -500,8 +523,10 @@ test_intra_agent_transfer(bool p_thread, nixlUcxEngine *ucx, nixl_mem_t mem_type std::string conn_info1; ret1 = ucx->getConnInfo(conn_info1); assert(ret1 == NIXL_SUCCESS); + CHECK_NIXL_ERROR(ret1, "Failed to get conn info"); ret1 = ucx->loadRemoteConnInfo (agent1, conn_info1); assert(ret1 == NIXL_SUCCESS); + CHECK_NIXL_ERROR(ret1, "Failed to load remote conn info"); std::cout << "Local connection complete\n"; @@ -520,7 +545,7 @@ test_intra_agent_transfer(bool p_thread, nixlUcxEngine *ucx, nixl_mem_t mem_type nixlBackendMD* rmd2; ret1 = ucx->loadLocalMD (lmd2, rmd2); assert(ret1 == NIXL_SUCCESS); - + CHECK_NIXL_ERROR(ret1, "Failed to load local MD"); nixl_meta_dlist_t req_src_descs (mem_type); populateDescs(req_src_descs, 0, addr1, desc_cnt, desc_size, lmd1); @@ -586,12 +611,15 @@ test_inter_agent_transfer(bool p_thread, std::string conn_info1, 
conn_info2; ret = ucx1->getConnInfo(conn_info1); assert(ret == NIXL_SUCCESS); + CHECK_NIXL_ERROR(ret, "Failed to get conn info"); ret = ucx2->getConnInfo(conn_info2); assert(ret == NIXL_SUCCESS); + CHECK_NIXL_ERROR(ret, "Failed to get conn info"); // We assumed we put them to central location and now receiving it on the other process ret = ucx1->loadRemoteConnInfo (agent2, conn_info2); assert(ret == NIXL_SUCCESS); + CHECK_NIXL_ERROR(ret, "Failed to load remote conn info"); // TODO: Causes race condition - investigate conn management implementation // ret = ucx2->loadRemoteConnInfo (agent1, conn_info1); @@ -663,12 +691,16 @@ test_inter_agent_transfer(bool p_thread, ret2 = ucx2->getNotifs(target_notifs); ret = target_notifs.size(); assert(ret2 == NIXL_SUCCESS); + CHECK_NIXL_ERROR(ret2, "Failed to get notifs"); } assert(ret == 1); assert(target_notifs.front().first == "Agent1"); + CHECK_NIXL_ERROR((target_notifs.front().first != "Agent1"), "Incorrect front notif source"); assert(target_notifs.front().second == test_str); + CHECK_NIXL_ERROR((target_notifs.front().second != test_str), + "Incorrect front notif message"); cout << "OK" << endl; } diff --git a/test/unit/plugins/ucx_mo/ucx_mo_backend_test.cpp b/test/unit/plugins/ucx_mo/ucx_mo_backend_test.cpp index fd23ba7ec..c4cb6080e 100644 --- a/test/unit/plugins/ucx_mo/ucx_mo_backend_test.cpp +++ b/test/unit/plugins/ucx_mo/ucx_mo_backend_test.cpp @@ -23,6 +23,14 @@ using namespace std; +#define CHECK_NIXL_ERROR(result, message) \ + do { \ + if (0 != result) { \ + std::cerr << "NIXL: " << message << " (Error code: " << result << ")" << std::endl; \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + #ifdef HAVE_CUDA #include @@ -66,8 +74,8 @@ std::string memType2Str(nixl_mem_t mem_type) case FILE_SEG: return std::string("FILE"); default: - std::cout << "Unsupported memory type!" 
<< std::endl; assert(0); + CHECK_NIXL_ERROR(1, "Unsupported memory type!"); } } @@ -86,6 +94,7 @@ nixlBackendEngine *createEngine(std::string name, uint32_t ndev, bool p_thread) ucx_mo = (nixlBackendEngine*) new nixlUcxMoEngine (&init); assert(!ucx_mo->getInitErr()); + CHECK_NIXL_ERROR(ucx_mo->getInitErr(), "Failed to initialize worker1"); if (ucx_mo->getInitErr()) { std::cout << "Failed to initialize worker1" << std::endl; exit(1); @@ -133,13 +142,15 @@ static int cudaQueryAddr(void *address, bool &is_dev, void allocateBuffer(nixl_mem_t mem_type, int dev_id, size_t len, void* &addr) { + int ret; switch(mem_type) { case DRAM_SEG: //addr = calloc(1, len); - posix_memalign(&addr, 4096, len); + ret = posix_memalign(&addr, 4096, len); + CHECK_NIXL_ERROR(ret, "Failed to allocate mem aligned buffer"); break; #ifdef HAVE_CUDA - case VRAM_SEG:{ + case VRAM_SEG: { bool is_dev; CUdevice dev; CUcontext ctx; @@ -153,10 +164,11 @@ void allocateBuffer(nixl_mem_t mem_type, int dev_id, size_t len, void* &addr) } #endif default: - std::cout << "Unsupported memory type!" << std::endl; assert(0); + CHECK_NIXL_ERROR(1, "Unsupported memory type!"); } assert(addr); + CHECK_NIXL_ERROR((addr == nullptr), "Failed to allocate buffer"); } void releaseBuffer(nixl_mem_t mem_type, int dev_id, void* &addr) @@ -172,8 +184,8 @@ void releaseBuffer(nixl_mem_t mem_type, int dev_id, void* &addr) break; #endif default: - std::cout << "Unsupported memory type!" << std::endl; assert(0); + CHECK_NIXL_ERROR(1, "Unsupported memory type!"); } } @@ -190,8 +202,8 @@ void doMemset(nixl_mem_t mem_type, int dev_id, void *addr, char byte, size_t len break; #endif default: - std::cout << "Unsupported memory type!" << std::endl; assert(0); + CHECK_NIXL_ERROR(1, "Unsupported memory type!"); } } @@ -209,8 +221,8 @@ void *getValidationPtr(nixl_mem_t mem_type, void *addr, size_t len) } #endif default: - std::cout << "Unsupported memory type!" 
<< std::endl; assert(0); + CHECK_NIXL_ERROR(1, "Unsupported memory type!"); } } @@ -225,8 +237,8 @@ void *releaseValidationPtr(nixl_mem_t mem_type, void *addr) break; #endif default: - std::cout << "Unsupported memory type!" << std::endl; assert(0); + CHECK_NIXL_ERROR(1, "Unsupported memory type!"); } return NULL; } @@ -270,6 +282,7 @@ void createLocalDescs(nixlBackendEngine *ucx, nixl_meta_dlist_t &descs, *((nixlBasicDesc*)&desc_m) = desc; int ret = ucx->registerMem(desc_s, descs.getType(), desc_m.metadataP); assert(ret == NIXL_SUCCESS); + CHECK_NIXL_ERROR(ret, "Failed to register ucx memory"); descs.addDesc(desc_m); } } @@ -310,10 +323,12 @@ void createRemoteDescs(nixlBackendEngine *src_ucx, } else { status = src_ucx->getPublicData(src_descs[i].metadataP, desc_s.metaInfo); assert(NIXL_SUCCESS == status); + CHECK_NIXL_ERROR(status, "Failed to get src_ucx public data"); status = dst_ucx->loadRemoteMD (desc_s, src_descs.getType(), agent, desc_m.metadataP); } assert(status == NIXL_SUCCESS); + CHECK_NIXL_ERROR(status, "Failed to load dst_ucx remote MD"); dst_descs.addDesc(desc_m); } } @@ -325,6 +340,7 @@ void destroyRemoteDescs(nixlBackendEngine *dst_ucx, for(int i = 0; i < dst_descs.descCount(); i++) { status = dst_ucx->unloadMD (dst_descs[i].metadataP); assert(status == NIXL_SUCCESS); + CHECK_NIXL_ERROR(status, "Failed to unload dst_ucx MD"); } while(dst_descs.descCount()) { @@ -356,8 +372,10 @@ void performTransfer(nixlBackendEngine *ucx1, nixlBackendEngine *ucx2, // Also maybe we would remove the WRITE and let the backend class decide the op status = ucx1->prepXfer(op, req_src_descs, req_dst_descs, remote_agent, handle, &opt_args); assert(status == NIXL_SUCCESS); + CHECK_NIXL_ERROR(status, "Failed to prep ucx1 xfer"); status = ucx1->postXfer(op, req_src_descs, req_dst_descs, remote_agent, handle, &opt_args); assert(status == NIXL_SUCCESS || status == NIXL_IN_PROG); + CHECK_NIXL_ERROR(status, "Failed to post ucx1 xfer"); if (status == NIXL_SUCCESS) { @@ -371,6 
+389,8 @@ void performTransfer(nixlBackendEngine *ucx1, nixlBackendEngine *ucx2, ((nixlUcxMoEngine *)ucx2)->progress(); } assert( (NIXL_SUCCESS == status) || (NIXL_IN_PROG == status) ); + CHECK_NIXL_ERROR(!((NIXL_SUCCESS == status) || (NIXL_IN_PROG == status)), + "Failed to check ucx1 xfer"); } ucx1->releaseReqH(handle); } @@ -384,14 +404,19 @@ void performTransfer(nixlBackendEngine *ucx1, nixlBackendEngine *ucx2, while(!target_notifs.size()){ status = ucx2->getNotifs(target_notifs); assert(NIXL_SUCCESS == status); + CHECK_NIXL_ERROR(status, "Failed to get ucx2 notifs"); if(progress){ ((nixlUcxMoEngine *)ucx1)->progress(); } } assert(target_notifs.size() == 1); + CHECK_NIXL_ERROR((target_notifs.size() != 1), "Incorrect number of target notifs"); assert(target_notifs.front().first == "Agent1"); + CHECK_NIXL_ERROR((target_notifs.front().first != "Agent1"), "Incorrect front notif source"); assert(target_notifs.front().second == test_str); + CHECK_NIXL_ERROR((target_notifs.front().second != test_str), + "Incorrect front notif message"); cout << "OK" << endl; } @@ -403,6 +428,7 @@ void performTransfer(nixlBackendEngine *ucx1, nixlBackendEngine *ucx2, auto sdesc = req_src_descs[i]; auto ddesc = req_dst_descs[i]; assert(sdesc.len == ddesc.len); + CHECK_NIXL_ERROR((sdesc.len != ddesc.len), "Data length mismatch"); size_t len = ddesc.len; chkptr1 = getValidationPtr(req_src_descs.getType(), (void*)sdesc.addr, len); chkptr2 = getValidationPtr(req_dst_descs.getType(), (void*)ddesc.addr, len); @@ -410,6 +436,7 @@ void performTransfer(nixlBackendEngine *ucx1, nixlBackendEngine *ucx2, // Perform correctness check. 
for(size_t i = 0; i < len; i++){ assert( ((uint8_t*) chkptr1)[i] == ((uint8_t*) chkptr2)[i]); + CHECK_NIXL_ERROR((((uint8_t *)chkptr1)[i] != ((uint8_t *)chkptr2)[i]), "Data mismatch"); } releaseValidationPtr(req_src_descs.getType(), chkptr1); @@ -451,18 +478,18 @@ void test_agent_transfer(bool p_thread, std::string conn_info1; status = ucx1->getConnInfo(conn_info1); assert(NIXL_SUCCESS == status); - + CHECK_NIXL_ERROR(status, "Failed to get ucx1 conn info"); std::string conn_info2; status = ucx2->getConnInfo(conn_info2); assert(NIXL_SUCCESS == status); - + CHECK_NIXL_ERROR(status, "Failed to get ucx2 conn info"); // We assumed we put them to central location and now receiving it on the other process if (is_local) { agent = &agent1; } status = ucx1->loadRemoteConnInfo (*agent, conn_info2); assert(NIXL_SUCCESS == status); - + CHECK_NIXL_ERROR(status, "Failed to load ucx1 remote conn info"); // TODO: Causes race condition - investigate conn management implementation // ret = ucx2->loadRemoteConnInfo (agent1, conn_info1); @@ -531,8 +558,12 @@ void test_agent_transfer(bool p_thread, } assert(target_notifs.size() == 1); + CHECK_NIXL_ERROR((target_notifs.size() != 1), "Incorrect number of target notifs"); assert(target_notifs.front().first == "Agent1"); + CHECK_NIXL_ERROR((target_notifs.front().first != "Agent1"), "Incorrect front notif source"); assert(target_notifs.front().second == test_str); + CHECK_NIXL_ERROR((target_notifs.front().second != test_str), + "Incorrect front notif message"); cout << "OK" << endl; } diff --git a/test/unit/utils/common/map_perf.cpp b/test/unit/utils/common/map_perf.cpp index 13e731b65..44d3be2aa 100644 --- a/test/unit/utils/common/map_perf.cpp +++ b/test/unit/utils/common/map_perf.cpp @@ -25,6 +25,15 @@ #include "common/str_tools.h" +#define CHECK_NIXL_ERROR(result, message, agent) \ + do { \ + if (0 != result) { \ + std::cerr << "NIXL: " << message << " for agent " << agent \ + << " (Error code: " << result << ")" << std::endl; \ + 
exit(EXIT_FAILURE); \ + } \ + } while (0) + std::string generate_random_string(size_t length) { const std::string characters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; std::random_device random_device; @@ -137,11 +146,19 @@ int main() { strEqual tester; assert(tester.operator() ("abcdefgh","abcdefgh") == true); + CHECK_NIXL_ERROR(tester.operator()("abcdefgh", "abcdefgh"), "Test failed", "test"); assert(tester.operator() ("abcdefgh","abdcefgh") == false); + CHECK_NIXL_ERROR(tester.operator()("abcdefgh", "abdcefgh"), "Test failed", "test"); assert(tester.operator() ("abcdefgh123","abcdefgh123") == true); + CHECK_NIXL_ERROR(tester.operator()("abcdefgh123", "abcdefgh123"), "Test failed", "test"); assert(tester.operator() ("abcdefgh123","aadcefgh123") == false); + CHECK_NIXL_ERROR(tester.operator()("abcdefgh123", "aadcefgh123"), "Test failed", "test"); assert(tester.operator() ("12345678abcdefgh","12345678abcdefgh") == true); + CHECK_NIXL_ERROR( + tester.operator()("12345678abcdefgh", "12345678abcdefgh"), "Test failed", "test"); assert(tester.operator() ("12345678abcdefgh","12345687abcdefgh") == false); + CHECK_NIXL_ERROR( + tester.operator()("12345678abcdefgh", "12345687abcdefgh"), "Test failed", "test"); test_comparison_perf(16, 8); test_comparison_perf(16, 16); From 88368edda746b22cf7ab104925b976ef59795f06 Mon Sep 17 00:00:00 2001 From: Adit Ranadive Date: Tue, 7 Oct 2025 00:28:53 -0700 Subject: [PATCH 02/17] Fix release build Signed-off-by: Adit Ranadive --- test/nixl/agent_example.cpp | 2 +- test/unit/plugins/gpunetio/nixl_gpunetio_stream_test.cu | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/nixl/agent_example.cpp b/test/nixl/agent_example.cpp index 774f139fa..2f5914790 100644 --- a/test/nixl/agent_example.cpp +++ b/test/nixl/agent_example.cpp @@ -273,7 +273,7 @@ nixl_status_t partialMdTest(nixlAgent* A1, nixlAgent* A2, nixlBackendH* backend1 status = A1->prepXferDlist(agent2, dst_mem_lists[update].trim(), 
dst_side, &extra_params1); assert(status != NIXL_SUCCESS); assert(dst_side == nullptr); - CHECK_NIXL_ERROR(status, "Prep xfer dlist should not be successful", agent1); + CHECK_NIXL_ERROR((status == NIXL_SUCCESS), "Prep xfer dlist should not be successful", agent1); CHECK_NIXL_ERROR((dst_side != nullptr), "Dst side is not null", agent1); } diff --git a/test/unit/plugins/gpunetio/nixl_gpunetio_stream_test.cu b/test/unit/plugins/gpunetio/nixl_gpunetio_stream_test.cu index 2e09e2a98..0b368e4af 100644 --- a/test/unit/plugins/gpunetio/nixl_gpunetio_stream_test.cu +++ b/test/unit/plugins/gpunetio/nixl_gpunetio_stream_test.cu @@ -225,7 +225,7 @@ main (int argc, char *argv[]) { nixl_notifs_t notifs; size_t buf_size = SIZE; uint32_t buf_num = TRANSFER_NUM_BUFFER; - uintptr_t data_address_ptr; + uintptr_t data_address_ptr = 0; /** Argument Parsing */ if (argc < 5) { From d6c0fbcfc8ca87f99caa2dae6ce0189d4454553b Mon Sep 17 00:00:00 2001 From: Adit Ranadive Date: Wed, 8 Oct 2025 01:58:18 -0700 Subject: [PATCH 03/17] Address comments Signed-off-by: Adit Ranadive --- examples/cpp/nixl_etcd_example.cpp | 88 ++--- src/utils/common/util.h | 18 + test/nixl/agent_example.cpp | 344 +++++++----------- test/nixl/nixl_test.cpp | 23 +- test/unit/plugins/gpunetio/meson.build | 3 +- .../gpunetio/nixl_gpunetio_stream_test.cu | 47 ++- test/unit/plugins/ucx/ucx_backend_multi.cpp | 16 +- test/unit/plugins/ucx/ucx_backend_test.cpp | 64 +--- .../plugins/ucx_mo/ucx_mo_backend_test.cpp | 40 +- test/unit/utils/common/map_perf.cpp | 31 +- 10 files changed, 222 insertions(+), 452 deletions(-) diff --git a/examples/cpp/nixl_etcd_example.cpp b/examples/cpp/nixl_etcd_example.cpp index 13f96c0dc..c2a3c4701 100644 --- a/examples/cpp/nixl_etcd_example.cpp +++ b/examples/cpp/nixl_etcd_example.cpp @@ -15,21 +15,13 @@ * limitations under the License. 
*/ #include -#include #include #include #include +#include "common/util.h" #include "nixl.h" -#define CHECK_NIXL_ERROR(result, message, agent) \ - do { \ - if (0 != result) { \ - std::cerr << "NIXL: " << message << " for agent " << agent \ - << " (Error code: " << result << ")" << std::endl; \ - exit(EXIT_FAILURE); \ - } \ - } while (0) // Change these values to match your etcd setup const std::string ETCD_ENDPOINT = "http://localhost:2379"; @@ -139,9 +131,8 @@ int main() { std::vector plugins; ret1 = A1.getAvailPlugins(plugins); - assert (ret1 == NIXL_SUCCESS); + CHECK_NIXL_ERROR_AGENT(ret1, "Failed to get available plugins", AGENT1_NAME); - CHECK_NIXL_ERROR(ret1, "Failed to get available plugins", AGENT1_NAME); std::cout << "Available plugins:\n"; for (nixl_backend_t b: plugins) @@ -150,11 +141,8 @@ int main() { ret1 = A1.getPluginParams("UCX", mems1, init1); ret2 = A2.getPluginParams("UCX", mems2, init2); - assert (ret1 == NIXL_SUCCESS); - assert (ret2 == NIXL_SUCCESS); - - CHECK_NIXL_ERROR(ret1, "Failed to get plugin params for UCX", AGENT1_NAME); - CHECK_NIXL_ERROR(ret2, "Failed to get plugin params for UCX", AGENT2_NAME); + CHECK_NIXL_ERROR_AGENT(ret1, "Failed to get plugin params for UCX", AGENT1_NAME); + CHECK_NIXL_ERROR_AGENT(ret2, "Failed to get plugin params for UCX", AGENT2_NAME); std::cout << "Params before init:\n"; printParams(init1, mems1); @@ -164,21 +152,15 @@ int main() { nixlBackendH* ucx1, *ucx2; ret1 = A1.createBackend("UCX", init1, ucx1); ret2 = A2.createBackend("UCX", init2, ucx2); - - assert (ret1 == NIXL_SUCCESS); - assert (ret2 == NIXL_SUCCESS); - - CHECK_NIXL_ERROR(ret1, "Failed to create UCX backend", AGENT1_NAME); - CHECK_NIXL_ERROR(ret2, "Failed to create UCX backend", AGENT2_NAME); + CHECK_NIXL_ERROR_AGENT(ret1, "Failed to create UCX backend", AGENT1_NAME); + CHECK_NIXL_ERROR_AGENT(ret2, "Failed to create UCX backend", AGENT2_NAME); ret1 = A1.getBackendParams(ucx1, mems1, init1); ret2 = A2.getBackendParams(ucx2, mems2, init2); - 
assert (ret1 == NIXL_SUCCESS); - assert (ret2 == NIXL_SUCCESS); + CHECK_NIXL_ERROR_AGENT(ret1, "Failed to get UCX backend params", AGENT1_NAME); + CHECK_NIXL_ERROR_AGENT(ret2, "Failed to get UCX backend params", AGENT2_NAME); - CHECK_NIXL_ERROR(ret1, "Failed to get UCX backend params", AGENT1_NAME); - CHECK_NIXL_ERROR(ret2, "Failed to get UCX backend params", AGENT2_NAME); std::cout << "Params after init:\n"; printParams(init1, mems1); @@ -186,11 +168,9 @@ int main() { // Register memory with both agents status = registerMemory(&addr1, &A1, &dlist1, &extra_params1, ucx1, 0xaa); - assert(status == NIXL_SUCCESS); - CHECK_NIXL_ERROR(status, "Failed to register memory", AGENT1_NAME); + CHECK_NIXL_ERROR_AGENT(status, "Failed to register memory", AGENT1_NAME); status = registerMemory(&addr2, &A2, &dlist2, &extra_params2, ucx2, 0xbb); - assert(status == NIXL_SUCCESS); - CHECK_NIXL_ERROR(ret2, "Failed to register memory", AGENT2_NAME); + CHECK_NIXL_ERROR_AGENT(status, "Failed to register memory", AGENT2_NAME); std::cout << "\nEtcd Metadata Exchange Demo\n"; std::cout << "==========================\n"; @@ -200,12 +180,10 @@ int main() { // Both agents send their metadata to etcd status = A1.sendLocalMD(); - assert(status == NIXL_SUCCESS); - CHECK_NIXL_ERROR(status, "Failed to send local MD", AGENT1_NAME); + CHECK_NIXL_ERROR_AGENT(status, "Failed to send local MD", AGENT1_NAME); status = A2.sendLocalMD(); - assert(status == NIXL_SUCCESS); - CHECK_NIXL_ERROR(status, "Failed to send local MD", AGENT2_NAME); + CHECK_NIXL_ERROR_AGENT(status, "Failed to send local MD", AGENT2_NAME); // Give etcd time to process std::this_thread::sleep_for(std::chrono::seconds(1)); @@ -215,13 +193,11 @@ int main() { // Agent1 fetches metadata for Agent2 status = A1.fetchRemoteMD(AGENT2_NAME); - assert(status == NIXL_SUCCESS); - CHECK_NIXL_ERROR(ret1, "Failed to fetch remote MD", AGENT1_NAME); + CHECK_NIXL_ERROR_AGENT(status, "Failed to fetch remote MD", AGENT1_NAME); // Agent2 fetches metadata for 
Agent1 status = A2.fetchRemoteMD(AGENT1_NAME); - assert(status == NIXL_SUCCESS); - CHECK_NIXL_ERROR(ret1, "Failed to fetch remote MD", AGENT2_NAME); + CHECK_NIXL_ERROR_AGENT(status, "Failed to fetch remote MD", AGENT2_NAME); // Do transfer from Agent 1 to Agent 2 size_t req_size = 8; @@ -254,8 +230,7 @@ int main() { extra_params1.hasNotif = true; ret1 = A1.createXferReq(NIXL_WRITE, req_src_descs, req_dst_descs, AGENT2_NAME, req_handle, &extra_params1); std::cout << "Xfer request created, status: " << nixlEnumStrings::statusStr(ret1) << std::endl; - assert(ret1 == NIXL_SUCCESS); - CHECK_NIXL_ERROR(ret1, "Failed to create Xfer Req", AGENT1_NAME); + CHECK_NIXL_ERROR_AGENT(ret1, "Failed to create Xfer Req", AGENT1_NAME); status = A1.postXferReq(req_handle); @@ -267,26 +242,21 @@ int main() { while (status != NIXL_SUCCESS || n_notifs == 0) { if (status != NIXL_SUCCESS) status = A1.getXferStatus(req_handle); if (n_notifs == 0) ret2 = A2.getNotifs(notif_map); - assert (status >= 0); - assert (ret2 == NIXL_SUCCESS); - CHECK_NIXL_ERROR((status < 0), "Failed to post Xfer Req", AGENT1_NAME); - CHECK_NIXL_ERROR(ret2, "Failed to get notifs", AGENT2_NAME); + CHECK_NIXL_ERROR_AGENT(status, "Failed to post Xfer Req", AGENT1_NAME); + CHECK_NIXL_ERROR_AGENT(ret2, "Failed to get notifs", AGENT2_NAME); n_notifs = notif_map.size(); } std::cout << "Transfer verified\n"; ret1 = A1.releaseXferReq(req_handle); - assert (ret1 == NIXL_SUCCESS); + CHECK_NIXL_ERROR_AGENT(ret1, "Failed to release Xfer Req", AGENT1_NAME); - CHECK_NIXL_ERROR(ret1, "Failed to release Xfer Req", AGENT1_NAME); ret1 = A1.deregisterMem(dlist1, &extra_params1); ret2 = A2.deregisterMem(dlist2, &extra_params2); - assert (ret1 == NIXL_SUCCESS); - assert (ret2 == NIXL_SUCCESS); + CHECK_NIXL_ERROR_AGENT(ret1, "Failed to deregister memory", AGENT1_NAME); + CHECK_NIXL_ERROR_AGENT(ret2, "Failed to deregister memory", AGENT2_NAME); - CHECK_NIXL_ERROR(ret1, "Failed to deregister memory", AGENT1_NAME); - CHECK_NIXL_ERROR(ret2, 
"Failed to deregister memory", AGENT2_NAME); // 3. Partial Metadata Exchange std::cout << "\n3. Sending partial metadata to etcd...\n"; @@ -306,27 +276,27 @@ int main() { // Send partial metadata status = A1.sendLocalPartialMD(empty_dlist1, &conn_params1); - assert(status == NIXL_SUCCESS); + CHECK_NIXL_ERROR_AGENT(status, "Failed to send local partial MD", AGENT1_NAME); status = A2.sendLocalPartialMD(empty_dlist2, &conn_params2); - assert(status == NIXL_SUCCESS); + CHECK_NIXL_ERROR_AGENT(status, "Failed to send local partial MD", AGENT2_NAME); // Send once partial with different label conn_params1.metadataLabel = PARTIAL_LABEL_2; status = A1.sendLocalPartialMD(empty_dlist1, &conn_params1); - assert(status == NIXL_SUCCESS); + CHECK_NIXL_ERROR_AGENT(status, "Failed to send local partial MD", AGENT1_NAME); conn_params2.metadataLabel = PARTIAL_LABEL_2; status = A2.sendLocalPartialMD(empty_dlist2, &conn_params2); - assert(status == NIXL_SUCCESS); + CHECK_NIXL_ERROR_AGENT(status, "Failed to send local partial MD", AGENT2_NAME); nixl_opt_args_t fetch_params; fetch_params.metadataLabel = PARTIAL_LABEL_1; status = A1.fetchRemoteMD(AGENT2_NAME, &fetch_params); - assert(status == NIXL_SUCCESS); + CHECK_NIXL_ERROR_AGENT(status, "Failed to fetch remote MD", AGENT1_NAME); status = A2.fetchRemoteMD(AGENT1_NAME, &fetch_params); - assert(status == NIXL_SUCCESS); + CHECK_NIXL_ERROR_AGENT(status, "Failed to fetch remote MD", AGENT2_NAME); std::this_thread::sleep_for(std::chrono::seconds(1)); @@ -335,7 +305,7 @@ int main() { // Invalidate AGENT1_NAME's metadata status = A1.invalidateLocalMD(); - assert(status == NIXL_SUCCESS); + CHECK_NIXL_ERROR_AGENT(status, "Failed to invalidate local MD", AGENT1_NAME); std::this_thread::sleep_for(std::chrono::seconds(1)); @@ -348,14 +318,14 @@ int main() { // Try invalidating again, this should log a debug message std::cout << "Trying to invalidate again...\n"; status = A1.invalidateLocalMD(); - assert(status == NIXL_SUCCESS); + 
CHECK_NIXL_ERROR_AGENT(status, "Failed to invalidate local MD", AGENT1_NAME); std::this_thread::sleep_for(std::chrono::seconds(1)); // 5. Fetch metadata with invalid label. This should not block forever and print error message. std::cout << "\n5. Fetching metadata with invalid label...\n"; status = A2.fetchRemoteMD("INVALID_AGENT", &fetch_params); - assert(status == NIXL_SUCCESS); + CHECK_NIXL_ERROR_AGENT(status, "Failed to fetch remote MD", AGENT2_NAME); std::this_thread::sleep_for(std::chrono::seconds(1)); diff --git a/src/utils/common/util.h b/src/utils/common/util.h index bffa2efa1..27d26855d 100644 --- a/src/utils/common/util.h +++ b/src/utils/common/util.h @@ -21,4 +21,22 @@ #define CONCAT_0(a, b) a ## b #define UNIQUE_NAME(name) CONCAT(name, __COUNTER__) +#define CHECK_NIXL_ERROR_AGENT(result, message, agent) \ + do { \ + if (0 != result) { \ + std::cerr << "NIXL: " << message << " for agent " << agent \ + << " (Error code: " << result << ")" << std::endl; \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +#define CHECK_NIXL_ERROR(result, message) \ + do { \ + if (0 != result) { \ + std::cerr << "NIXL: " << message << " (Error code: " \ + << result << ")" << std::endl; \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + #endif /* UTIL_H */ diff --git a/test/nixl/agent_example.cpp b/test/nixl/agent_example.cpp index 2f5914790..9de8a9b95 100644 --- a/test/nixl/agent_example.cpp +++ b/test/nixl/agent_example.cpp @@ -15,21 +15,12 @@ * limitations under the License. */ #include -#include #include #include #include "nixl.h" - -#define CHECK_NIXL_ERROR(result, message, agent) \ - do { \ - if (0 != result) { \ - std::cerr << "NIXL: " << message << " for agent " << agent \ - << " (Error code: " << result << ")" << std::endl; \ - exit(EXIT_FAILURE); \ - } \ - } while (0) +#include "common/util.h" std::string agent1("Agent001"); std::string agent2("Agent002"); @@ -38,11 +29,7 @@ void check_buf(void* buf, size_t len) { // Do some checks on the data. 
for(size_t i = 0; iregisterMem(mem_list1, &extra_params1); - assert (status == NIXL_SUCCESS); - - CHECK_NIXL_ERROR(status, "Failed to register memory", agent1); + CHECK_NIXL_ERROR_AGENT(status, "Failed to register memory", agent1); status = A2->registerMem(mem_list2, &extra_params2); - assert (status == NIXL_SUCCESS); - - CHECK_NIXL_ERROR(status, "Failed to register memory", agent2); + CHECK_NIXL_ERROR_AGENT(status, "Failed to register memory", agent2); std::string meta2; status = A2->getLocalMD(meta2); - assert (status == NIXL_SUCCESS); - assert (meta2.size() > 0); + CHECK_NIXL_ERROR_AGENT(status, "Failed to get local MD", agent2); + CHECK_NIXL_ERROR_AGENT((meta2.size() <= 0), "Incorrect local MD", agent2); + - CHECK_NIXL_ERROR(status, "Failed to get local MD", agent2); std::string remote_name; status = A1->loadRemoteMD(meta2, remote_name); - assert (status == NIXL_SUCCESS); - assert (remote_name == agent2); - CHECK_NIXL_ERROR(status, "Failed to local remote MD", agent1); - CHECK_NIXL_ERROR((remote_name != agent2), "Incorrect remote MD received", agent1); + CHECK_NIXL_ERROR_AGENT(status, "Failed to local remote MD", agent1); + CHECK_NIXL_ERROR_AGENT((remote_name != agent2), "Incorrect remote MD received", agent1); std::cout << "perf setup done\n"; @@ -129,11 +110,9 @@ void test_side_perf(nixlAgent* A1, nixlAgent* A2, nixlBackendH* backend, nixlBac for(int i = 0; iprepXferDlist(agent2, dst_list, dst_side[i], &extra_params1); - assert (status == NIXL_SUCCESS); - CHECK_NIXL_ERROR(status, "Failed to prep Xfer Dlist for dest", agent1); + CHECK_NIXL_ERROR_AGENT(status, "Failed to prep Xfer Dlist for dest", agent1); status = A1->prepXferDlist(NIXL_INIT_AGENT, src_list, src_side[i], &extra_params1); - assert (status == NIXL_SUCCESS); - CHECK_NIXL_ERROR(status, "Failed to pre Xfer Dlist for src", agent1); + CHECK_NIXL_ERROR_AGENT(status, "Failed to pre Xfer Dlist for src", agent1); } gettimeofday(&end_time, NULL); @@ -158,8 +137,7 @@ void test_side_perf(nixlAgent* A1, 
nixlAgent* A2, nixlBackendH* backend, nixlBac extra_params1.notifMsg = "test"; extra_params1.hasNotif = true; status = A1->makeXferReq(NIXL_WRITE, src_side[0], indices, dst_side[0], indices, reqh1, &extra_params1); - assert (status == NIXL_SUCCESS); - CHECK_NIXL_ERROR(status, "Failed to make Xfer Req", agent1); + CHECK_NIXL_ERROR_AGENT(status, "Failed to make Xfer Req", agent1); indices.clear(); for(int i = 0; i<(n_mems*descs_per_mem); i+=2) @@ -167,29 +145,24 @@ void test_side_perf(nixlAgent* A1, nixlAgent* A2, nixlBackendH* backend, nixlBac //should print (n_mems*descs_per_mem/2) number of final descriptors status = A1->makeXferReq(NIXL_WRITE, src_side[0], indices, dst_side[0], indices, reqh2, &extra_params1); - assert (status == NIXL_SUCCESS); - CHECK_NIXL_ERROR(status, "Failed to make Xfer Req", agent1); + CHECK_NIXL_ERROR_AGENT(status, "Failed to make Xfer Req", agent1); status = A1->releaseXferReq(reqh1); - assert (status == NIXL_SUCCESS); - CHECK_NIXL_ERROR(status, "Failed to release Xfer Req", agent1); + CHECK_NIXL_ERROR_AGENT(status, "Failed to release Xfer Req", agent1); status = A1->releaseXferReq(reqh2); - assert (status == NIXL_SUCCESS); - CHECK_NIXL_ERROR(status, "Failed to release Xfer Req2", agent1); + CHECK_NIXL_ERROR_AGENT(status, "Failed to release Xfer Req2", agent1); // Commented out to test auto deregistration // status = A1->deregisterMem(mem_list1, &extra_params1); // assert (status == NIXL_SUCCESS); status = A2->deregisterMem(mem_list2, &extra_params2); - assert (status == NIXL_SUCCESS); - CHECK_NIXL_ERROR(status, "Failed to deregister memory", agent2); + CHECK_NIXL_ERROR_AGENT(status, "Failed to deregister memory", agent2); for(int i = 0; ireleasedDlistH(src_side[i]); - assert (status == NIXL_SUCCESS); - CHECK_NIXL_ERROR(status, "Failed to release Dlist handle", agent1); + CHECK_NIXL_ERROR_AGENT(status, "Failed to release src Dlist handle", agent1); status = A1->releasedDlistH(dst_side[i]); - assert (status == NIXL_SUCCESS); + 
CHECK_NIXL_ERROR_AGENT(status, "Failed to release dst Dlist handle", agent1); } free(src_buf); @@ -236,12 +209,9 @@ nixl_status_t partialMdTest(nixlAgent* A1, nixlAgent* A2, nixlBackendH* backend1 // Register memory for each update for (int update = 0; update < NUM_UPDATES; update++) { status = A1->registerMem(src_mem_lists[update], &extra_params1); - assert(status == NIXL_SUCCESS); - - CHECK_NIXL_ERROR(status, "Failed to register memory", agent1); + CHECK_NIXL_ERROR_AGENT(status, "Failed to register memory", agent1); status = A2->registerMem(dst_mem_lists[update], &extra_params2); - assert(status == NIXL_SUCCESS); - CHECK_NIXL_ERROR(status, "Failed to register memory", agent2); + CHECK_NIXL_ERROR_AGENT(status, "Failed to register memory", agent2); } // Test metadata update with only backends and empty descriptor list @@ -250,37 +220,30 @@ nixl_status_t partialMdTest(nixlAgent* A1, nixlAgent* A2, nixlBackendH* backend1 // Agent2 might have already been previously loaded. // Invalidate it just in case but don't care either way. 
status = A1->invalidateRemoteMD(agent2); - CHECK_NIXL_ERROR(status, "Failed to invalidate remote MD", agent1); + CHECK_NIXL_ERROR_AGENT(status, "Failed to invalidate remote MD", agent1); nixl_reg_dlist_t empty_dlist(DRAM_SEG); std::string partial_meta; status = A2->getLocalPartialMD(empty_dlist, partial_meta, NULL); - assert(status == NIXL_SUCCESS); - assert(partial_meta.size() > 0); - CHECK_NIXL_ERROR(status, "Failed to get local partial MD", agent2); - CHECK_NIXL_ERROR((partial_meta.size() <= 0), "Incorrect local partial MD", agent2); + CHECK_NIXL_ERROR_AGENT(status, "Failed to get local partial MD", agent2); + CHECK_NIXL_ERROR_AGENT((partial_meta.size() <= 0), "Incorrect local partial MD", agent2); std::string remote_name; status = A1->loadRemoteMD(partial_meta, remote_name); - assert(status == NIXL_SUCCESS); - assert(remote_name == agent2); - CHECK_NIXL_ERROR(status, "Failed to get load remote MD", agent1); - CHECK_NIXL_ERROR((remote_name != agent2), "Incorrect remote MD", agent1); + CHECK_NIXL_ERROR_AGENT(status, "Failed to get load remote MD", agent1); + CHECK_NIXL_ERROR_AGENT((remote_name != agent2), "Incorrect remote MD", agent1); // Make sure unregistered descriptors are not updated for (int update = 0; update < NUM_UPDATES; update++) { nixlDlistH *dst_side; status = A1->prepXferDlist(agent2, dst_mem_lists[update].trim(), dst_side, &extra_params1); - assert(status != NIXL_SUCCESS); - assert(dst_side == nullptr); - CHECK_NIXL_ERROR((status == NIXL_SUCCESS), "Prep xfer dlist should not be successful", agent1); - CHECK_NIXL_ERROR((dst_side != nullptr), "Dst side is not null", agent1); + CHECK_NIXL_ERROR_AGENT((status == NIXL_SUCCESS), "Prep xfer dlist should not be successful", agent1); + CHECK_NIXL_ERROR_AGENT((dst_side != nullptr), "Dst side is not null", agent1); } // Invalidate remote agent metadata to make sure we received connection info status = A1->invalidateRemoteMD(agent2); - assert(status == NIXL_SUCCESS); - CHECK_NIXL_ERROR(status, "Failed to get 
invalidate remote MD", agent1); + CHECK_NIXL_ERROR_AGENT(status, "Failed to get invalidate remote MD", agent1); std::cout << "Metadata update - backends only completed\n"; // Main test loop - update metadata multiple times @@ -293,33 +256,25 @@ nixl_status_t partialMdTest(nixlAgent* A1, nixlAgent* A2, nixlBackendH* backend1 std::cout << "Metadata update #" << update << "\n"; // Get partial metadata from A2 status = A2->getLocalPartialMD(dst_mem_lists[update], partial_meta, &extra_params2); - assert(status == NIXL_SUCCESS); - assert(partial_meta.size() > 0); - CHECK_NIXL_ERROR(status, "Failed to get local partial MD", agent2); - CHECK_NIXL_ERROR((partial_meta.size() <= 0), "Incorrect local partial MD", agent2); + CHECK_NIXL_ERROR_AGENT(status, "Failed to get local partial MD", agent2); + CHECK_NIXL_ERROR_AGENT((partial_meta.size() <= 0), "Incorrect local partial MD", agent2); // Load the partial metadata into A1 std::string remote_name; status = A1->loadRemoteMD(partial_meta, remote_name); - assert(status == NIXL_SUCCESS); - assert(remote_name == agent2); - CHECK_NIXL_ERROR(status, "Failed to load remote MD", agent1); - CHECK_NIXL_ERROR((remote_name != agent2), "Incorrect remote MD", agent1); + CHECK_NIXL_ERROR_AGENT(status, "Failed to load remote MD", agent1); + CHECK_NIXL_ERROR_AGENT((remote_name != agent2), "Incorrect remote MD", agent1); // Make sure loaded descriptors are updated nixlDlistH *dst_side; status = A1->prepXferDlist(agent2, dst_mem_lists[update].trim(), dst_side, &extra_params1); - assert(status == NIXL_SUCCESS); - assert(dst_side != nullptr); - CHECK_NIXL_ERROR(status, "Failed to prep xfer dlist", agent1); - CHECK_NIXL_ERROR((dst_side == nullptr), "Dst side is null", agent1); + CHECK_NIXL_ERROR_AGENT(status, "Failed to prep xfer dlist", agent1); + CHECK_NIXL_ERROR_AGENT((dst_side == nullptr), "Dst side is null", agent1); // Make sure not-loaded descriptors are not updated for (int invalid_idx = update + 1; invalid_idx < NUM_UPDATES; invalid_idx++) 
{ status = A1->prepXferDlist(agent2, dst_mem_lists[invalid_idx].trim(), dst_side, &extra_params1); - assert(status != NIXL_SUCCESS); - assert(dst_side == nullptr); - CHECK_NIXL_ERROR(status, "Prep xfer dlist should not be successful", agent1); - CHECK_NIXL_ERROR((dst_side != nullptr), "Dst side is not null", agent1); + CHECK_NIXL_ERROR_AGENT((status == NIXL_SUCCESS), "Prep xfer dlist should not be successful", agent1); + CHECK_NIXL_ERROR_AGENT((dst_side != nullptr), "Dst side is not null", agent1); } std::cout << "Metadata update #" << update << " completed\n"; } @@ -341,13 +296,11 @@ nixl_status_t partialMdTest(nixlAgent* A1, nixlAgent* A2, nixlBackendH* backend1 nixlDlistH *src_side, *dst_side; status = A1->prepXferDlist(NIXL_INIT_AGENT, src_xfer_list, src_side, &extra_params1); - assert(status == NIXL_SUCCESS); - CHECK_NIXL_ERROR(status, "Failed to prep xfer dlist", agent1); - CHECK_NIXL_ERROR((src_side == nullptr), "Src side is null", agent1); + CHECK_NIXL_ERROR_AGENT(status, "Failed to prep xfer dlist", agent1); + CHECK_NIXL_ERROR_AGENT((src_side == nullptr), "Src side is null", agent1); status = A1->prepXferDlist(agent2, dst_xfer_list, dst_side, &extra_params1); - assert(status == NIXL_SUCCESS); - CHECK_NIXL_ERROR(status, "Failed to prep xfer dlist", agent1); - CHECK_NIXL_ERROR((dst_side == nullptr), "Dst side is null", agent1); + CHECK_NIXL_ERROR_AGENT(status, "Failed to prep xfer dlist", agent1); + CHECK_NIXL_ERROR_AGENT((dst_side == nullptr), "Dst side is null", agent1); std::cout << "Transfer preparation completed\n"; @@ -363,15 +316,13 @@ nixl_status_t partialMdTest(nixlAgent* A1, nixlAgent* A2, nixlBackendH* backend1 // Create and post the transfer request status = A1->makeXferReq(NIXL_WRITE, src_side, indices, dst_side, indices, req, &extra_params1); - assert(status == NIXL_SUCCESS); - CHECK_NIXL_ERROR(status, "Failed to make xfer req", agent1); + CHECK_NIXL_ERROR_AGENT(status, "Failed to make xfer req", agent1); nixl_status_t xfer_status = A1->postXferReq(req); // Wait 
for transfer completion while (xfer_status != NIXL_SUCCESS) { if (xfer_status != NIXL_SUCCESS) xfer_status = A1->getXferStatus(req); - assert (xfer_status >= 0); - CHECK_NIXL_ERROR(xfer_status, "Failed to get xfer status", agent1); + CHECK_NIXL_ERROR_AGENT((xfer_status < 0), "Failed to get xfer status", agent1); } // Verify transfer results @@ -385,22 +336,17 @@ nixl_status_t partialMdTest(nixlAgent* A1, nixlAgent* A2, nixlBackendH* backend1 // Cleanup status = A1->releaseXferReq(req); - assert(status == NIXL_SUCCESS); - CHECK_NIXL_ERROR(status, "Failed to release xfer req", agent1); + CHECK_NIXL_ERROR_AGENT(status, "Failed to release xfer req", agent1); status = A1->releasedDlistH(src_side); - assert(status == NIXL_SUCCESS); - CHECK_NIXL_ERROR(status, "Failed to release xfer dlist", agent1); + CHECK_NIXL_ERROR_AGENT(status, "Failed to release xfer dlist", agent1); status = A1->releasedDlistH(dst_side); - assert(status == NIXL_SUCCESS); - CHECK_NIXL_ERROR(status, "Failed to release xfer dlist", agent1); + CHECK_NIXL_ERROR_AGENT(status, "Failed to release xfer dlist", agent1); // Deregister memory for (int update = 0; update < NUM_UPDATES; update++) { status = A1->deregisterMem(src_mem_lists[update], &extra_params1); - assert(status == NIXL_SUCCESS); - CHECK_NIXL_ERROR(status, "Failed to deregister memory", agent1); + CHECK_NIXL_ERROR_AGENT(status, "Failed to deregister memory", agent1); status = A2->deregisterMem(dst_mem_lists[update], &extra_params2); - assert(status == NIXL_SUCCESS); - CHECK_NIXL_ERROR(status, "Failed to deregister memory", agent2); + CHECK_NIXL_ERROR_AGENT(status, "Failed to deregister memory", agent2); } // Free allocated memory @@ -425,11 +371,8 @@ nixl_status_t sideXferTest(nixlAgent* A1, nixlAgent* A2, nixlXferReqH* src_handl extra_params1.backends.push_back(src_backend); extra_params2.backends.push_back(dst_backend); - assert (status == NIXL_SUCCESS); - assert (src_backend); - - CHECK_NIXL_ERROR(status, "Failed to query xfer backend", agent1); - 
CHECK_NIXL_ERROR((src_backend == nullptr), "Incorrect src backend handle", agent1); + CHECK_NIXL_ERROR_AGENT(status, "Failed to query xfer backend", agent1); + CHECK_NIXL_ERROR_AGENT((src_backend == nullptr), "Incorrect src backend handle", agent1); std::cout << "Got backend\n"; @@ -464,35 +407,27 @@ nixl_status_t sideXferTest(nixlAgent* A1, nixlAgent* A2, nixlXferReqH* src_handl dst_list = mem_list2.trim(); status = A1->registerMem(mem_list1, &extra_params1); - assert (status == NIXL_SUCCESS); - CHECK_NIXL_ERROR(status, "Failed to register memory", agent1); + CHECK_NIXL_ERROR_AGENT(status, "Failed to register memory", agent1); status = A2->registerMem(mem_list2, &extra_params2); - assert (status == NIXL_SUCCESS); - CHECK_NIXL_ERROR(status, "Failed to register memory", agent2); + CHECK_NIXL_ERROR_AGENT(status, "Failed to register memory", agent2); std::string meta2; status = A2->getLocalMD(meta2); - assert (status == NIXL_SUCCESS); - assert (meta2.size() > 0); - CHECK_NIXL_ERROR(status, "Failed to get local MD", agent2); - CHECK_NIXL_ERROR((meta2.size() <= 0), "Incorrect local MD", agent2); + CHECK_NIXL_ERROR_AGENT(status, "Failed to get local MD", agent2); + CHECK_NIXL_ERROR_AGENT((meta2.size() <= 0), "Incorrect local MD", agent2); std::string remote_name; status = A1->loadRemoteMD(meta2, remote_name); - assert (status == NIXL_SUCCESS); - assert (remote_name == agent2); - CHECK_NIXL_ERROR(status, "Failed to load remote MD", agent1); - CHECK_NIXL_ERROR((remote_name != agent2), "Incorrect remote MD", agent1); + CHECK_NIXL_ERROR_AGENT(status, "Failed to load remote MD", agent1); + CHECK_NIXL_ERROR_AGENT((remote_name != agent2), "Incorrect remote MD", agent1); std::cout << "Ready to prepare side\n"; nixlDlistH *src_side, *dst_side; status = A1->prepXferDlist(NIXL_INIT_AGENT, src_list, src_side, &extra_params1); - assert (status == NIXL_SUCCESS); - CHECK_NIXL_ERROR(status, "Failed to prep xfer dlist", agent1); - CHECK_NIXL_ERROR((src_side == nullptr), "Src side is 
null", agent1); + CHECK_NIXL_ERROR_AGENT(status, "Failed to prep xfer dlist", agent1); + CHECK_NIXL_ERROR_AGENT((src_side == nullptr), "Src side is null", agent1); status = A1->prepXferDlist(remote_name, dst_list, dst_side, &extra_params1); - assert (status == NIXL_SUCCESS); - CHECK_NIXL_ERROR(status, "Failed to prep xfer dlist", agent1); - CHECK_NIXL_ERROR((dst_side == nullptr), "Dst side is null", agent1); + CHECK_NIXL_ERROR_AGENT(status, "Failed to prep xfer dlist", agent1); + CHECK_NIXL_ERROR_AGENT((dst_side == nullptr), "Dst side is null", agent1); std::cout << "prep done, starting transfers\n"; std::vector indices1, indices2; @@ -509,14 +444,12 @@ nixl_status_t sideXferTest(nixlAgent* A1, nixlAgent* A2, nixlXferReqH* src_handl //write first half of src_bufs to dst_bufs status = A1->makeXferReq(NIXL_WRITE, src_side, indices1, dst_side, indices1, req1, &extra_params1); - assert (status == NIXL_SUCCESS); - CHECK_NIXL_ERROR(status, "Failed to make xfer req", agent1); + CHECK_NIXL_ERROR_AGENT(status, "Failed to make xfer req", agent1); nixl_status_t xfer_status = A1->postXferReq(req1); while (xfer_status != NIXL_SUCCESS) { if (xfer_status != NIXL_SUCCESS) xfer_status = A1->getXferStatus(req1); - assert (xfer_status >= 0); - CHECK_NIXL_ERROR(xfer_status, "Failed to get xfer status", agent1); + CHECK_NIXL_ERROR_AGENT((xfer_status < 0), "Failed to get xfer status", agent1); } for(int i = 0; i<(n_bufs/2); i++) @@ -526,14 +459,12 @@ nixl_status_t sideXferTest(nixlAgent* A1, nixlAgent* A2, nixlXferReqH* src_handl //read first half of dst_bufs back to second half of src_bufs status = A1->makeXferReq(NIXL_READ, src_side, indices2, dst_side, indices1, req2, &extra_params1); - assert (status == NIXL_SUCCESS); - CHECK_NIXL_ERROR(status, "Failed to make xfer req", agent1); + CHECK_NIXL_ERROR_AGENT(status, "Failed to make xfer req", agent1); xfer_status = A1->postXferReq(req2); while (xfer_status != NIXL_SUCCESS) { if (xfer_status != NIXL_SUCCESS) xfer_status = 
A1->getXferStatus(req2); - assert (xfer_status >= 0); - CHECK_NIXL_ERROR(xfer_status, "Failed to get xfer status", agent1); + CHECK_NIXL_ERROR_AGENT((xfer_status < 0), "Failed to get xfer status", agent1); } for(int i = (n_bufs/2); imakeXferReq(NIXL_WRITE, src_side, indices2, dst_side, indices2, req3, &extra_params1); - assert (status == NIXL_SUCCESS); - CHECK_NIXL_ERROR(status, "Failed to make xfer req", agent1); + CHECK_NIXL_ERROR_AGENT(status, "Failed to make xfer req", agent1); xfer_status = A1->postXferReq(req3); while (xfer_status != NIXL_SUCCESS) { if (xfer_status != NIXL_SUCCESS) xfer_status = A1->getXferStatus(req3); - assert (xfer_status >= 0); - CHECK_NIXL_ERROR(xfer_status, "Failed to get xfer status", agent1); + CHECK_NIXL_ERROR_AGENT((xfer_status < 0), "Failed to get xfer status", agent1); } for(int i = (n_bufs/2); ireleaseXferReq(req1); - assert (status == NIXL_SUCCESS); - CHECK_NIXL_ERROR(status, "Failed to release xfer req", agent1); + CHECK_NIXL_ERROR_AGENT(status, "Failed to release xfer req", agent1); status = A1->releaseXferReq(req2); - assert (status == NIXL_SUCCESS); - CHECK_NIXL_ERROR(status, "Failed to release xfer req2", agent1); + CHECK_NIXL_ERROR_AGENT(status, "Failed to release xfer req2", agent1); status = A1->releaseXferReq(req3); - assert (status == NIXL_SUCCESS); - CHECK_NIXL_ERROR(status, "Failed to release xfer req3", agent1); - assert(status == NIXL_SUCCESS); + CHECK_NIXL_ERROR_AGENT(status, "Failed to release xfer req3", agent1); // Commented out to test auto deregistration // status = A1->deregisterMem(mem_list1, &extra_params1); @@ -576,11 +501,9 @@ nixl_status_t sideXferTest(nixlAgent* A1, nixlAgent* A2, nixlXferReqH* src_handl // assert (status == NIXL_SUCCESS); status = A1->releasedDlistH(src_side); - assert (status == NIXL_SUCCESS); - CHECK_NIXL_ERROR(status, "Failed to release xfer src dlist", agent1); + CHECK_NIXL_ERROR_AGENT(status, "Failed to release xfer src dlist", agent1); status = A1->releasedDlistH(dst_side); - assert (status 
== NIXL_SUCCESS); - CHECK_NIXL_ERROR(status, "Failed to release xfer dst dlist", agent1); + CHECK_NIXL_ERROR_AGENT(status, "Failed to release xfer dst dlist", agent1); for(int i = 0; i plugins; ret1 = A1.getAvailPlugins(plugins); - assert (ret1 == NIXL_SUCCESS); + CHECK_NIXL_ERROR_AGENT(ret1, "Failed to get available plugins", agent1); - CHECK_NIXL_ERROR(ret1, "Failed to get available plugins", agent1); std::cout << "Available plugins:\n"; for (nixl_backend_t b: plugins) std::cout << b << "\n"; +<<<<<<< HEAD std::cout << "Using backend: " << backend << "\n"; ret1 = A1.getPluginParams(backend, mems1, init1); ret2 = A2.getPluginParams(backend, mems2, init2); +======= + ret1 = A1.getPluginParams("UCX", mems1, init1); + ret2 = A2.getPluginParams("UCX", mems2, init2); + CHECK_NIXL_ERROR_AGENT(ret1, "Failed to get plugin params for UCX", agent1); + CHECK_NIXL_ERROR_AGENT(ret2, "Failed to get plugin params for UCX", agent2); +>>>>>>> 5638c578 (Address comments) - assert (ret1 == NIXL_SUCCESS); - assert (ret2 == NIXL_SUCCESS); - - CHECK_NIXL_ERROR(ret1, "Failed to get plugin params for UCX", agent1); - CHECK_NIXL_ERROR(ret2, "Failed to get plugin params for UCX", agent2); std::cout << "Params before init:\n"; printParams(init1, mems1); printParams(init2, mems2); @@ -663,6 +587,7 @@ main(int argc, char **argv) { ret2 = A2.createBackend(backend, init2, bknd2); nixl_opt_args_t extra_params1, extra_params2; +<<<<<<< HEAD extra_params1.backends.push_back(bknd1); extra_params2.backends.push_back(bknd2); @@ -676,6 +601,17 @@ main(int argc, char **argv) { assert (ret1 == NIXL_SUCCESS); assert (ret2 == NIXL_SUCCESS); +======= + extra_params1.backends.push_back(ucx1); + extra_params2.backends.push_back(ucx2); + CHECK_NIXL_ERROR_AGENT(ret1, "Failed to create UCX backend", agent1); + CHECK_NIXL_ERROR_AGENT(ret2, "Failed to create UCX backend", agent2); + + ret1 = A1.getBackendParams(ucx1, mems1, init1); + ret2 = A2.getBackendParams(ucx2, mems2, init2); + CHECK_NIXL_ERROR_AGENT(ret1, 
"Failed to get UCX backend params", agent1); + CHECK_NIXL_ERROR_AGENT(ret2, "Failed to get UCX backend params", agent2); +>>>>>>> 5638c578 (Address comments) std::cout << "Params after init:\n"; printParams(init1, mems1); @@ -685,9 +621,6 @@ main(int argc, char **argv) { // ret1 = A1->makeConnection(agent2, 0); // ret2 = A2->makeConnection(agent1, 1); - // assert (ret1 == NIXL_SUCCESS); - // assert (ret2 == NIXL_SUCCESS); - // User allocates memories, and passes the corresponding address // and length to register with the backend nixlBlobDesc buff1, buff2, buff3; @@ -720,34 +653,23 @@ main(int argc, char **argv) { ret1 = A1.registerMem(dlist1, &extra_params1); ret2 = A2.registerMem(dlist2, &extra_params2); - - assert (ret1 == NIXL_SUCCESS); - assert (ret2 == NIXL_SUCCESS); - - CHECK_NIXL_ERROR(ret1, "Failed to register memory", agent1); - CHECK_NIXL_ERROR(ret2, "Failed to register memory", agent2); + CHECK_NIXL_ERROR_AGENT(ret1, "Failed to register memory", agent1); + CHECK_NIXL_ERROR_AGENT(ret2, "Failed to register memory", agent2); std::string meta1; ret1 = A1.getLocalMD(meta1); std::string meta2; ret2 = A2.getLocalMD(meta2); - - assert (ret1 == NIXL_SUCCESS); - assert (ret2 == NIXL_SUCCESS); - - CHECK_NIXL_ERROR(ret1, "Failed to get local MD", agent1); - CHECK_NIXL_ERROR(ret2, "Failed to get local MD", agent2); + CHECK_NIXL_ERROR_AGENT(ret1, "Failed to get local MD", agent1); + CHECK_NIXL_ERROR_AGENT(ret2, "Failed to get local MD", agent2); std::cout << "Agent1's Metadata: " << meta1 << "\n"; std::cout << "Agent2's Metadata: " << meta2 << "\n"; ret1 = A1.loadRemoteMD (meta2, ret_s1); ret2 = A2.loadRemoteMD (meta1, ret_s2); - - CHECK_NIXL_ERROR(ret1, "Failed to load remote MD", agent1); - CHECK_NIXL_ERROR(ret2, "Failed to load remote MD", agent2); - assert (ret1 == NIXL_SUCCESS); - assert (ret2 == NIXL_SUCCESS); + CHECK_NIXL_ERROR_AGENT(ret1, "Failed to load remote MD", agent1); + CHECK_NIXL_ERROR_AGENT(ret2, "Failed to load remote MD", agent2); size_t req_size = 
8; size_t dst_offset = 8; @@ -779,8 +701,7 @@ main(int argc, char **argv) { extra_params1.notifMsg = "notification"; extra_params1.hasNotif = true; ret1 = A1.createXferReq(NIXL_WRITE, req_src_descs, req_dst_descs, agent2, req_handle, &extra_params1); - assert (ret1 == NIXL_SUCCESS); - CHECK_NIXL_ERROR(ret1, "Failed to create Xfer Req", agent1); + CHECK_NIXL_ERROR_AGENT(ret1, "Failed to create Xfer Req", agent1); nixl_status_t status = A1.postXferReq(req_handle); @@ -792,42 +713,34 @@ main(int argc, char **argv) { while (status != NIXL_SUCCESS || n_notifs == 0) { if (status != NIXL_SUCCESS) status = A1.getXferStatus(req_handle); if (n_notifs == 0) ret2 = A2.getNotifs(notif_map); - assert (status >= 0); - assert (ret2 == NIXL_SUCCESS); - CHECK_NIXL_ERROR((status < 0), "Failed to post Xfer Req", agent1); - CHECK_NIXL_ERROR(ret2, "Failed to get notifs", agent2); + CHECK_NIXL_ERROR_AGENT((status < 0), "Failed to post Xfer Req", agent1); + CHECK_NIXL_ERROR_AGENT(ret2, "Failed to get notifs", agent2); n_notifs = notif_map.size(); } std::vector agent1_notifs = notif_map[agent1]; - assert (agent1_notifs.size() == 1); - assert (agent1_notifs.front() == "notification"); - CHECK_NIXL_ERROR((agent1_notifs.size() != 1), "Incorrect notif size", agent1); - CHECK_NIXL_ERROR((agent1_notifs.front() != "notification"), "Incorrect notification", agent1); + CHECK_NIXL_ERROR_AGENT((agent1_notifs.size() != 1), "Incorrect notif size", agent1); + CHECK_NIXL_ERROR_AGENT((agent1_notifs.front() != "notification"), "Incorrect notification", agent1); notif_map[agent1].clear(); // Redundant, for testing notif_map.clear(); n_notifs = 0; std::cout << "Transfer verified\n"; - std::cout << "performing partialMdTest with backends " << bknd1 << " " << bknd2 << "\n"; - ret1 = partialMdTest(&A1, &A2, bknd1, bknd2); - assert (ret1 == NIXL_SUCCESS); - CHECK_NIXL_ERROR(ret1, "Fail to run partialMDTest", agent1); + std::cout << "performing partialMdTest with backends " << ucx1 << " " << ucx2 << "\n"; + ret1 
= partialMdTest(&A1, &A2, ucx1, ucx2); + CHECK_NIXL_ERROR_AGENT(ret1, "Fail to run partialMDTest", agent1); - std::cout << "performing sideXferTest with backends " << bknd1 << " " << bknd2 << "\n"; - ret1 = sideXferTest(&A1, &A2, req_handle, bknd2); - assert (ret1 == NIXL_SUCCESS); - CHECK_NIXL_ERROR(ret1, "Fail to run sideXferTest", agent1); + std::cout << "performing sideXferTest with backends " << ucx1 << " " << ucx2 << "\n"; + ret1 = sideXferTest(&A1, &A2, req_handle, ucx2); + CHECK_NIXL_ERROR_AGENT(ret1, "Fail to run sideXferTest", agent1); std::cout << "Performing local test\n"; extra_params1.notifMsg = "local_notif"; extra_params1.hasNotif = true; ret2 = A1.createXferReq(NIXL_WRITE, req_src_descs, req_ldst_descs, agent1, req_handle2, &extra_params1); - assert (ret2 == NIXL_SUCCESS); - - CHECK_NIXL_ERROR(ret1, "Failed to create Xfer Req", agent1); + CHECK_NIXL_ERROR_AGENT(ret2, "Failed to create Xfer Req", agent1); status = A1.postXferReq(req_handle2); std::cout << "Local transfer was posted\n"; @@ -835,46 +748,33 @@ main(int argc, char **argv) { while (status != NIXL_SUCCESS || n_notifs == 0) { if (status != NIXL_SUCCESS) status = A1.getXferStatus(req_handle2); if (n_notifs == 0) ret2 = A1.getNotifs(notif_map); - assert (status >= 0); - assert (ret2 == NIXL_SUCCESS); - CHECK_NIXL_ERROR((status < 0), "Failed to post Xfer Req", agent1); - CHECK_NIXL_ERROR(ret2, "Failed to get notifs", agent2); + CHECK_NIXL_ERROR_AGENT((status < 0), "Failed to post Xfer Req", agent1); + CHECK_NIXL_ERROR_AGENT(ret2, "Failed to get notifs", agent2); n_notifs = notif_map.size(); } agent1_notifs = notif_map[agent1]; - assert (agent1_notifs.size() == 1); - assert (agent1_notifs.front() == "local_notif"); - assert (equal_buf((void*) req_src.addr, (void*) req_ldst.addr, req_size) == true); - CHECK_NIXL_ERROR((agent1_notifs.size() != 1), "Incorrect notif size", agent1); - CHECK_NIXL_ERROR((agent1_notifs.front() != "local_notif"), "Incorrect notification", agent1); - 
CHECK_NIXL_ERROR((!equal_buf((void *)req_src.addr, (void *)req_ldst.addr, req_size) == true), + CHECK_NIXL_ERROR_AGENT((agent1_notifs.size() != 1), "Incorrect notif size", agent1); + CHECK_NIXL_ERROR_AGENT((agent1_notifs.front() != "local_notif"), "Incorrect notification", agent1); + CHECK_NIXL_ERROR_AGENT((!equal_buf((void *)req_src.addr, (void *)req_ldst.addr, req_size) == true), "Buffer mismatch after transfer", agent1); ret1 = A1.releaseXferReq(req_handle); ret2 = A1.releaseXferReq(req_handle2); + CHECK_NIXL_ERROR_AGENT(ret1, "Failed to release Xfer Req", agent1); + CHECK_NIXL_ERROR_AGENT(ret2, "Failed to release Xfer Req2", agent1); - assert (ret1 == NIXL_SUCCESS); - assert (ret2 == NIXL_SUCCESS); - - CHECK_NIXL_ERROR(ret1, "Failed to release Xfer Req", agent1); - CHECK_NIXL_ERROR(ret2, "Failed to release Xfer Req2", agent1); ret1 = A1.deregisterMem(dlist1, &extra_params1); ret2 = A2.deregisterMem(dlist2, &extra_params2); + CHECK_NIXL_ERROR_AGENT(ret1, "Failed to deregister memory", agent1); + CHECK_NIXL_ERROR_AGENT(ret2, "Failed to deregister memory", agent2); - assert (ret1 == NIXL_SUCCESS); - assert (ret2 == NIXL_SUCCESS); - - CHECK_NIXL_ERROR(ret1, "Failed to deregister memory", agent1); - CHECK_NIXL_ERROR(ret2, "Failed to deregister memory", agent2); //only initiator should call invalidate ret1 = A1.invalidateRemoteMD(agent2); //A2.invalidateRemoteMD(agent1); + CHECK_NIXL_ERROR_AGENT(ret1, "Failed to invalidate remote MD", agent1); - assert (ret1 == NIXL_SUCCESS); - - CHECK_NIXL_ERROR(ret1, "Failed to invalidate remote MD", agent1); free(addr1); free(addr2); free(addr3); diff --git a/test/nixl/nixl_test.cpp b/test/nixl/nixl_test.cpp index adb99393a..5f2eb609d 100644 --- a/test/nixl/nixl_test.cpp +++ b/test/nixl/nixl_test.cpp @@ -22,7 +22,7 @@ #include #include #include -#include +#include "common/util.h" #include "stream/metadata_stream.h" #include "serdes/serdes.h" #include @@ -84,12 +84,7 @@ static void targetThread(nixlAgent &agent, nixl_opt_args_t 
*extra_params, int th /** Only send desc list */ nixlSerDes serdes; nixl_status_t st = dram_for_ucx.trim().serialize(&serdes); - assert(st == NIXL_SUCCESS); - - if (st != NIXL_SUCCESS) { - std::cerr << "Failed to serialize registry dlist " << st << std::endl; - exit(EXIT_FAILURE); - } + CHECK_NIXL_ERROR(st, "Failed to serialize registry dlist"); std::cout << "Thread " << thread_id << " Wait for initiator and then send xfer descs\n"; std::string message = serdes.exportStr(); @@ -149,12 +144,7 @@ static void initiatorThread(nixlAgent &agent, nixl_opt_args_t *extra_params, nixl_notifs_t notifs; nixl_status_t ret = agent.getNotifs(notifs, extra_params); - assert(ret >= 0); - - if (ret < 0) { - std::cerr << "Failed to get notifs, status: " << ret << std::endl; - exit(EXIT_FAILURE); - } + CHECK_NIXL_ERROR((ret < 0), "Failed to get notifs"); if (notifs.size() > 0) { std::lock_guard lock(shared_state.mtx); @@ -204,12 +194,7 @@ static void initiatorThread(nixlAgent &agent, nixl_opt_args_t *extra_params, while (ret != NIXL_SUCCESS) { ret = agent.getXferStatus(treq); - assert(ret >= 0); - } - - if (ret != NIXL_SUCCESS) { - std::cerr << "Thread " << thread_id << " Error getting transfer status " << ret << "\n"; - exit(-1); + CHECK_NIXL_ERROR((ret < 0), "Failed to get transfer status"); } std::cout << "Thread " << thread_id << " Completed Sending Data using UCX backend\n"; diff --git a/test/unit/plugins/gpunetio/meson.build b/test/unit/plugins/gpunetio/meson.build index b6ad72fdb..ddaf928d3 100644 --- a/test/unit/plugins/gpunetio/meson.build +++ b/test/unit/plugins/gpunetio/meson.build @@ -19,6 +19,7 @@ if cuda_dep.found() cuda_dependencies = [cuda_dep] compile_flags += '-DHAVE_CUDA' nvtx_dep = nvcc.find_library('nvToolsExt', dirs: '/usr/local/cuda/lib64', required: false) + dl_dep = dependency('dl', required: true) if nvtx_dep.found() compile_flags += '-DUSE_NVTX' else @@ -26,7 +27,7 @@ if cuda_dep.found() endif nixl_gpunetio_stream_app = executable ('nixl_gpunetio_stream_test', 
'nixl_gpunetio_stream_test.cu', - dependencies: [nixl_dep, nixl_infra, stream_interface] + cuda_dep + nvtx_dep, + dependencies: [nixl_dep, nixl_infra, stream_interface] + cuda_dep + nvtx_dep + dl_dep, include_directories: [nixl_inc_dirs, utils_inc_dirs, '../../../../src/utils/serdes'], cpp_args: compile_flags, cuda_args: compile_flags, diff --git a/test/unit/plugins/gpunetio/nixl_gpunetio_stream_test.cu b/test/unit/plugins/gpunetio/nixl_gpunetio_stream_test.cu index 0b368e4af..afd6b18b2 100644 --- a/test/unit/plugins/gpunetio/nixl_gpunetio_stream_test.cu +++ b/test/unit/plugins/gpunetio/nixl_gpunetio_stream_test.cu @@ -21,7 +21,7 @@ #include #include #include -#include +#include "common/util.h" #include "stream/metadata_stream.h" #include "serdes/serdes.h" @@ -300,10 +300,10 @@ main (int argc, char *argv[]) { /** Register memory in both initiator and target */ ret = agent.registerMem (local_vram_rdlist, &extra_params); - assert (ret == NIXL_SUCCESS); + CHECK_NIXL_ERROR_AGENT(ret, "Failed to register memory", role); local_vram = local_vram_rdlist.trim(); ret = agent.getLocalMD(metadata); - assert(ret == NIXL_SUCCESS); + CHECK_NIXL_ERROR_AGENT(ret, "Failed to get local MD", role); std::cout << " Start Control Path metadata exchanges \n"; if (role == target) { @@ -316,11 +316,10 @@ main (int argc, char *argv[]) { std::cout << " Received checkRemoteMD from " << initiator << std::endl; data_address_ptr = (uintptr_t)data_address; - assert (serdes->addBuf ("BaseAddress", &data_address_ptr, sizeof (uintptr_t)) == - NIXL_SUCCESS); - assert (serdes->addBuf ("BufferSize", &buf_size, sizeof (size_t)) == NIXL_SUCCESS); - assert (serdes->addBuf ("BufferTransfer", &buf_num, sizeof (uint32_t)) == NIXL_SUCCESS); - assert (serdes->addStr ("AgentMD", metadata) == NIXL_SUCCESS); + CHECK_NIXL_ERROR_AGENT(serdes->addBuf ("BaseAddress", &data_address_ptr, sizeof (uintptr_t)), "Failed to add BaseAddress", role); + CHECK_NIXL_ERROR_AGENT(serdes->addBuf ("BufferSize", &buf_size, sizeof 
(size_t)), "Failed to add BufferSize", role); + CHECK_NIXL_ERROR_AGENT(serdes->addBuf ("BufferTransfer", &buf_num, sizeof (uint32_t)), "Failed to add BufferTransfer", role); + CHECK_NIXL_ERROR_AGENT(serdes->addStr ("AgentMD", metadata), "Failed to add AgentMD", role); std::string message = serdes->exportStr(); while (agent.genNotif (initiator, message, &extra_params) != NIXL_SUCCESS) ; @@ -414,9 +413,9 @@ main (int argc, char *argv[]) { md_extra_params.port = peer_port; ret = agent.fetchRemoteMD (target, &md_extra_params); - assert (ret == NIXL_SUCCESS); + CHECK_NIXL_ERROR_AGENT(ret, "Failed to fetch remote MD", role); ret = agent.sendLocalMD (&md_extra_params); - assert (ret == NIXL_SUCCESS); + CHECK_NIXL_ERROR_AGENT(ret, "Failed to send local MD", role); // Not used nixl_xfer_dlist_t descs (DRAM_SEG); std::cout << initiator << " waiting checkRemoteMD from " << target << std::endl; @@ -430,14 +429,11 @@ main (int argc, char *argv[]) { for (const auto &notif : notifs[target]) { remote_serdes->importStr (notif); - assert (remote_serdes->getBuf ("BaseAddress", &data_address_ptr, sizeof (uintptr_t)) == - NIXL_SUCCESS); - assert (remote_serdes->getBuf ("BufferSize", &buf_size, sizeof (size_t)) == - NIXL_SUCCESS); - assert (remote_serdes->getBuf ("BufferTransfer", &buf_num, sizeof (uint32_t)) == - NIXL_SUCCESS); + CHECK_NIXL_ERROR_AGENT(remote_serdes->getBuf ("BaseAddress", &data_address_ptr, sizeof (uintptr_t)), "Failed to get BaseAddress", role); + CHECK_NIXL_ERROR_AGENT(remote_serdes->getBuf ("BufferSize", &buf_size, sizeof (size_t)), "Failed to get BufferSize", role); + CHECK_NIXL_ERROR_AGENT(remote_serdes->getBuf ("BufferTransfer", &buf_num, sizeof (uint32_t)), "Failed to get BufferTransfer", role); remote_metadata = remote_serdes->getStr ("AgentMD"); - assert (remote_metadata != ""); + CHECK_NIXL_ERROR_AGENT((remote_metadata == ""), "Failed to get AgentMD", role); agent.loadRemoteMD (remote_metadata, target); } notifs.clear(); @@ -497,14 +493,15 @@ main (int argc, 
char *argv[]) { std::cout << "Post the request with GPUNETIO backend transfer 1" << std::endl; PUSH_RANGE ("postXferReq", 3) status = agent.postXferReq (treq); - assert (status == NIXL_IN_PROG); + CHECK_NIXL_ERROR_AGENT(!(status == NIXL_SUCCESS || status == NIXL_IN_PROG), "Failed to post Xfer Req", role); + POP_RANGE std::cout << "Waiting for completion to re-use buffers\n"; PUSH_RANGE ("getXferStatus", 4) while (status != NIXL_SUCCESS) { status = agent.getXferStatus (treq); - assert (status == NIXL_SUCCESS || status == NIXL_IN_PROG); + CHECK_NIXL_ERROR_AGENT(!(status == NIXL_SUCCESS || status == NIXL_IN_PROG), "Failed to get Xfer Status", role); } POP_RANGE // No need for cudaStreamSyncronize as CUDA kernel and Xfer are on the same stream @@ -533,14 +530,14 @@ main (int argc, char *argv[]) { std::cout << "Post the request with GPUNETIO backend transfer 2" << std::endl; PUSH_RANGE ("postXferReq", 3) status = agent.postXferReq (treq); - assert (status >= NIXL_SUCCESS); + CHECK_NIXL_ERROR_AGENT(status, "Failed to post Xfer Req", role); POP_RANGE std::cout << "Waiting for completion\n"; PUSH_RANGE ("getXferStatus", 4) while (status != NIXL_SUCCESS) { status = agent.getXferStatus (treq); - assert (status >= NIXL_SUCCESS); + CHECK_NIXL_ERROR_AGENT(!(status == NIXL_SUCCESS || status == NIXL_IN_PROG), "Failed to get Xfer Status", role); } POP_RANGE } else { @@ -553,14 +550,14 @@ main (int argc, char *argv[]) { std::cout << "Post the request with GPUNETIO backend transfer 1" << std::endl; PUSH_RANGE ("postXferReq", 3) status = agent.postXferReq (treq); - assert (status >= NIXL_SUCCESS); + CHECK_NIXL_ERROR_AGENT(status, "Failed to post Xfer Req", role); POP_RANGE std::cout << "Waiting for completion\n"; PUSH_RANGE ("getXferStatus", 4) while (status != NIXL_SUCCESS) { status = agent.getXferStatus (treq); - assert (status >= NIXL_SUCCESS); + CHECK_NIXL_ERROR_AGENT(!(status == NIXL_SUCCESS || status == NIXL_IN_PROG), "Failed to get Xfer Status", role); } POP_RANGE @@ -589,14 
+586,14 @@ main (int argc, char *argv[]) { std::cout << "Post the request with GPUNETIO backend transfer 2" << std::endl; PUSH_RANGE ("postXferReq", 3) status = agent.postXferReq (treq); - assert (status >= NIXL_SUCCESS); + CHECK_NIXL_ERROR_AGENT(status, "Failed to post Xfer Req", role); POP_RANGE std::cout << "Waiting for completion\n"; PUSH_RANGE ("getXferStatus", 4) while (status != NIXL_SUCCESS) { status = agent.getXferStatus (treq); - assert (status >= NIXL_SUCCESS); + CHECK_NIXL_ERROR_AGENT(status, "Failed to get Xfer Status", role); } POP_RANGE } diff --git a/test/unit/plugins/ucx/ucx_backend_multi.cpp b/test/unit/plugins/ucx/ucx_backend_multi.cpp index e00bc047b..32e387875 100644 --- a/test/unit/plugins/ucx/ucx_backend_multi.cpp +++ b/test/unit/plugins/ucx/ucx_backend_multi.cpp @@ -15,19 +15,11 @@ * limitations under the License. */ #include -#include #include #include "ucx_backend.h" +#include "common/util.h" -#define CHECK_NIXL_ERROR(result, message, agent) \ - do { \ - if (0 != result) { \ - std::cerr << "NIXL: " << message << " for agent " << agent \ - << " (Error code: " << result << ")" << std::endl; \ - exit(EXIT_FAILURE); \ - } \ - } while (0) // Temporarily while fixing CI/CD pipeline #define USE_PTHREAD false @@ -69,15 +61,13 @@ void test_thread(int id) while(!ready[!id]); ret = ucx->loadRemoteConnInfo(other, conn_info[!id]); - assert(ret == NIXL_SUCCESS); - CHECK_NIXL_ERROR(ret, "Failed to load remote conn info", my_name); + CHECK_NIXL_ERROR_AGENT(ret, "Failed to load remote conn info", my_name); //one-sided connect if(!id) ret = ucx->connect(other); - assert(ret == NIXL_SUCCESS); - CHECK_NIXL_ERROR(ret, "Failed to connect", my_name); + CHECK_NIXL_ERROR_AGENT(ret, "Failed to connect", my_name); done[id] = true; while(!done[!id]) diff --git a/test/unit/plugins/ucx/ucx_backend_test.cpp b/test/unit/plugins/ucx/ucx_backend_test.cpp index 509096107..52eb9a2dd 100644 --- a/test/unit/plugins/ucx/ucx_backend_test.cpp +++ 
b/test/unit/plugins/ucx/ucx_backend_test.cpp @@ -17,20 +17,12 @@ #include #include #include -#include #include "ucx_backend.h" +#include "common/util.h" using namespace std; -#define CHECK_NIXL_ERROR(result, message) \ - do { \ - if (0 != result) { \ - std::cerr << "NIXL: " << message << " (Error code: " << result << ")" << std::endl; \ - exit(EXIT_FAILURE); \ - } \ - } while (0) - #ifdef HAVE_CUDA @@ -72,7 +64,7 @@ class testHndlIterator { ~testHndlIterator() { /* Make sure that handler was released */ - assert(!set); + CHECK_NIXL_ERROR(set, "Handler was not released"); } bool needPrep() { @@ -96,7 +88,7 @@ class testHndlIterator { void setHandle(nixlBackendReqH *_handle) { - assert(!set); + CHECK_NIXL_ERROR(set, "Handler was not released"); handle = _handle; set = true; if (reuse) { @@ -105,12 +97,12 @@ class testHndlIterator { } void unsetHandle() { - assert(set); + CHECK_NIXL_ERROR(!set, "Handler was not set"); set = false; } nixlBackendReqH *&getHandle() { - assert(set); + CHECK_NIXL_ERROR(!set, "Handler was not set"); return handle; } }; @@ -127,11 +119,7 @@ createEngine(std::string name, bool p_thread) { init.type = "UCX"; auto ucx = nixlUcxEngine::create(init).release(); - assert(!ucx->getInitErr()); - if (ucx->getInitErr()) { - std::cout << "Failed to initialize worker1" << std::endl; - exit(1); - } + CHECK_NIXL_ERROR(ucx->getInitErr(), "Failed to initialize worker1"); return ucx; } @@ -153,7 +141,6 @@ std::string memType2Str(nixl_mem_t mem_type) case FILE_SEG: return std::string("FILE"); default: - assert(0); CHECK_NIXL_ERROR(1, "Unsupported memory type!"); } } @@ -212,10 +199,8 @@ void allocateBuffer(nixl_mem_t mem_type, int dev_id, size_t len, void* &addr) } #endif default: - assert(0); CHECK_NIXL_ERROR(1, "Unsupported memory type!"); } - assert(addr); CHECK_NIXL_ERROR((addr == nullptr), "Failed to allocate buffer"); } @@ -232,7 +217,6 @@ void releaseBuffer(nixl_mem_t mem_type, int dev_id, void* &addr) break; #endif default: - assert(0); 
CHECK_NIXL_ERROR(1, "Unsupported memory type!"); } } @@ -250,7 +234,6 @@ void doMemset(nixl_mem_t mem_type, int dev_id, void *addr, char byte, size_t len break; #endif default: - assert(0); CHECK_NIXL_ERROR(1, "Unsupported memory type!"); } } @@ -269,7 +252,6 @@ void *getValidationPtr(nixl_mem_t mem_type, void *addr, size_t len) } #endif default: - assert(0); CHECK_NIXL_ERROR(1, "Unsupported memory type!"); } } @@ -285,7 +267,6 @@ void *releaseValidationPtr(nixl_mem_t mem_type, void *addr) break; #endif default: - assert(0); CHECK_NIXL_ERROR(1, "Unsupported memory type!"); } return NULL; @@ -304,8 +285,7 @@ allocateWrongGPUTest(nixlUcxEngine *ucx, int dev_id) { int ret = ucx->registerMem(desc, VRAM_SEG, md); - assert(ret == NIXL_ERR_NOT_SUPPORTED); - CHECK_NIXL_ERROR((ret != NIXL_ERR_NOT_SUPPORTED), "Failed to register memory"); + CHECK_NIXL_ERROR_AGENT(ret, "Failed to register memory", "test"); releaseBuffer(VRAM_SEG, dev_id, buf); } @@ -327,7 +307,6 @@ allocateAndRegister(nixlUcxEngine *ucx, int ret = ucx->registerMem(desc, mem_type, md); - assert(ret == NIXL_SUCCESS); CHECK_NIXL_ERROR(ret, "Failed to register memory"); } @@ -356,12 +335,10 @@ loadRemote(nixlUcxEngine *ucx, info.devId = dev_id; ucx->getPublicData(lmd, info.metaInfo); - assert(info.metaInfo.size() > 0); CHECK_NIXL_ERROR((info.metaInfo.size() == 0), "Failed to get public data"); // We get the data from the cetnral location and populate the backend, and receive remote_meta int ret = ucx->loadRemoteMD (info, mem_type, agent, rmd); - assert(NIXL_SUCCESS == ret); CHECK_NIXL_ERROR(ret, "Failed to load remote MD"); } @@ -425,13 +402,11 @@ performTransfer(nixlUcxEngine *ucx1, if (hiter.needPrep()) { nixlBackendReqH *new_handle = nullptr; ret3 = ucx1->prepXfer(op, req_src_descs, req_dst_descs, remote_agent, new_handle, &opt_args); - assert(ret3 == NIXL_SUCCESS); CHECK_NIXL_ERROR(ret3, "Failed to prep xfer"); hiter.setHandle(new_handle); } nixlBackendReqH *&handle = hiter.getHandle(); ret3 = 
ucx1->postXfer(op, req_src_descs, req_dst_descs, remote_agent, handle, &opt_args); - assert( ret3 == NIXL_SUCCESS || ret3 == NIXL_IN_PROG); CHECK_NIXL_ERROR(ret3, "Failed to post xfer"); CHECK_NIXL_ERROR(!((ret3 == NIXL_SUCCESS) || (ret3 == NIXL_IN_PROG)), "Failed to post xfer"); @@ -445,7 +420,6 @@ performTransfer(nixlUcxEngine *ucx1, if(progress){ ucx2->progress(); } - assert( ret3 == NIXL_SUCCESS || ret3 == NIXL_IN_PROG); CHECK_NIXL_ERROR(!((ret3 == NIXL_SUCCESS) || (ret3 == NIXL_IN_PROG)), "Failed to check xfer"); } @@ -469,14 +443,10 @@ performTransfer(nixlUcxEngine *ucx1, if(progress){ ucx1->progress(); } - assert(ret3 == NIXL_SUCCESS); CHECK_NIXL_ERROR(ret3, "Failed to get notifs"); } - assert(ret2 == 1); - - assert(target_notifs.front().first == "Agent1"); - assert(target_notifs.front().second == test_str); + CHECK_NIXL_ERROR((ret2 != 1), "Incorrect number of target notifs"); CHECK_NIXL_ERROR((target_notifs.front().first != "Agent1"), "Incorrect front notif source"); CHECK_NIXL_ERROR((target_notifs.front().second != test_str), "Incorrect front notif message"); @@ -491,7 +461,6 @@ performTransfer(nixlUcxEngine *ucx1, // Perform correctness check. 
for(size_t i = 0; i < len; i++){ - assert( ((uint8_t*) chkptr1)[i] == ((uint8_t*) chkptr2)[i]); CHECK_NIXL_ERROR((((uint8_t *)chkptr1)[i] != ((uint8_t *)chkptr2)[i]), "Data mismatch"); } @@ -517,15 +486,13 @@ test_intra_agent_transfer(bool p_thread, nixlUcxEngine *ucx, nixl_mem_t mem_type int iter = 10; - assert(ucx->supportsLocal()); + CHECK_NIXL_ERROR(!ucx->supportsLocal(), "Failed to get conn info"); //connection info is still a string std::string conn_info1; ret1 = ucx->getConnInfo(conn_info1); - assert(ret1 == NIXL_SUCCESS); CHECK_NIXL_ERROR(ret1, "Failed to get conn info"); ret1 = ucx->loadRemoteConnInfo (agent1, conn_info1); - assert(ret1 == NIXL_SUCCESS); CHECK_NIXL_ERROR(ret1, "Failed to load remote conn info"); std::cout << "Local connection complete\n"; @@ -544,7 +511,6 @@ test_intra_agent_transfer(bool p_thread, nixlUcxEngine *ucx, nixl_mem_t mem_type //string descs unnecessary, convert meta locally nixlBackendMD* rmd2; ret1 = ucx->loadLocalMD (lmd2, rmd2); - assert(ret1 == NIXL_SUCCESS); CHECK_NIXL_ERROR(ret1, "Failed to load local MD"); nixl_meta_dlist_t req_src_descs (mem_type); populateDescs(req_src_descs, 0, addr1, desc_cnt, desc_size, lmd1); @@ -610,15 +576,12 @@ test_inter_agent_transfer(bool p_thread, // location and ask for it for a remote node std::string conn_info1, conn_info2; ret = ucx1->getConnInfo(conn_info1); - assert(ret == NIXL_SUCCESS); CHECK_NIXL_ERROR(ret, "Failed to get conn info"); ret = ucx2->getConnInfo(conn_info2); - assert(ret == NIXL_SUCCESS); CHECK_NIXL_ERROR(ret, "Failed to get conn info"); // We assumed we put them to central location and now receiving it on the other process ret = ucx1->loadRemoteConnInfo (agent2, conn_info2); - assert(ret == NIXL_SUCCESS); CHECK_NIXL_ERROR(ret, "Failed to load remote conn info"); // TODO: Causes race condition - investigate conn management implementation @@ -690,17 +653,12 @@ test_inter_agent_transfer(bool p_thread, while(ret == 0){ ret2 = ucx2->getNotifs(target_notifs); ret = 
target_notifs.size(); - assert(ret2 == NIXL_SUCCESS); CHECK_NIXL_ERROR(ret2, "Failed to get notifs"); } - assert(ret == 1); - - assert(target_notifs.front().first == "Agent1"); + CHECK_NIXL_ERROR((ret != 1), "Incorrect number of target notifs"); CHECK_NIXL_ERROR((target_notifs.front().first != "Agent1"), "Incorrect front notif source"); - assert(target_notifs.front().second == test_str); - CHECK_NIXL_ERROR((target_notifs.front().second != test_str), - "Incorrect front notif message"); + CHECK_NIXL_ERROR((target_notifs.front().second != test_str), "Incorrect front notif message"); cout << "OK" << endl; } diff --git a/test/unit/plugins/ucx_mo/ucx_mo_backend_test.cpp b/test/unit/plugins/ucx_mo/ucx_mo_backend_test.cpp index c4cb6080e..f7a646b2f 100644 --- a/test/unit/plugins/ucx_mo/ucx_mo_backend_test.cpp +++ b/test/unit/plugins/ucx_mo/ucx_mo_backend_test.cpp @@ -17,19 +17,12 @@ #include #include #include -#include #include "ucx_mo_backend.h" +#include "common/util.h" using namespace std; -#define CHECK_NIXL_ERROR(result, message) \ - do { \ - if (0 != result) { \ - std::cerr << "NIXL: " << message << " (Error code: " << result << ")" << std::endl; \ - exit(EXIT_FAILURE); \ - } \ - } while (0) #ifdef HAVE_CUDA @@ -74,7 +67,6 @@ std::string memType2Str(nixl_mem_t mem_type) case FILE_SEG: return std::string("FILE"); default: - assert(0); CHECK_NIXL_ERROR(1, "Unsupported memory type!"); } } @@ -93,7 +85,6 @@ nixlBackendEngine *createEngine(std::string name, uint32_t ndev, bool p_thread) init.type = "UCX_MO"; ucx_mo = (nixlBackendEngine*) new nixlUcxMoEngine (&init); - assert(!ucx_mo->getInitErr()); CHECK_NIXL_ERROR(ucx_mo->getInitErr(), "Failed to initialize worker1"); if (ucx_mo->getInitErr()) { std::cout << "Failed to initialize worker1" << std::endl; @@ -164,10 +155,8 @@ void allocateBuffer(nixl_mem_t mem_type, int dev_id, size_t len, void* &addr) } #endif default: - assert(0); CHECK_NIXL_ERROR(1, "Unsupported memory type!"); } - assert(addr); CHECK_NIXL_ERROR((addr == 
nullptr), "Failed to allocate buffer"); } @@ -184,7 +173,6 @@ void releaseBuffer(nixl_mem_t mem_type, int dev_id, void* &addr) break; #endif default: - assert(0); CHECK_NIXL_ERROR(1, "Unsupported memory type!"); } } @@ -202,7 +190,6 @@ void doMemset(nixl_mem_t mem_type, int dev_id, void *addr, char byte, size_t len break; #endif default: - assert(0); CHECK_NIXL_ERROR(1, "Unsupported memory type!"); } } @@ -221,7 +208,6 @@ void *getValidationPtr(nixl_mem_t mem_type, void *addr, size_t len) } #endif default: - assert(0); CHECK_NIXL_ERROR(1, "Unsupported memory type!"); } } @@ -237,7 +223,6 @@ void *releaseValidationPtr(nixl_mem_t mem_type, void *addr) break; #endif default: - assert(0); CHECK_NIXL_ERROR(1, "Unsupported memory type!"); } return NULL; @@ -281,7 +266,6 @@ void createLocalDescs(nixlBackendEngine *ucx, nixl_meta_dlist_t &descs, *((nixlBasicDesc*)&desc_s) = desc; *((nixlBasicDesc*)&desc_m) = desc; int ret = ucx->registerMem(desc_s, descs.getType(), desc_m.metadataP); - assert(ret == NIXL_SUCCESS); CHECK_NIXL_ERROR(ret, "Failed to register ucx memory"); descs.addDesc(desc_m); } @@ -322,12 +306,10 @@ void createRemoteDescs(nixlBackendEngine *src_ucx, status = dst_ucx->loadLocalMD(src_descs[i].metadataP, desc_m.metadataP); } else { status = src_ucx->getPublicData(src_descs[i].metadataP, desc_s.metaInfo); - assert(NIXL_SUCCESS == status); CHECK_NIXL_ERROR(status, "Failed to get src_ucx public data"); status = dst_ucx->loadRemoteMD (desc_s, src_descs.getType(), agent, desc_m.metadataP); } - assert(status == NIXL_SUCCESS); CHECK_NIXL_ERROR(status, "Failed to load dst_ucx remote MD"); dst_descs.addDesc(desc_m); } @@ -339,7 +321,6 @@ void destroyRemoteDescs(nixlBackendEngine *dst_ucx, nixl_status_t status; for(int i = 0; i < dst_descs.descCount(); i++) { status = dst_ucx->unloadMD (dst_descs[i].metadataP); - assert(status == NIXL_SUCCESS); CHECK_NIXL_ERROR(status, "Failed to unload dst_ucx MD"); } @@ -371,10 +352,8 @@ void performTransfer(nixlBackendEngine *ucx1, 
nixlBackendEngine *ucx2, // or an ID that later can be used to check the status as a new method // Also maybe we would remove the WRITE and let the backend class decide the op status = ucx1->prepXfer(op, req_src_descs, req_dst_descs, remote_agent, handle, &opt_args); - assert(status == NIXL_SUCCESS); CHECK_NIXL_ERROR(status, "Failed to prep ucx1 xfer"); status = ucx1->postXfer(op, req_src_descs, req_dst_descs, remote_agent, handle, &opt_args); - assert(status == NIXL_SUCCESS || status == NIXL_IN_PROG); CHECK_NIXL_ERROR(status, "Failed to post ucx1 xfer"); @@ -388,7 +367,6 @@ void performTransfer(nixlBackendEngine *ucx1, nixlBackendEngine *ucx2, if(progress){ ((nixlUcxMoEngine *)ucx2)->progress(); } - assert( (NIXL_SUCCESS == status) || (NIXL_IN_PROG == status) ); CHECK_NIXL_ERROR(!((NIXL_SUCCESS == status) || (NIXL_IN_PROG == status)), "Failed to check ucx1 xfer"); } @@ -403,18 +381,14 @@ void performTransfer(nixlBackendEngine *ucx1, nixlBackendEngine *ucx2, while(!target_notifs.size()){ status = ucx2->getNotifs(target_notifs); - assert(NIXL_SUCCESS == status); CHECK_NIXL_ERROR(status, "Failed to get ucx2 notifs"); if(progress){ ((nixlUcxMoEngine *)ucx1)->progress(); } } - assert(target_notifs.size() == 1); CHECK_NIXL_ERROR((target_notifs.size() != 1), "Incorrect number of target notifs"); - assert(target_notifs.front().first == "Agent1"); CHECK_NIXL_ERROR((target_notifs.front().first != "Agent1"), "Incorrect front notif source"); - assert(target_notifs.front().second == test_str); CHECK_NIXL_ERROR((target_notifs.front().second != test_str), "Incorrect front notif message"); @@ -423,11 +397,10 @@ void performTransfer(nixlBackendEngine *ucx1, nixlBackendEngine *ucx2, cout << "\t\tData verification: " << flush; - assert(req_src_descs.descCount() == req_dst_descs.descCount()); + CHECK_NIXL_ERROR((req_src_descs.descCount() != req_dst_descs.descCount()), "Data length mismatch"); for(int i = 0; i < req_src_descs.descCount(); i++) { auto sdesc = req_src_descs[i]; auto 
ddesc = req_dst_descs[i]; - assert(sdesc.len == ddesc.len); CHECK_NIXL_ERROR((sdesc.len != ddesc.len), "Data length mismatch"); size_t len = ddesc.len; chkptr1 = getValidationPtr(req_src_descs.getType(), (void*)sdesc.addr, len); @@ -435,7 +408,6 @@ void performTransfer(nixlBackendEngine *ucx1, nixlBackendEngine *ucx2, // Perform correctness check. for(size_t i = 0; i < len; i++){ - assert( ((uint8_t*) chkptr1)[i] == ((uint8_t*) chkptr2)[i]); CHECK_NIXL_ERROR((((uint8_t *)chkptr1)[i] != ((uint8_t *)chkptr2)[i]), "Data mismatch"); } @@ -477,18 +449,15 @@ void test_agent_transfer(bool p_thread, // location and ask for it for a remote node std::string conn_info1; status = ucx1->getConnInfo(conn_info1); - assert(NIXL_SUCCESS == status); CHECK_NIXL_ERROR(status, "Failed to get ucx1 conn info"); std::string conn_info2; status = ucx2->getConnInfo(conn_info2); - assert(NIXL_SUCCESS == status); CHECK_NIXL_ERROR(status, "Failed to get ucx2 conn info"); // We assumed we put them to central location and now receiving it on the other process if (is_local) { agent = &agent1; } status = ucx1->loadRemoteConnInfo (*agent, conn_info2); - assert(NIXL_SUCCESS == status); CHECK_NIXL_ERROR(status, "Failed to load ucx1 remote conn info"); // TODO: Causes race condition - investigate conn management implementation // ret = ucx2->loadRemoteConnInfo (agent1, conn_info1); @@ -550,18 +519,15 @@ void test_agent_transfer(bool p_thread, while(target_notifs.size() == 0){ status = ucx2->getNotifs(target_notifs); - assert(NIXL_SUCCESS == status); + CHECK_NIXL_ERROR(status, "Failed to get ucx2 notifs"); if (!p_thread) { /* progress UCX1 as well */ ((nixlUcxMoEngine *)ucx1)->progress(); } } - assert(target_notifs.size() == 1); CHECK_NIXL_ERROR((target_notifs.size() != 1), "Incorrect number of target notifs"); - assert(target_notifs.front().first == "Agent1"); CHECK_NIXL_ERROR((target_notifs.front().first != "Agent1"), "Incorrect front notif source"); - assert(target_notifs.front().second == test_str); 
CHECK_NIXL_ERROR((target_notifs.front().second != test_str), "Incorrect front notif message"); diff --git a/test/unit/utils/common/map_perf.cpp b/test/unit/utils/common/map_perf.cpp index 44d3be2aa..308dd5473 100644 --- a/test/unit/utils/common/map_perf.cpp +++ b/test/unit/utils/common/map_perf.cpp @@ -15,7 +15,6 @@ * limitations under the License. */ #include -#include #include #include #include @@ -24,15 +23,7 @@ #include #include "common/str_tools.h" - -#define CHECK_NIXL_ERROR(result, message, agent) \ - do { \ - if (0 != result) { \ - std::cerr << "NIXL: " << message << " for agent " << agent \ - << " (Error code: " << result << ")" << std::endl; \ - exit(EXIT_FAILURE); \ - } \ - } while (0) +#include "common/util.h" std::string generate_random_string(size_t length) { const std::string characters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; @@ -127,7 +118,7 @@ void test_comparison_perf(const int n_entries, const size_t str_len) { std::cout << "custom map lookup test, total time for " << n_iters << " iters: " << diff_time.tv_sec << "s " << diff_time.tv_usec << "us \n"; - assert(sum1 == sum2); + CHECK_NIXL_ERROR_AGENT((sum1 != sum2), "Test failed", "test"); gettimeofday(&start_time, NULL); for(int i = 0; i Date: Wed, 8 Oct 2025 02:07:26 -0700 Subject: [PATCH 04/17] meson: Add options for building tests and examples Signed-off-by: Adit Ranadive --- meson.build | 8 ++++++-- meson_options.txt | 2 ++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/meson.build b/meson.build index 658098ea3..2e20016f3 100644 --- a/meson.build +++ b/meson.build @@ -235,8 +235,12 @@ plugins_inc_dirs = include_directories('src/plugins') utils_inc_dirs = include_directories('src/utils') subdir('src') -subdir('test') -subdir('examples') +if get_option('build_tests') + subdir('test') +endif +if get_option('build_examples') + subdir('examples') +endif if get_option('install_headers') install_headers('src/api/cpp/nixl.h', install_dir: prefix_inc) diff 
--git a/meson_options.txt b/meson_options.txt index a316184f8..fc28c08f2 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -30,4 +30,6 @@ option('log_level', type: 'combo', choices: ['trace', 'debug', 'info', 'warning' option('rust', type: 'boolean', value: false, description: 'Build Rust bindings') # Tests +option('build_tests', type: 'boolean', value: true, description: 'Build all tests') +option('build_examples', type: 'boolean', value: true, description: 'Build all examples') option('test_all_plugins', type: 'boolean', value: false, description: 'Testing all plugins in addition to the mocks..') From 475c7817e07e9d7a58e5b5ab1e1865ebffb6f4e7 Mon Sep 17 00:00:00 2001 From: Adit Ranadive Date: Wed, 8 Oct 2025 02:14:31 -0700 Subject: [PATCH 05/17] Fix clang Signed-off-by: Adit Ranadive --- src/utils/common/util.h | 13 +++--- test/nixl/agent_example.cpp | 34 +++++++++----- .../gpunetio/nixl_gpunetio_stream_test.cu | 44 ++++++++++++++----- test/unit/plugins/ucx/ucx_backend_test.cpp | 16 ++++--- .../plugins/ucx_mo/ucx_mo_backend_test.cpp | 11 ++--- 5 files changed, 76 insertions(+), 42 deletions(-) diff --git a/src/utils/common/util.h b/src/utils/common/util.h index 27d26855d..b14f759dc 100644 --- a/src/utils/common/util.h +++ b/src/utils/common/util.h @@ -30,13 +30,12 @@ } \ } while (0) -#define CHECK_NIXL_ERROR(result, message) \ - do { \ - if (0 != result) { \ - std::cerr << "NIXL: " << message << " (Error code: " \ - << result << ")" << std::endl; \ - exit(EXIT_FAILURE); \ - } \ +#define CHECK_NIXL_ERROR(result, message) \ + do { \ + if (0 != result) { \ + std::cerr << "NIXL: " << message << " (Error code: " << result << ")" << std::endl; \ + exit(EXIT_FAILURE); \ + } \ } while (0) #endif /* UTIL_H */ diff --git a/test/nixl/agent_example.cpp b/test/nixl/agent_example.cpp index 9de8a9b95..221571f75 100644 --- a/test/nixl/agent_example.cpp +++ b/test/nixl/agent_example.cpp @@ -29,7 +29,7 @@ void check_buf(void* buf, size_t len) { // Do some checks on the 
data. for(size_t i = 0; iregisterMem(mem_list1, &extra_params1); CHECK_NIXL_ERROR_AGENT(status, "Failed to register memory", agent1); @@ -237,7 +243,8 @@ nixl_status_t partialMdTest(nixlAgent* A1, nixlAgent* A2, nixlBackendH* backend1 for (int update = 0; update < NUM_UPDATES; update++) { nixlDlistH *dst_side; status = A1->prepXferDlist(agent2, dst_mem_lists[update].trim(), dst_side, &extra_params1); - CHECK_NIXL_ERROR_AGENT((status == NIXL_SUCCESS), "Prep xfer dlist should not be successful", agent1); + CHECK_NIXL_ERROR_AGENT( + (status == NIXL_SUCCESS), "Prep xfer dlist should not be successful", agent1); CHECK_NIXL_ERROR_AGENT((dst_side != nullptr), "Dst side is not null", agent1); } @@ -721,7 +728,8 @@ main(int argc, char **argv) { std::vector agent1_notifs = notif_map[agent1]; CHECK_NIXL_ERROR_AGENT((agent1_notifs.size() != 1), "Incorrect notif size", agent1); - CHECK_NIXL_ERROR_AGENT((agent1_notifs.front() != "notification"), "Incorrect notification", agent1); + CHECK_NIXL_ERROR_AGENT( + (agent1_notifs.front() != "notification"), "Incorrect notification", agent1); notif_map[agent1].clear(); // Redundant, for testing notif_map.clear(); n_notifs = 0; @@ -740,7 +748,7 @@ main(int argc, char **argv) { extra_params1.notifMsg = "local_notif"; extra_params1.hasNotif = true; ret2 = A1.createXferReq(NIXL_WRITE, req_src_descs, req_ldst_descs, agent1, req_handle2, &extra_params1); - CHECK_NIXL_ERROR_AGENT (ret1, "Failed to create Xfer Req", agent1); + CHECK_NIXL_ERROR_AGENT(ret1, "Failed to create Xfer Req", agent1); status = A1.postXferReq(req_handle2); std::cout << "Local transfer was posted\n"; @@ -756,10 +764,12 @@ main(int argc, char **argv) { agent1_notifs = notif_map[agent1]; CHECK_NIXL_ERROR_AGENT((agent1_notifs.size() != 1), "Incorrect notif size", agent1); - CHECK_NIXL_ERROR_AGENT((agent1_notifs.front() != "local_notif"), "Incorrect notification", agent1); - CHECK_NIXL_ERROR_AGENT((!equal_buf((void *)req_src.addr, (void *)req_ldst.addr, req_size) == true), - 
"Buffer mismatch after transfer", - agent1); + CHECK_NIXL_ERROR_AGENT( + (agent1_notifs.front() != "local_notif"), "Incorrect notification", agent1); + CHECK_NIXL_ERROR_AGENT( + (!equal_buf((void *)req_src.addr, (void *)req_ldst.addr, req_size) == true), + "Buffer mismatch after transfer", + agent1); ret1 = A1.releaseXferReq(req_handle); ret2 = A1.releaseXferReq(req_handle2); CHECK_NIXL_ERROR_AGENT(ret1, "Failed to release Xfer Req", agent1); diff --git a/test/unit/plugins/gpunetio/nixl_gpunetio_stream_test.cu b/test/unit/plugins/gpunetio/nixl_gpunetio_stream_test.cu index afd6b18b2..619ae546b 100644 --- a/test/unit/plugins/gpunetio/nixl_gpunetio_stream_test.cu +++ b/test/unit/plugins/gpunetio/nixl_gpunetio_stream_test.cu @@ -316,10 +316,16 @@ main (int argc, char *argv[]) { std::cout << " Received checkRemoteMD from " << initiator << std::endl; data_address_ptr = (uintptr_t)data_address; - CHECK_NIXL_ERROR_AGENT(serdes->addBuf ("BaseAddress", &data_address_ptr, sizeof (uintptr_t)), "Failed to add BaseAddress", role); - CHECK_NIXL_ERROR_AGENT(serdes->addBuf ("BufferSize", &buf_size, sizeof (size_t)), "Failed to add BufferSize", role); - CHECK_NIXL_ERROR_AGENT(serdes->addBuf ("BufferTransfer", &buf_num, sizeof (uint32_t)), "Failed to add BufferTransfer", role); - CHECK_NIXL_ERROR_AGENT(serdes->addStr ("AgentMD", metadata), "Failed to add AgentMD", role); + CHECK_NIXL_ERROR_AGENT(serdes->addBuf("BaseAddress", &data_address_ptr, sizeof(uintptr_t)), + "Failed to add BaseAddress", + role); + CHECK_NIXL_ERROR_AGENT(serdes->addBuf("BufferSize", &buf_size, sizeof(size_t)), + "Failed to add BufferSize", + role); + CHECK_NIXL_ERROR_AGENT(serdes->addBuf("BufferTransfer", &buf_num, sizeof(uint32_t)), + "Failed to add BufferTransfer", + role); + CHECK_NIXL_ERROR_AGENT(serdes->addStr("AgentMD", metadata), "Failed to add AgentMD", role); std::string message = serdes->exportStr(); while (agent.genNotif (initiator, message, &extra_params) != NIXL_SUCCESS) ; @@ -429,9 +435,17 @@ 
main (int argc, char *argv[]) { for (const auto ¬if : notifs[target]) { remote_serdes->importStr (notif); - CHECK_NIXL_ERROR_AGENT(remote_serdes->getBuf ("BaseAddress", &data_address_ptr, sizeof (uintptr_t)), "Failed to get BaseAddress", role); - CHECK_NIXL_ERROR_AGENT(remote_serdes->getBuf ("BufferSize", &buf_size, sizeof (size_t)), "Failed to get BufferSize", role); - CHECK_NIXL_ERROR_AGENT(remote_serdes->getBuf ("BufferTransfer", &buf_num, sizeof (uint32_t)), "Failed to get BufferTransfer", role); + CHECK_NIXL_ERROR_AGENT( + remote_serdes->getBuf("BaseAddress", &data_address_ptr, sizeof(uintptr_t)), + "Failed to get BaseAddress", + role); + CHECK_NIXL_ERROR_AGENT(remote_serdes->getBuf("BufferSize", &buf_size, sizeof(size_t)), + "Failed to get BufferSize", + role); + CHECK_NIXL_ERROR_AGENT( + remote_serdes->getBuf("BufferTransfer", &buf_num, sizeof(uint32_t)), + "Failed to get BufferTransfer", + role); remote_metadata = remote_serdes->getStr ("AgentMD"); CHECK_NIXL_ERROR_AGENT((remote_metadata != ""), "Failed to get AgentMD", role); agent.loadRemoteMD (remote_metadata, target); @@ -493,7 +507,9 @@ main (int argc, char *argv[]) { std::cout << "Post the request with GPUNETIO backend transfer 1" << std::endl; PUSH_RANGE ("postXferReq", 3) status = agent.postXferReq (treq); - CHECK_NIXL_ERROR_AGENT(!(status == NIXL_SUCCESS || status == NIXL_IN_PROG), "Failed to post Xfer Req", role); + CHECK_NIXL_ERROR_AGENT(!(status == NIXL_SUCCESS || status == NIXL_IN_PROG), + "Failed to post Xfer Req", + role); POP_RANGE @@ -501,7 +517,9 @@ main (int argc, char *argv[]) { PUSH_RANGE ("getXferStatus", 4) while (status != NIXL_SUCCESS) { status = agent.getXferStatus (treq); - CHECK_NIXL_ERROR_AGENT(!(status == NIXL_SUCCESS || status == NIXL_IN_PROG), "Failed to get Xfer Status", role); + CHECK_NIXL_ERROR_AGENT(!(status == NIXL_SUCCESS || status == NIXL_IN_PROG), + "Failed to get Xfer Status", + role); } POP_RANGE // No need for cudaStreamSyncronize as CUDA kernel and Xfer are on the 
same stream @@ -537,7 +555,9 @@ main (int argc, char *argv[]) { PUSH_RANGE ("getXferStatus", 4) while (status != NIXL_SUCCESS) { status = agent.getXferStatus (treq); - CHECK_NIXL_ERROR_AGENT(!(status == NIXL_SUCCESS || status == NIXL_IN_PROG), "Failed to get Xfer Status", role); + CHECK_NIXL_ERROR_AGENT(!(status == NIXL_SUCCESS || status == NIXL_IN_PROG), + "Failed to get Xfer Status", + role); } POP_RANGE } else { @@ -557,7 +577,9 @@ main (int argc, char *argv[]) { PUSH_RANGE ("getXferStatus", 4) while (status != NIXL_SUCCESS) { status = agent.getXferStatus (treq); - CHECK_NIXL_ERROR_AGENT(!(status == NIXL_SUCCESS || status == NIXL_IN_PROG), "Failed to get Xfer Status", role); + CHECK_NIXL_ERROR_AGENT(!(status == NIXL_SUCCESS || status == NIXL_IN_PROG), + "Failed to get Xfer Status", + role); } POP_RANGE diff --git a/test/unit/plugins/ucx/ucx_backend_test.cpp b/test/unit/plugins/ucx/ucx_backend_test.cpp index 52eb9a2dd..58d8f4938 100644 --- a/test/unit/plugins/ucx/ucx_backend_test.cpp +++ b/test/unit/plugins/ucx/ucx_backend_test.cpp @@ -338,7 +338,7 @@ loadRemote(nixlUcxEngine *ucx, CHECK_NIXL_ERROR((info.metaInfo.size() == 0), "Failed to get public data"); // We get the data from the cetnral location and populate the backend, and receive remote_meta - int ret = ucx->loadRemoteMD (info, mem_type, agent, rmd); + int ret = ucx->loadRemoteMD(info, mem_type, agent, rmd); CHECK_NIXL_ERROR(ret, "Failed to load remote MD"); } @@ -401,7 +401,8 @@ performTransfer(nixlUcxEngine *ucx1, // Also maybe we would remove the WRITE and let the backend class decide the op if (hiter.needPrep()) { nixlBackendReqH *new_handle = nullptr; - ret3 = ucx1->prepXfer(op, req_src_descs, req_dst_descs, remote_agent, new_handle, &opt_args); + ret3 = + ucx1->prepXfer(op, req_src_descs, req_dst_descs, remote_agent, new_handle, &opt_args); CHECK_NIXL_ERROR(ret3, "Failed to prep xfer"); hiter.setHandle(new_handle); } @@ -460,7 +461,7 @@ performTransfer(nixlUcxEngine *ucx1, chkptr2 = 
getValidationPtr(req_dst_descs.getType(), addr2, len); // Perform correctness check. - for(size_t i = 0; i < len; i++){ + for (size_t i = 0; i < len; i++) { CHECK_NIXL_ERROR((((uint8_t *)chkptr1)[i] != ((uint8_t *)chkptr2)[i]), "Data mismatch"); } @@ -492,7 +493,7 @@ test_intra_agent_transfer(bool p_thread, nixlUcxEngine *ucx, nixl_mem_t mem_type std::string conn_info1; ret1 = ucx->getConnInfo(conn_info1); CHECK_NIXL_ERROR(ret1, "Failed to get conn info"); - ret1 = ucx->loadRemoteConnInfo (agent1, conn_info1); + ret1 = ucx->loadRemoteConnInfo(agent1, conn_info1); CHECK_NIXL_ERROR(ret1, "Failed to load remote conn info"); std::cout << "Local connection complete\n"; @@ -510,7 +511,7 @@ test_intra_agent_transfer(bool p_thread, nixlUcxEngine *ucx, nixl_mem_t mem_type //string descs unnecessary, convert meta locally nixlBackendMD* rmd2; - ret1 = ucx->loadLocalMD (lmd2, rmd2); + ret1 = ucx->loadLocalMD(lmd2, rmd2); CHECK_NIXL_ERROR(ret1, "Failed to load local MD"); nixl_meta_dlist_t req_src_descs (mem_type); populateDescs(req_src_descs, 0, addr1, desc_cnt, desc_size, lmd1); @@ -581,7 +582,7 @@ test_inter_agent_transfer(bool p_thread, CHECK_NIXL_ERROR(ret, "Failed to get conn info"); // We assumed we put them to central location and now receiving it on the other process - ret = ucx1->loadRemoteConnInfo (agent2, conn_info2); + ret = ucx1->loadRemoteConnInfo(agent2, conn_info2); CHECK_NIXL_ERROR(ret, "Failed to load remote conn info"); // TODO: Causes race condition - investigate conn management implementation @@ -658,7 +659,8 @@ test_inter_agent_transfer(bool p_thread, CHECK_NIXL_ERROR((ret != 1), "Incorrect number of target notifs"); CHECK_NIXL_ERROR((target_notifs.front().first != "Agent1"), "Incorrect front notif source"); - CHECK_NIXL_ERROR((target_notifs.front().second != test_str), "Incorrect front notif message"); + CHECK_NIXL_ERROR((target_notifs.front().second != test_str), + "Incorrect front notif message"); cout << "OK" << endl; } diff --git 
a/test/unit/plugins/ucx_mo/ucx_mo_backend_test.cpp b/test/unit/plugins/ucx_mo/ucx_mo_backend_test.cpp index f7a646b2f..ca9a25c1a 100644 --- a/test/unit/plugins/ucx_mo/ucx_mo_backend_test.cpp +++ b/test/unit/plugins/ucx_mo/ucx_mo_backend_test.cpp @@ -84,7 +84,7 @@ nixlBackendEngine *createEngine(std::string name, uint32_t ndev, bool p_thread) init.customParams = &custom_params; init.type = "UCX_MO"; - ucx_mo = (nixlBackendEngine*) new nixlUcxMoEngine (&init); + ucx_mo = (nixlBackendEngine *)new nixlUcxMoEngine(&init); CHECK_NIXL_ERROR(ucx_mo->getInitErr(), "Failed to initialize worker1"); if (ucx_mo->getInitErr()) { std::cout << "Failed to initialize worker1" << std::endl; @@ -320,7 +320,7 @@ void destroyRemoteDescs(nixlBackendEngine *dst_ucx, { nixl_status_t status; for(int i = 0; i < dst_descs.descCount(); i++) { - status = dst_ucx->unloadMD (dst_descs[i].metadataP); + status = dst_ucx->unloadMD(dst_descs[i].metadataP); CHECK_NIXL_ERROR(status, "Failed to unload dst_ucx MD"); } @@ -397,7 +397,8 @@ void performTransfer(nixlBackendEngine *ucx1, nixlBackendEngine *ucx2, cout << "\t\tData verification: " << flush; - CHECK_NIXL_ERROR((req_src_descs.descCount() != req_dst_descs.descCount()), "Data length mismatch"); + CHECK_NIXL_ERROR((req_src_descs.descCount() != req_dst_descs.descCount()), + "Data length mismatch"); for(int i = 0; i < req_src_descs.descCount(); i++) { auto sdesc = req_src_descs[i]; auto ddesc = req_dst_descs[i]; @@ -407,7 +408,7 @@ void performTransfer(nixlBackendEngine *ucx1, nixlBackendEngine *ucx2, chkptr2 = getValidationPtr(req_dst_descs.getType(), (void*)ddesc.addr, len); // Perform correctness check. 
- for(size_t i = 0; i < len; i++){ + for (size_t i = 0; i < len; i++) { CHECK_NIXL_ERROR((((uint8_t *)chkptr1)[i] != ((uint8_t *)chkptr2)[i]), "Data mismatch"); } @@ -457,7 +458,7 @@ void test_agent_transfer(bool p_thread, if (is_local) { agent = &agent1; } - status = ucx1->loadRemoteConnInfo (*agent, conn_info2); + status = ucx1->loadRemoteConnInfo(*agent, conn_info2); CHECK_NIXL_ERROR(status, "Failed to load ucx1 remote conn info"); // TODO: Causes race condition - investigate conn management implementation // ret = ucx2->loadRemoteConnInfo (agent1, conn_info1); From 492a0b148db4ff7c4ff02de32df98dea3fb6635c Mon Sep 17 00:00:00 2001 From: Adit Ranadive Date: Wed, 8 Oct 2025 02:54:15 -0700 Subject: [PATCH 06/17] Fix examples build Signed-off-by: Adit Ranadive --- test/nixl/agent_example.cpp | 33 ++++++--------------------------- 1 file changed, 6 insertions(+), 27 deletions(-) diff --git a/test/nixl/agent_example.cpp b/test/nixl/agent_example.cpp index 221571f75..4369ff5dd 100644 --- a/test/nixl/agent_example.cpp +++ b/test/nixl/agent_example.cpp @@ -574,16 +574,10 @@ main(int argc, char **argv) { for (nixl_backend_t b: plugins) std::cout << b << "\n"; -<<<<<<< HEAD - std::cout << "Using backend: " << backend << "\n"; - ret1 = A1.getPluginParams(backend, mems1, init1); - ret2 = A2.getPluginParams(backend, mems2, init2); -======= ret1 = A1.getPluginParams("UCX", mems1, init1); ret2 = A2.getPluginParams("UCX", mems2, init2); CHECK_NIXL_ERROR_AGENT(ret1, "Failed to get plugin params for UCX", agent1); CHECK_NIXL_ERROR_AGENT(ret2, "Failed to get plugin params for UCX", agent2); ->>>>>>> 5638c578 (Address comments) std::cout << "Params before init:\n"; printParams(init1, mems1); @@ -594,31 +588,16 @@ main(int argc, char **argv) { ret2 = A2.createBackend(backend, init2, bknd2); nixl_opt_args_t extra_params1, extra_params2; -<<<<<<< HEAD extra_params1.backends.push_back(bknd1); extra_params2.backends.push_back(bknd2); - assert (ret1 == NIXL_SUCCESS); - assert (ret2 == 
NIXL_SUCCESS); - - CHECK_NIXL_ERROR(ret1, "Failed to create UCX backend", agent1); - CHECK_NIXL_ERROR(ret2, "Failed to create UCX backend", agent2); - ret1 = A1.getBackendParams(bknd1, mems1, init1); - ret2 = A2.getBackendParams(bknd2, mems2, init2); - - assert (ret1 == NIXL_SUCCESS); - assert (ret2 == NIXL_SUCCESS); -======= - extra_params1.backends.push_back(ucx1); - extra_params2.backends.push_back(ucx2); CHECK_NIXL_ERROR_AGENT(ret1, "Failed to create UCX backend", agent1); CHECK_NIXL_ERROR_AGENT(ret2, "Failed to create UCX backend", agent2); - ret1 = A1.getBackendParams(ucx1, mems1, init1); - ret2 = A2.getBackendParams(ucx2, mems2, init2); + ret1 = A1.getBackendParams(bknd1, mems1, init1); + ret2 = A2.getBackendParams(bknd2, mems2, init2); CHECK_NIXL_ERROR_AGENT(ret1, "Failed to get UCX backend params", agent1); CHECK_NIXL_ERROR_AGENT(ret2, "Failed to get UCX backend params", agent2); ->>>>>>> 5638c578 (Address comments) std::cout << "Params after init:\n"; printParams(init1, mems1); @@ -736,12 +715,12 @@ main(int argc, char **argv) { std::cout << "Transfer verified\n"; - std::cout << "performing partialMdTest with backends " << ucx1 << " " << ucx2 << "\n"; - ret1 = partialMdTest(&A1, &A2, ucx1, ucx2); + std::cout << "performing partialMdTest with backends " << bknd1 << " " << bknd2 << "\n"; + ret1 = partialMdTest(&A1, &A2, bknd1, bknd2); CHECK_NIXL_ERROR_AGENT(ret1, "Fail to run partialMDTest", agent1); - std::cout << "performing sideXferTest with backends " << ucx1 << " " << ucx2 << "\n"; - ret1 = sideXferTest(&A1, &A2, req_handle, ucx2); + std::cout << "performing sideXferTest with backends " << bknd1 << " " << bknd2 << "\n"; + ret1 = sideXferTest(&A1, &A2, req_handle, bknd2); CHECK_NIXL_ERROR_AGENT(ret1, "Fail to run sideXferTest", agent1); std::cout << "Performing local test\n"; From a5984c50bc5c52a25afe79808eca4adb8aebfcb7 Mon Sep 17 00:00:00 2001 From: Adit Ranadive Date: Wed, 8 Oct 2025 11:51:40 -0700 Subject: [PATCH 07/17] Fix some tests Signed-off-by: 
Adit Ranadive --- test/nixl/agent_example.cpp | 12 +++---- test/nixl/desc_example.cpp | 67 +++++++++++++++++++------------------ 2 files changed, 40 insertions(+), 39 deletions(-) diff --git a/test/nixl/agent_example.cpp b/test/nixl/agent_example.cpp index 4369ff5dd..35814ba21 100644 --- a/test/nixl/agent_example.cpp +++ b/test/nixl/agent_example.cpp @@ -29,7 +29,7 @@ void check_buf(void* buf, size_t len) { // Do some checks on the data. for(size_t i = 0; iprepXferDlist(agent2, dst_mem_lists[invalid_idx].trim(), dst_side, &extra_params1); - CHECK_NIXL_ERROR_AGENT(status, "Prep xfer dlist should not be successful", agent1); + CHECK_NIXL_ERROR_AGENT((status == NIXL_SUCCESS), "Prep xfer dlist should not be successful", agent1); CHECK_NIXL_ERROR_AGENT((dst_side != nullptr), "Dst side is not null", agent1); } std::cout << "Metadata update #" << update << " completed\n"; @@ -329,7 +329,7 @@ nixl_status_t partialMdTest(nixlAgent* A1, nixlAgent* A2, nixlBackendH* backend1 // Wait for transfer completion while (xfer_status != NIXL_SUCCESS) { if (xfer_status != NIXL_SUCCESS) xfer_status = A1->getXferStatus(req); - CHECK_NIXL_ERROR_AGENT(xfer_status, "Failed to get xfer status", agent1); + CHECK_NIXL_ERROR_AGENT((xfer_status > NIXL_IN_PROG), "Failed to get xfer status", agent1); } // Verify transfer results @@ -456,7 +456,7 @@ nixl_status_t sideXferTest(nixlAgent* A1, nixlAgent* A2, nixlXferReqH* src_handl while (xfer_status != NIXL_SUCCESS) { if (xfer_status != NIXL_SUCCESS) xfer_status = A1->getXferStatus(req1); - CHECK_NIXL_ERROR_AGENT(xfer_status, "Failed to get xfer status", agent1); + CHECK_NIXL_ERROR_AGENT((xfer_status > NIXL_IN_PROG), "Failed to get xfer status", agent1); } for(int i = 0; i<(n_bufs/2); i++) @@ -471,7 +471,7 @@ nixl_status_t sideXferTest(nixlAgent* A1, nixlAgent* A2, nixlXferReqH* src_handl while (xfer_status != NIXL_SUCCESS) { if (xfer_status != NIXL_SUCCESS) xfer_status = A1->getXferStatus(req2); - CHECK_NIXL_ERROR_AGENT(xfer_status, "Failed to get 
xfer status", agent1); + CHECK_NIXL_ERROR_AGENT((xfer_status > NIXL_IN_PROG), "Failed to get xfer status", agent1); } for(int i = (n_bufs/2); igetXferStatus(req3); - CHECK_NIXL_ERROR_AGENT(xfer_status, "Failed to get xfer status", agent1); + CHECK_NIXL_ERROR_AGENT((xfer_status > NIXL_IN_PROG), "Failed to get xfer status", agent1); } for(int i = (n_bufs/2); i #include -#include + #include "nixl.h" #include "serdes/serdes.h" #include "backend/backend_aux.h" +#include "common/util.h" #include @@ -38,7 +39,7 @@ void testPerf(){ gettimeofday(&end_time, NULL); - assert(dlist.descCount() == 24*64*1024); + CHECK_NIXL_ERROR((dlist.descCount() != 24*64*1024), "Incorrect number of descriptors"); timersub(&end_time, &start_time, &diff_time); std::cout << "add desc mode, total time for " << 24*64*1024 << " descs: " << diff_time.tv_sec << "s " << diff_time.tv_usec << "us \n"; @@ -57,7 +58,7 @@ void testPerf(){ gettimeofday(&end_time, NULL); - assert(dlist.descCount() == 24*64*1024); + CHECK_NIXL_ERROR((dlist.descCount() != 24*64*1024), "Incorrect number of descriptors"); timersub(&end_time, &start_time, &diff_time); std::cout << "Operator [] mode, total time for " << 24*64*1024 << " descs: " << diff_time.tv_sec << "s " << diff_time.tv_usec << "us \n"; @@ -87,27 +88,27 @@ int main() nixlBasicDesc buff8 (1010,31,0); nixlBasicDesc importDesc(buff2.serialize()); - assert(buff2 == importDesc); - - assert (buff3==buff2); - assert (buff4==buff1); - assert (buff3!=buff1); - assert (buff8!=buff7); - - assert (buff2.covers(buff3)); - assert (buff4.overlaps(buff1)); - assert (!buff1.covers(buff2)); - assert (!buff1.overlaps(buff2)); - assert (!buff2.covers(buff1)); - assert (!buff2.overlaps(buff1)); - assert (buff2.overlaps(buff5)); - assert (buff5.overlaps(buff2)); - assert (!buff2.covers(buff5)); - assert (!buff5.covers(buff2)); - assert (!buff1.covers(buff6)); - assert (!buff6.covers(buff1)); - assert (buff1.covers(buff7)); - assert (!buff7.covers(buff1)); + CHECK_NIXL_ERROR(!(buff2 == 
importDesc), "Descriptor mismatch for buff2 and importDesc"); + + CHECK_NIXL_ERROR(!(buff3==buff2), "Descriptor mismatch for buff3 and buff2"); + CHECK_NIXL_ERROR(!(buff4==buff1), "Descriptor mismatch for buff4 and buff1"); + CHECK_NIXL_ERROR(!(buff3!=buff1), "Descriptor mismatch for buff3 and buff1"); + CHECK_NIXL_ERROR(!(buff8!=buff7), "Descriptor mismatch for buff8 and buff7"); + + CHECK_NIXL_ERROR(!(buff2.covers(buff3)), "Descriptor mismatch for buff2 and buff3"); + CHECK_NIXL_ERROR(!(buff4.overlaps(buff1)), "Descriptor mismatch for buff4 and buff1"); + CHECK_NIXL_ERROR((buff1.covers(buff2)), "Descriptor mismatch for buff1 and buff2"); + CHECK_NIXL_ERROR((buff1.overlaps(buff2)), "Descriptor mismatch for buff1 and buff2"); + CHECK_NIXL_ERROR((buff2.covers(buff1)), "Descriptor mismatch for buff2 and buff1"); + CHECK_NIXL_ERROR((buff2.overlaps(buff1)), "Descriptor mismatch for buff2 and buff1"); + CHECK_NIXL_ERROR(!(buff2.overlaps(buff5)), "Descriptor mismatch for buff2 and buff5"); + CHECK_NIXL_ERROR(!(buff5.overlaps(buff2)), "Descriptor mismatch for buff5 and buff2"); + CHECK_NIXL_ERROR((buff2.covers(buff5)), "Descriptor mismatch for buff2 and buff5"); + CHECK_NIXL_ERROR((buff5.covers(buff2)), "Descriptor mismatch for buff5 and buff2"); + CHECK_NIXL_ERROR((buff1.covers(buff6)), "Descriptor mismatch for buff1 and buff6"); + CHECK_NIXL_ERROR((buff6.covers(buff1)), "Descriptor mismatch for buff6 and buff1"); + CHECK_NIXL_ERROR(!(buff1.covers(buff7)), "Descriptor mismatch for buff1 and buff7"); + CHECK_NIXL_ERROR((buff7.covers(buff1)), "Descriptor mismatch for buff7 and buff1"); nixlBlobDesc stringd1; stringd1.addr = 2392382; @@ -116,7 +117,7 @@ int main() stringd1.metaInfo = std::string("567"); nixlBlobDesc importStringD(stringd1.serialize()); - assert(stringd1 == importStringD); + CHECK_NIXL_ERROR(!(stringd1 == importStringD), "Descriptor mismatch for stringd1 and importStringD"); std::cout << "\nSerDes Desc tests:\n"; buff2.print(""); @@ -146,8 +147,8 @@ int 
main() meta2.devId = 0; meta2.metadataP = nullptr; - assert (stringd1!=buff1); - assert (stringd2==buff8); + CHECK_NIXL_ERROR(!(stringd1!=buff1), "Descriptor mismatch for stringd1 and buff1"); + CHECK_NIXL_ERROR(!(stringd2==buff8), "Descriptor mismatch for stringd2 and buff8"); nixlBasicDesc buff9 (stringd1); buff1.print(""); @@ -200,8 +201,8 @@ int main() std::cout << "Caught expected error: " << e.what() << std::endl; } dlist2.remDesc(dlist2.getIndex(meta3)); - assert(dlist2.getIndex(meta3)== NIXL_ERR_NOT_FOUND); - assert(dlist3.getIndex(meta1)== NIXL_ERR_NOT_FOUND); + CHECK_NIXL_ERROR(!(dlist2.getIndex(meta3) == NIXL_ERR_NOT_FOUND), "Dlist2 descriptor not removed"); + CHECK_NIXL_ERROR(!(dlist3.getIndex(meta1) == NIXL_ERR_NOT_FOUND), "Dlist3 descriptor not removed"); try { dlist3.remDesc(dlist3.getIndex(meta4)); } catch (const std::out_of_range& e) { @@ -277,13 +278,13 @@ int main() nixlSerDes* ser_des = new nixlSerDes(); nixlSerDes* ser_des2 = new nixlSerDes(); - assert(dlist10.serialize(ser_des) == 0); + CHECK_NIXL_ERROR((dlist10.serialize(ser_des) != 0), "Failed to serialize dlist10"); nixl_xfer_dlist_t importList (ser_des);; - assert(importList == dlist10); + CHECK_NIXL_ERROR(!(importList == dlist10), "Descriptor mismatch for importList and dlist10"); - assert(dlist20.serialize(ser_des2) == 0); + CHECK_NIXL_ERROR((dlist20.serialize(ser_des2) != 0), "Failed to serialize dlist20"); nixl_reg_dlist_t importSList (ser_des2); - assert(importSList == dlist20); + CHECK_NIXL_ERROR(!(importSList == dlist20), "Descriptor mismatch for importSList and dlist20"); dlist10.print(); std::cout << "this should be a copy:\n"; From 931d383164c6880d26764bfd0a2c6189ad3f016f Mon Sep 17 00:00:00 2001 From: Adit Ranadive Date: Wed, 8 Oct 2025 11:52:46 -0700 Subject: [PATCH 08/17] Fix clang Signed-off-by: Adit Ranadive --- test/nixl/agent_example.cpp | 3 ++- test/nixl/desc_example.cpp | 25 ++++++++++++++----------- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git 
a/test/nixl/agent_example.cpp b/test/nixl/agent_example.cpp index 35814ba21..319be6990 100644 --- a/test/nixl/agent_example.cpp +++ b/test/nixl/agent_example.cpp @@ -280,7 +280,8 @@ nixl_status_t partialMdTest(nixlAgent* A1, nixlAgent* A2, nixlBackendH* backend1 // Make sure not-loaded descriptors are not updated for (int invalid_idx = update + 1; invalid_idx < NUM_UPDATES; invalid_idx++) { status = A1->prepXferDlist(agent2, dst_mem_lists[invalid_idx].trim(), dst_side, &extra_params1); - CHECK_NIXL_ERROR_AGENT((status == NIXL_SUCCESS), "Prep xfer dlist should not be successful", agent1); + CHECK_NIXL_ERROR_AGENT( + (status == NIXL_SUCCESS), "Prep xfer dlist should not be successful", agent1); CHECK_NIXL_ERROR_AGENT((dst_side != nullptr), "Dst side is not null", agent1); } std::cout << "Metadata update #" << update << " completed\n"; diff --git a/test/nixl/desc_example.cpp b/test/nixl/desc_example.cpp index 39217bf84..93adebbfa 100644 --- a/test/nixl/desc_example.cpp +++ b/test/nixl/desc_example.cpp @@ -39,7 +39,7 @@ void testPerf(){ gettimeofday(&end_time, NULL); - CHECK_NIXL_ERROR((dlist.descCount() != 24*64*1024), "Incorrect number of descriptors"); + CHECK_NIXL_ERROR((dlist.descCount() != 24 * 64 * 1024), "Incorrect number of descriptors"); timersub(&end_time, &start_time, &diff_time); std::cout << "add desc mode, total time for " << 24*64*1024 << " descs: " << diff_time.tv_sec << "s " << diff_time.tv_usec << "us \n"; @@ -58,7 +58,7 @@ void testPerf(){ gettimeofday(&end_time, NULL); - CHECK_NIXL_ERROR((dlist.descCount() != 24*64*1024), "Incorrect number of descriptors"); + CHECK_NIXL_ERROR((dlist.descCount() != 24 * 64 * 1024), "Incorrect number of descriptors"); timersub(&end_time, &start_time, &diff_time); std::cout << "Operator [] mode, total time for " << 24*64*1024 << " descs: " << diff_time.tv_sec << "s " << diff_time.tv_usec << "us \n"; @@ -90,10 +90,10 @@ int main() nixlBasicDesc importDesc(buff2.serialize()); CHECK_NIXL_ERROR(!(buff2 == importDesc), 
"Descriptor mismatch for buff2 and importDesc"); - CHECK_NIXL_ERROR(!(buff3==buff2), "Descriptor mismatch for buff3 and buff2"); - CHECK_NIXL_ERROR(!(buff4==buff1), "Descriptor mismatch for buff4 and buff1"); - CHECK_NIXL_ERROR(!(buff3!=buff1), "Descriptor mismatch for buff3 and buff1"); - CHECK_NIXL_ERROR(!(buff8!=buff7), "Descriptor mismatch for buff8 and buff7"); + CHECK_NIXL_ERROR(!(buff3 == buff2), "Descriptor mismatch for buff3 and buff2"); + CHECK_NIXL_ERROR(!(buff4 == buff1), "Descriptor mismatch for buff4 and buff1"); + CHECK_NIXL_ERROR(!(buff3 != buff1), "Descriptor mismatch for buff3 and buff1"); + CHECK_NIXL_ERROR(!(buff8 != buff7), "Descriptor mismatch for buff8 and buff7"); CHECK_NIXL_ERROR(!(buff2.covers(buff3)), "Descriptor mismatch for buff2 and buff3"); CHECK_NIXL_ERROR(!(buff4.overlaps(buff1)), "Descriptor mismatch for buff4 and buff1"); @@ -117,7 +117,8 @@ int main() stringd1.metaInfo = std::string("567"); nixlBlobDesc importStringD(stringd1.serialize()); - CHECK_NIXL_ERROR(!(stringd1 == importStringD), "Descriptor mismatch for stringd1 and importStringD"); + CHECK_NIXL_ERROR(!(stringd1 == importStringD), + "Descriptor mismatch for stringd1 and importStringD"); std::cout << "\nSerDes Desc tests:\n"; buff2.print(""); @@ -147,8 +148,8 @@ int main() meta2.devId = 0; meta2.metadataP = nullptr; - CHECK_NIXL_ERROR(!(stringd1!=buff1), "Descriptor mismatch for stringd1 and buff1"); - CHECK_NIXL_ERROR(!(stringd2==buff8), "Descriptor mismatch for stringd2 and buff8"); + CHECK_NIXL_ERROR(!(stringd1 != buff1), "Descriptor mismatch for stringd1 and buff1"); + CHECK_NIXL_ERROR(!(stringd2 == buff8), "Descriptor mismatch for stringd2 and buff8"); nixlBasicDesc buff9 (stringd1); buff1.print(""); @@ -201,8 +202,10 @@ int main() std::cout << "Caught expected error: " << e.what() << std::endl; } dlist2.remDesc(dlist2.getIndex(meta3)); - CHECK_NIXL_ERROR(!(dlist2.getIndex(meta3) == NIXL_ERR_NOT_FOUND), "Dlist2 descriptor not removed"); - 
CHECK_NIXL_ERROR(!(dlist3.getIndex(meta1) == NIXL_ERR_NOT_FOUND), "Dlist3 descriptor not removed"); + CHECK_NIXL_ERROR(!(dlist2.getIndex(meta3) == NIXL_ERR_NOT_FOUND), + "Dlist2 descriptor not removed"); + CHECK_NIXL_ERROR(!(dlist3.getIndex(meta1) == NIXL_ERR_NOT_FOUND), + "Dlist3 descriptor not removed"); try { dlist3.remDesc(dlist3.getIndex(meta4)); } catch (const std::out_of_range& e) { From 8499f9fdec2517c13618a6ede384b62f4aadbd80 Mon Sep 17 00:00:00 2001 From: Adit Ranadive Date: Wed, 8 Oct 2025 13:35:14 -0700 Subject: [PATCH 09/17] Fix an example Signed-off-by: Adit Ranadive --- examples/cpp/nixl_etcd_example.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/cpp/nixl_etcd_example.cpp b/examples/cpp/nixl_etcd_example.cpp index c2a3c4701..c23f60933 100644 --- a/examples/cpp/nixl_etcd_example.cpp +++ b/examples/cpp/nixl_etcd_example.cpp @@ -233,6 +233,7 @@ int main() { CHECK_NIXL_ERROR_AGENT(ret1, "Failed to create Xfer Req", AGENT1_NAME); status = A1.postXferReq(req_handle); + CHECK_NIXL_ERROR_AGENT((status != NIXL_IN_PROG), "Failed to post Xfer Req", AGENT1_NAME); std::cout << "Transfer was posted\n"; @@ -242,8 +243,8 @@ int main() { while (status != NIXL_SUCCESS || n_notifs == 0) { if (status != NIXL_SUCCESS) status = A1.getXferStatus(req_handle); if (n_notifs == 0) ret2 = A2.getNotifs(notif_map); - CHECK_NIXL_ERROR_AGENT(status, "Failed to post Xfer Req", AGENT1_NAME); - CHECK_NIXL_ERROR_AGENT(ret2, "Failed to get notifs", AGENT2_NAME); + CHECK_NIXL_ERROR_AGENT((status > NIXL_IN_PROG), "Failed to get Xfer status", AGENT1_NAME); + CHECK_NIXL_ERROR_AGENT((ret2 != NIXL_SUCCESS), "Failed to get notifs", AGENT2_NAME); n_notifs = notif_map.size(); } From b2fbe86bb27eb34c238f2231becfc7f7474c21e5 Mon Sep 17 00:00:00 2001 From: Adit Ranadive Date: Wed, 8 Oct 2025 14:04:27 -0700 Subject: [PATCH 10/17] Fix more tests Signed-off-by: Adit Ranadive --- test/unit/plugins/ucx/ucx_backend_test.cpp | 1 - 
test/unit/plugins/ucx_mo/ucx_mo_backend_test.cpp | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/test/unit/plugins/ucx/ucx_backend_test.cpp b/test/unit/plugins/ucx/ucx_backend_test.cpp index 58d8f4938..344b86bce 100644 --- a/test/unit/plugins/ucx/ucx_backend_test.cpp +++ b/test/unit/plugins/ucx/ucx_backend_test.cpp @@ -408,7 +408,6 @@ performTransfer(nixlUcxEngine *ucx1, } nixlBackendReqH *&handle = hiter.getHandle(); ret3 = ucx1->postXfer(op, req_src_descs, req_dst_descs, remote_agent, handle, &opt_args); - CHECK_NIXL_ERROR(ret3, "Failed to post xfer"); CHECK_NIXL_ERROR(!((ret3 == NIXL_SUCCESS) || (ret3 == NIXL_IN_PROG)), "Failed to post xfer"); if (ret3 == NIXL_SUCCESS) { diff --git a/test/unit/plugins/ucx_mo/ucx_mo_backend_test.cpp b/test/unit/plugins/ucx_mo/ucx_mo_backend_test.cpp index ca9a25c1a..5ec943aab 100644 --- a/test/unit/plugins/ucx_mo/ucx_mo_backend_test.cpp +++ b/test/unit/plugins/ucx_mo/ucx_mo_backend_test.cpp @@ -354,7 +354,7 @@ void performTransfer(nixlBackendEngine *ucx1, nixlBackendEngine *ucx2, status = ucx1->prepXfer(op, req_src_descs, req_dst_descs, remote_agent, handle, &opt_args); CHECK_NIXL_ERROR(status, "Failed to prep ucx1 xfer"); status = ucx1->postXfer(op, req_src_descs, req_dst_descs, remote_agent, handle, &opt_args); - CHECK_NIXL_ERROR(status, "Failed to post ucx1 xfer"); + CHECK_NIXL_ERROR((status > NIXL_IN_PROG), "Failed to post ucx1 xfer"); if (status == NIXL_SUCCESS) { From 1580d05ad8048f9362944e3b184d7205371c537c Mon Sep 17 00:00:00 2001 From: Adit Ranadive Date: Wed, 8 Oct 2025 15:39:16 -0700 Subject: [PATCH 11/17] Fix nixl_test Signed-off-by: Adit Ranadive --- test/nixl/nixl_test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/nixl/nixl_test.cpp b/test/nixl/nixl_test.cpp index 5f2eb609d..93417fbc1 100644 --- a/test/nixl/nixl_test.cpp +++ b/test/nixl/nixl_test.cpp @@ -194,7 +194,7 @@ static void initiatorThread(nixlAgent &agent, nixl_opt_args_t *extra_params, while (ret != 
NIXL_SUCCESS) { ret = agent.getXferStatus(treq); - CHECK_NIXL_ERROR(ret, "Failed to get transfer status"); + CHECK_NIXL_ERROR((ret > NIXL_IN_PROG), "Failed to get transfer status"); } std::cout << "Thread " << thread_id << " Completed Sending Data using UCX backend\n"; From 84c2cb295626df59ba8494043f431bba43fb34f7 Mon Sep 17 00:00:00 2001 From: Adit Ranadive Date: Wed, 8 Oct 2025 18:46:18 -0700 Subject: [PATCH 12/17] Fix nixl_example test Signed-off-by: Adit Ranadive --- examples/cpp/nixl_example.cpp | 91 +++++++++-------------------------- 1 file changed, 24 insertions(+), 67 deletions(-) diff --git a/examples/cpp/nixl_example.cpp b/examples/cpp/nixl_example.cpp index 9d93a4461..2b4c96f7f 100644 --- a/examples/cpp/nixl_example.cpp +++ b/examples/cpp/nixl_example.cpp @@ -21,15 +21,8 @@ #include #include "nixl.h" +#include "common/util.h" -#define CHECK_NIXL_ERROR(result, message, agent) \ - do { \ - if (0 != result) { \ - std::cerr << "NIXL: " << message << " for agent " << agent \ - << " (Error code: " << result << ")" << std::endl; \ - exit(EXIT_FAILURE); \ - } \ - } while (0) std::string agent1("Agent001"); std::string agent2("Agent002"); @@ -38,11 +31,7 @@ void check_buf(void* buf, size_t len) { // Do some checks on the data. 
for(size_t i = 0; i plugins; ret1 = A1.getAvailPlugins(plugins); - assert (ret1 == NIXL_SUCCESS); - - CHECK_NIXL_ERROR(ret1, "Failed to get available plugins", agent1); + CHECK_NIXL_ERROR_AGENT(ret1, "Failed to get available plugins", agent1); std::cout << "Available plugins:\n"; @@ -115,11 +102,8 @@ main(int argc, char **argv) { ret1 = A1.getPluginParams(backend, mems1, init1); ret2 = A2.getPluginParams(backend, mems2, init2); - assert (ret1 == NIXL_SUCCESS); - assert (ret2 == NIXL_SUCCESS); - - CHECK_NIXL_ERROR(ret1, "Failed to get plugin params for UCX", agent1); - CHECK_NIXL_ERROR(ret2, "Failed to get plugin params for UCX", agent2); + CHECK_NIXL_ERROR_AGENT(ret1, "Failed to get plugin params", agent1); + CHECK_NIXL_ERROR_AGENT(ret2, "Failed to get plugin params", agent2); std::cout << "Params before init:\n"; printParams(init1, mems1); @@ -133,20 +117,14 @@ main(int argc, char **argv) { extra_params1.backends.push_back(bknd1); extra_params2.backends.push_back(bknd2); - assert (ret1 == NIXL_SUCCESS); - assert (ret2 == NIXL_SUCCESS); - - CHECK_NIXL_ERROR(ret1, "Failed to create UCX backend", agent1); - CHECK_NIXL_ERROR(ret2, "Failed to create UCX backend", agent2); + CHECK_NIXL_ERROR_AGENT(ret1, "Failed to create backend", agent1); + CHECK_NIXL_ERROR_AGENT(ret2, "Failed to create backend", agent2); ret1 = A1.getBackendParams(bknd1, mems1, init1); ret2 = A2.getBackendParams(bknd2, mems2, init2); - assert (ret1 == NIXL_SUCCESS); - assert (ret2 == NIXL_SUCCESS); - - CHECK_NIXL_ERROR(ret1, "Failed to get UCX backend params", agent1); - CHECK_NIXL_ERROR(ret2, "Failed to get UCX backend params", agent2); + CHECK_NIXL_ERROR_AGENT(ret1, "Failed to get backend params", agent1); + CHECK_NIXL_ERROR_AGENT(ret2, "Failed to get backend params", agent2); std::cout << "Params after init:\n"; printParams(init1, mems1); @@ -185,31 +163,22 @@ main(int argc, char **argv) { ret1 = A1.registerMem(dlist1, &extra_params1); ret2 = A2.registerMem(dlist2, &extra_params2); - - assert (ret1 
== NIXL_SUCCESS); - assert (ret2 == NIXL_SUCCESS); - - CHECK_NIXL_ERROR(ret1, "Failed to register memory", agent1); - CHECK_NIXL_ERROR(ret2, "Failed to register memory", agent2); + CHECK_NIXL_ERROR_AGENT(ret1, "Failed to register memory", agent1); + CHECK_NIXL_ERROR_AGENT(ret2, "Failed to register memory", agent2); std::string meta1; ret1 = A1.getLocalMD(meta1); std::string meta2; ret2 = A2.getLocalMD(meta2); - - assert (ret1 == NIXL_SUCCESS); - assert (ret2 == NIXL_SUCCESS); - - CHECK_NIXL_ERROR(ret1, "Failed to get local MD", agent1); - CHECK_NIXL_ERROR(ret2, "Failed to get local MD", agent2); + CHECK_NIXL_ERROR_AGENT(ret1, "Failed to get local MD", agent1); + CHECK_NIXL_ERROR_AGENT(ret2, "Failed to get local MD", agent2); std::cout << "Agent1's Metadata: " << meta1 << "\n"; std::cout << "Agent2's Metadata: " << meta2 << "\n"; ret1 = A1.loadRemoteMD (meta2, ret_s1); - assert (ret1 == NIXL_SUCCESS); - CHECK_NIXL_ERROR(ret1, "Failed to load remote MD", agent1); + CHECK_NIXL_ERROR_AGENT(ret1, "Failed to load remote MD", agent1); size_t req_size = 8; size_t dst_offset = 8; @@ -234,11 +203,10 @@ main(int argc, char **argv) { extra_params1.notifMsg = "notification"; extra_params1.hasNotif = true; ret1 = A1.createXferReq(NIXL_WRITE, req_src_descs, req_dst_descs, agent2, req_handle, &extra_params1); - assert (ret1 == NIXL_SUCCESS); - - CHECK_NIXL_ERROR(ret1, "Failed to create Xfer Req", agent1); + CHECK_NIXL_ERROR_AGENT(ret1, "Failed to create Xfer Req", agent1); nixl_status_t status = A1.postXferReq(req_handle); + CHECK_NIXL_ERROR_AGENT((status > NIXL_IN_PROG), "Failed to post Xfer Req", agent1); std::cout << "Transfer was posted\n"; @@ -248,19 +216,15 @@ main(int argc, char **argv) { while (status != NIXL_SUCCESS || n_notifs == 0) { if (status != NIXL_SUCCESS) status = A1.getXferStatus(req_handle); if (n_notifs == 0) ret2 = A2.getNotifs(notif_map); - assert (status >= 0); - assert (ret2 == NIXL_SUCCESS); - CHECK_NIXL_ERROR((status < 0), "Failed to post Xfer Req", 
agent1); - CHECK_NIXL_ERROR(ret2, "Failed to get notifs", agent2); + CHECK_NIXL_ERROR_AGENT((status > NIXL_IN_PROG), "Failed to post Xfer Req", agent1); + CHECK_NIXL_ERROR_AGENT(ret2, "Failed to get notifs", agent2); n_notifs = notif_map.size(); } std::vector agent1_notifs = notif_map[agent1]; - assert (agent1_notifs.size() == 1); - assert (agent1_notifs.front() == "notification"); + CHECK_NIXL_ERROR_AGENT((agent1_notifs.size() != 1), "Incorrect notif size", agent1); + CHECK_NIXL_ERROR_AGENT((agent1_notifs.front() != "notification"), "Incorrect notification", agent1); - CHECK_NIXL_ERROR((agent1_notifs.size() != 1), "Incorrect notif size", agent1); - CHECK_NIXL_ERROR((agent1_notifs.front() != "notification"), "Incorrect notification", agent1); notif_map[agent1].clear(); // Redundant, for testing notif_map.clear(); n_notifs = 0; @@ -268,23 +232,16 @@ main(int argc, char **argv) { std::cout << "Transfer verified\n"; ret1 = A1.releaseXferReq(req_handle); - assert (ret1 == NIXL_SUCCESS); - - CHECK_NIXL_ERROR(ret1, "Failed to release Xfer Req", agent1); + CHECK_NIXL_ERROR_AGENT(ret1, "Failed to release Xfer Req", agent1); ret1 = A1.deregisterMem(dlist1, &extra_params1); ret2 = A2.deregisterMem(dlist2, &extra_params2); - assert (ret1 == NIXL_SUCCESS); - assert (ret2 == NIXL_SUCCESS); - - CHECK_NIXL_ERROR(ret1, "Failed to deregister memory", agent1); - CHECK_NIXL_ERROR(ret2, "Failed to deregister memory", agent2); + CHECK_NIXL_ERROR_AGENT(ret1, "F ailed to deregister memory", agent1); + CHECK_NIXL_ERROR_AGENT(ret2, "Failed to deregister memory", agent2); //only initiator should call invalidate ret1 = A1.invalidateRemoteMD(agent2); - assert (ret1 == NIXL_SUCCESS); - - CHECK_NIXL_ERROR(ret1, "Failed to invalidate remote MD", agent1); + CHECK_NIXL_ERROR_AGENT(ret1, "Failed to invalidate remote MD", agent1); free(addr1); free(addr2); From 34f32abb3da9a414c5601a6f15ba8d6ff6517bf4 Mon Sep 17 00:00:00 2001 From: Adit Ranadive Date: Wed, 8 Oct 2025 18:52:07 -0700 Subject: [PATCH 
13/17] Fix clang Signed-off-by: Adit Ranadive --- examples/cpp/nixl_example.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/cpp/nixl_example.cpp b/examples/cpp/nixl_example.cpp index 2b4c96f7f..b42972810 100644 --- a/examples/cpp/nixl_example.cpp +++ b/examples/cpp/nixl_example.cpp @@ -31,7 +31,7 @@ void check_buf(void* buf, size_t len) { // Do some checks on the data. for(size_t i = 0; i agent1_notifs = notif_map[agent1]; CHECK_NIXL_ERROR_AGENT((agent1_notifs.size() != 1), "Incorrect notif size", agent1); - CHECK_NIXL_ERROR_AGENT((agent1_notifs.front() != "notification"), "Incorrect notification", agent1); + CHECK_NIXL_ERROR_AGENT( + (agent1_notifs.front() != "notification"), "Incorrect notification", agent1); notif_map[agent1].clear(); // Redundant, for testing notif_map.clear(); From 645bc6e5b912485252bf7cac6613a58216416b6d Mon Sep 17 00:00:00 2001 From: Adit Ranadive Date: Thu, 9 Oct 2025 01:43:13 -0700 Subject: [PATCH 14/17] infra: Add NIXL checker functions Add two functions to check status for NIXL APIs to print any error messages. In order to use the nixlEnumStrings, this needs to be defined as a separate dependency. 
Signed-off-by: Adit Ranadive --- examples/cpp/meson.build | 4 +- examples/cpp/nixl_etcd_example.cpp | 60 ++--- examples/cpp/nixl_example.cpp | 50 ++-- src/infra/meson.build | 3 + src/infra/test_utils.cpp | 37 +++ src/infra/test_utils.h | 49 ++++ src/meson.build | 14 + src/utils/common/util.h | 17 -- test/nixl/agent_example.cpp | 240 +++++++++--------- test/nixl/desc_example.cpp | 72 +++--- test/nixl/meson.build | 8 +- test/nixl/nixl_test.cpp | 8 +- test/unit/plugins/gpunetio/meson.build | 2 +- .../gpunetio/nixl_gpunetio_stream_test.cu | 62 ++--- test/unit/plugins/ucx/meson.build | 4 +- test/unit/plugins/ucx/ucx_backend_multi.cpp | 6 +- test/unit/plugins/ucx/ucx_backend_test.cpp | 86 ++++--- test/unit/plugins/ucx_mo/meson.build | 2 +- .../plugins/ucx_mo/ucx_mo_backend_test.cpp | 82 +++--- test/unit/utils/common/map_perf.cpp | 25 +- test/unit/utils/common/meson.build | 1 + 21 files changed, 469 insertions(+), 363 deletions(-) create mode 100644 src/infra/test_utils.cpp create mode 100644 src/infra/test_utils.h diff --git a/examples/cpp/meson.build b/examples/cpp/meson.build index 89e5ced60..66b7ff81a 100644 --- a/examples/cpp/meson.build +++ b/examples/cpp/meson.build @@ -15,7 +15,7 @@ nixl_example = executable('nixl_example', 'nixl_example.cpp', - dependencies: [nixl_dep, nixl_infra, nixl_common_deps], + dependencies: [nixl_dep, nixl_infra, nixl_common_deps, nixl_test_utils_dep], include_directories: [nixl_inc_dirs, utils_inc_dirs], link_with: [serdes_lib], install: true) @@ -23,7 +23,7 @@ nixl_example = executable('nixl_example', if etcd_dep.found() etcd_example = executable('nixl_etcd_example', 'nixl_etcd_example.cpp', - dependencies: [nixl_dep, nixl_infra, nixl_common_deps], + dependencies: [nixl_dep, nixl_infra, nixl_common_deps, nixl_test_utils_dep], include_directories: [nixl_inc_dirs, utils_inc_dirs], link_with: [serdes_lib], install: true) diff --git a/examples/cpp/nixl_etcd_example.cpp b/examples/cpp/nixl_etcd_example.cpp index c23f60933..97ac4f6bd 100644 
--- a/examples/cpp/nixl_etcd_example.cpp +++ b/examples/cpp/nixl_etcd_example.cpp @@ -19,8 +19,8 @@ #include #include -#include "common/util.h" #include "nixl.h" +#include "test_utils.h" // Change these values to match your etcd setup @@ -131,7 +131,7 @@ int main() { std::vector plugins; ret1 = A1.getAvailPlugins(plugins); - CHECK_NIXL_ERROR_AGENT(ret1, "Failed to get available plugins", AGENT1_NAME); + nixl_exit_on_failure(ret1, "Failed to get available plugins", AGENT1_NAME); std::cout << "Available plugins:\n"; @@ -141,8 +141,8 @@ int main() { ret1 = A1.getPluginParams("UCX", mems1, init1); ret2 = A2.getPluginParams("UCX", mems2, init2); - CHECK_NIXL_ERROR_AGENT(ret1, "Failed to get plugin params for UCX", AGENT1_NAME); - CHECK_NIXL_ERROR_AGENT(ret2, "Failed to get plugin params for UCX", AGENT2_NAME); + nixl_exit_on_failure(ret1, "Failed to get plugin params for UCX", AGENT1_NAME); + nixl_exit_on_failure(ret2, "Failed to get plugin params for UCX", AGENT2_NAME); std::cout << "Params before init:\n"; printParams(init1, mems1); @@ -152,14 +152,14 @@ int main() { nixlBackendH* ucx1, *ucx2; ret1 = A1.createBackend("UCX", init1, ucx1); ret2 = A2.createBackend("UCX", init2, ucx2); - CHECK_NIXL_ERROR_AGENT(ret1, "Failed to create UCX backend", AGENT1_NAME); - CHECK_NIXL_ERROR_AGENT(ret2, "Failed to create UCX backend", AGENT2_NAME); + nixl_exit_on_failure(ret1, "Failed to create UCX backend", AGENT1_NAME); + nixl_exit_on_failure(ret2, "Failed to create UCX backend", AGENT2_NAME); ret1 = A1.getBackendParams(ucx1, mems1, init1); ret2 = A2.getBackendParams(ucx2, mems2, init2); - CHECK_NIXL_ERROR_AGENT(ret1, "Failed to get UCX backend params", AGENT1_NAME); - CHECK_NIXL_ERROR_AGENT(ret2, "Failed to get UCX backend params", AGENT2_NAME); + nixl_exit_on_failure(ret1, "Failed to get UCX backend params", AGENT1_NAME); + nixl_exit_on_failure(ret2, "Failed to get UCX backend params", AGENT2_NAME); std::cout << "Params after init:\n"; @@ -168,9 +168,9 @@ int main() { // Register 
memory with both agents status = registerMemory(&addr1, &A1, &dlist1, &extra_params1, ucx1, 0xaa); - CHECK_NIXL_ERROR_AGENT(status, "Failed to register memory", AGENT1_NAME); + nixl_exit_on_failure(status, "Failed to register memory", AGENT1_NAME); status = registerMemory(&addr2, &A2, &dlist2, &extra_params2, ucx2, 0xbb); - CHECK_NIXL_ERROR_AGENT(status, "Failed to register memory", AGENT2_NAME); + nixl_exit_on_failure(status, "Failed to register memory", AGENT2_NAME); std::cout << "\nEtcd Metadata Exchange Demo\n"; std::cout << "==========================\n"; @@ -180,10 +180,10 @@ int main() { // Both agents send their metadata to etcd status = A1.sendLocalMD(); - CHECK_NIXL_ERROR_AGENT(status, "Failed to send local MD", AGENT1_NAME); + nixl_exit_on_failure(status, "Failed to send local MD", AGENT1_NAME); status = A2.sendLocalMD(); - CHECK_NIXL_ERROR_AGENT(status, "Failed to send local MD", AGENT2_NAME); + nixl_exit_on_failure(status, "Failed to send local MD", AGENT2_NAME); // Give etcd time to process std::this_thread::sleep_for(std::chrono::seconds(1)); @@ -193,11 +193,11 @@ int main() { // Agent1 fetches metadata for Agent2 status = A1.fetchRemoteMD(AGENT2_NAME); - CHECK_NIXL_ERROR_AGENT(status, "Failed to fetch remote MD", AGENT1_NAME); + nixl_exit_on_failure(status, "Failed to fetch remote MD", AGENT1_NAME); // Agent2 fetches metadata for Agent1 status = A2.fetchRemoteMD(AGENT1_NAME); - CHECK_NIXL_ERROR_AGENT(status, "Failed to fetch remote MD", AGENT2_NAME); + nixl_exit_on_failure(status, "Failed to fetch remote MD", AGENT2_NAME); // Do transfer from Agent 1 to Agent 2 size_t req_size = 8; @@ -230,10 +230,10 @@ int main() { extra_params1.hasNotif = true; ret1 = A1.createXferReq(NIXL_WRITE, req_src_descs, req_dst_descs, AGENT2_NAME, req_handle, &extra_params1); std::cout << "Xfer request created, status: " << nixlEnumStrings::statusStr(ret1) << std::endl; - CHECK_NIXL_ERROR_AGENT(ret1, "Failed to create Xfer Req", AGENT1_NAME); + nixl_exit_on_failure(ret1, 
"Failed to create Xfer Req", AGENT1_NAME); status = A1.postXferReq(req_handle); - CHECK_NIXL_ERROR_AGENT((status != NIXL_IN_PROG), "Failed to post Xfer Req", AGENT1_NAME); + nixl_exit_on_failure((status >= NIXL_SUCCESS), "Failed to post Xfer Req", AGENT1_NAME); std::cout << "Transfer was posted\n"; @@ -243,20 +243,20 @@ int main() { while (status != NIXL_SUCCESS || n_notifs == 0) { if (status != NIXL_SUCCESS) status = A1.getXferStatus(req_handle); if (n_notifs == 0) ret2 = A2.getNotifs(notif_map); - CHECK_NIXL_ERROR_AGENT((status > NIXL_IN_PROG), "Failed to get Xfer status", AGENT1_NAME); - CHECK_NIXL_ERROR_AGENT((ret2 != NIXL_SUCCESS), "Failed to get notifs", AGENT2_NAME); + nixl_exit_on_failure((status >= NIXL_SUCCESS), "Failed to get Xfer status", AGENT1_NAME); + nixl_exit_on_failure(ret2, "Failed to get notifs", AGENT2_NAME); n_notifs = notif_map.size(); } std::cout << "Transfer verified\n"; ret1 = A1.releaseXferReq(req_handle); - CHECK_NIXL_ERROR_AGENT(ret1, "Failed to release Xfer Req", AGENT1_NAME); + nixl_exit_on_failure(ret1, "Failed to release Xfer Req", AGENT1_NAME); ret1 = A1.deregisterMem(dlist1, &extra_params1); ret2 = A2.deregisterMem(dlist2, &extra_params2); - CHECK_NIXL_ERROR_AGENT(ret1, "Failed to deregister memory", AGENT1_NAME); - CHECK_NIXL_ERROR_AGENT(ret2, "Failed to deregister memory", AGENT2_NAME); + nixl_exit_on_failure(ret1, "Failed to deregister memory", AGENT1_NAME); + nixl_exit_on_failure(ret2, "Failed to deregister memory", AGENT2_NAME); // 3. Partial Metadata Exchange std::cout << "\n3. 
Sending partial metadata to etcd...\n"; @@ -277,27 +277,27 @@ int main() { // Send partial metadata status = A1.sendLocalPartialMD(empty_dlist1, &conn_params1); - CHECK_NIXL_ERROR_AGENT(status, "Failed to send local partial MD", AGENT1_NAME); + nixl_exit_on_failure(status, "Failed to send local partial MD", AGENT1_NAME); status = A2.sendLocalPartialMD(empty_dlist2, &conn_params2); - CHECK_NIXL_ERROR_AGENT(status, "Failed to send local partial MD", AGENT2_NAME); + nixl_exit_on_failure(status, "Failed to send local partial MD", AGENT2_NAME); // Send once partial with different label conn_params1.metadataLabel = PARTIAL_LABEL_2; status = A1.sendLocalPartialMD(empty_dlist1, &conn_params1); - CHECK_NIXL_ERROR_AGENT(status, "Failed to send local partial MD", AGENT1_NAME); + nixl_exit_on_failure(status, "Failed to send local partial MD", AGENT1_NAME); conn_params2.metadataLabel = PARTIAL_LABEL_2; status = A2.sendLocalPartialMD(empty_dlist2, &conn_params2); - CHECK_NIXL_ERROR_AGENT(status, "Failed to send local partial MD", AGENT2_NAME); + nixl_exit_on_failure(status, "Failed to send local partial MD", AGENT2_NAME); nixl_opt_args_t fetch_params; fetch_params.metadataLabel = PARTIAL_LABEL_1; status = A1.fetchRemoteMD(AGENT2_NAME, &fetch_params); - CHECK_NIXL_ERROR_AGENT(status, "Failed to fetch remote MD", AGENT1_NAME); + nixl_exit_on_failure(status, "Failed to fetch remote MD", AGENT1_NAME); status = A2.fetchRemoteMD(AGENT1_NAME, &fetch_params); - CHECK_NIXL_ERROR_AGENT(status, "Failed to fetch remote MD", AGENT2_NAME); + nixl_exit_on_failure(status, "Failed to fetch remote MD", AGENT2_NAME); std::this_thread::sleep_for(std::chrono::seconds(1)); @@ -306,7 +306,7 @@ int main() { // Invalidate AGENT1_NAME's metadata status = A1.invalidateLocalMD(); - CHECK_NIXL_ERROR_AGENT(status, "Failed to invalidate local MD", AGENT1_NAME); + nixl_exit_on_failure(status, "Failed to invalidate local MD", AGENT1_NAME); std::this_thread::sleep_for(std::chrono::seconds(1)); @@ -319,14 +319,14 
@@ int main() { // Try invalidating again, this should log a debug message std::cout << "Trying to invalidate again...\n"; status = A1.invalidateLocalMD(); - CHECK_NIXL_ERROR_AGENT(status, "Failed to invalidate local MD", AGENT1_NAME); + nixl_exit_on_failure(status, "Failed to invalidate local MD", AGENT1_NAME); std::this_thread::sleep_for(std::chrono::seconds(1)); // 5. Fetch metadata with invalid label. This should not block forever and print error message. std::cout << "\n5. Fetching metadata with invalid label...\n"; status = A2.fetchRemoteMD("INVALID_AGENT", &fetch_params); - CHECK_NIXL_ERROR_AGENT(status, "Failed to fetch remote MD", AGENT2_NAME); + nixl_exit_on_failure(status, "Failed to fetch remote MD", AGENT2_NAME); std::this_thread::sleep_for(std::chrono::seconds(1)); diff --git a/examples/cpp/nixl_example.cpp b/examples/cpp/nixl_example.cpp index b42972810..03e8b8833 100644 --- a/examples/cpp/nixl_example.cpp +++ b/examples/cpp/nixl_example.cpp @@ -21,7 +21,7 @@ #include #include "nixl.h" -#include "common/util.h" +#include "test_utils.h" std::string agent1("Agent001"); @@ -31,7 +31,7 @@ void check_buf(void* buf, size_t len) { // Do some checks on the data. 
for(size_t i = 0; i plugins; ret1 = A1.getAvailPlugins(plugins); - CHECK_NIXL_ERROR_AGENT(ret1, "Failed to get available plugins", agent1); + nixl_exit_on_failure(ret1, "Failed to get available plugins", agent1); std::cout << "Available plugins:\n"; @@ -102,8 +102,8 @@ main(int argc, char **argv) { ret1 = A1.getPluginParams(backend, mems1, init1); ret2 = A2.getPluginParams(backend, mems2, init2); - CHECK_NIXL_ERROR_AGENT(ret1, "Failed to get plugin params", agent1); - CHECK_NIXL_ERROR_AGENT(ret2, "Failed to get plugin params", agent2); + nixl_exit_on_failure(ret1, "Failed to get plugin params", agent1); + nixl_exit_on_failure(ret2, "Failed to get plugin params", agent2); std::cout << "Params before init:\n"; printParams(init1, mems1); @@ -117,14 +117,14 @@ main(int argc, char **argv) { extra_params1.backends.push_back(bknd1); extra_params2.backends.push_back(bknd2); - CHECK_NIXL_ERROR_AGENT(ret1, "Failed to create backend", agent1); - CHECK_NIXL_ERROR_AGENT(ret2, "Failed to create backend", agent2); + nixl_exit_on_failure(ret1, "Failed to create " + backend + " backend", agent1); + nixl_exit_on_failure(ret2, "Failed to create " + backend + " backend", agent2); ret1 = A1.getBackendParams(bknd1, mems1, init1); ret2 = A2.getBackendParams(bknd2, mems2, init2); - CHECK_NIXL_ERROR_AGENT(ret1, "Failed to get backend params", agent1); - CHECK_NIXL_ERROR_AGENT(ret2, "Failed to get backend params", agent2); + nixl_exit_on_failure(ret1, "Failed to get " + backend + " backend params", agent1); + nixl_exit_on_failure(ret2, "Failed to get " + backend + " backend params", agent2); std::cout << "Params after init:\n"; printParams(init1, mems1); @@ -163,22 +163,22 @@ main(int argc, char **argv) { ret1 = A1.registerMem(dlist1, &extra_params1); ret2 = A2.registerMem(dlist2, &extra_params2); - CHECK_NIXL_ERROR_AGENT(ret1, "Failed to register memory", agent1); - CHECK_NIXL_ERROR_AGENT(ret2, "Failed to register memory", agent2); + nixl_exit_on_failure(ret1, "Failed to register memory", 
agent1); + nixl_exit_on_failure(ret2, "Failed to register memory", agent2); std::string meta1; ret1 = A1.getLocalMD(meta1); std::string meta2; ret2 = A2.getLocalMD(meta2); - CHECK_NIXL_ERROR_AGENT(ret1, "Failed to get local MD", agent1); - CHECK_NIXL_ERROR_AGENT(ret2, "Failed to get local MD", agent2); + nixl_exit_on_failure(ret1, "Failed to get local MD", agent1); + nixl_exit_on_failure(ret2, "Failed to get local MD", agent2); std::cout << "Agent1's Metadata: " << meta1 << "\n"; std::cout << "Agent2's Metadata: " << meta2 << "\n"; ret1 = A1.loadRemoteMD (meta2, ret_s1); - CHECK_NIXL_ERROR_AGENT(ret1, "Failed to load remote MD", agent1); + nixl_exit_on_failure(ret1, "Failed to load remote MD", agent1); size_t req_size = 8; size_t dst_offset = 8; @@ -203,10 +203,10 @@ main(int argc, char **argv) { extra_params1.notifMsg = "notification"; extra_params1.hasNotif = true; ret1 = A1.createXferReq(NIXL_WRITE, req_src_descs, req_dst_descs, agent2, req_handle, &extra_params1); - CHECK_NIXL_ERROR_AGENT(ret1, "Failed to create Xfer Req", agent1); + nixl_exit_on_failure(ret1, "Failed to create Xfer Req", agent1); nixl_status_t status = A1.postXferReq(req_handle); - CHECK_NIXL_ERROR_AGENT((status > NIXL_IN_PROG), "Failed to post Xfer Req", agent1); + nixl_exit_on_failure((status >= NIXL_SUCCESS), "Failed to post Xfer Req", agent1); std::cout << "Transfer was posted\n"; @@ -216,15 +216,15 @@ main(int argc, char **argv) { while (status != NIXL_SUCCESS || n_notifs == 0) { if (status != NIXL_SUCCESS) status = A1.getXferStatus(req_handle); if (n_notifs == 0) ret2 = A2.getNotifs(notif_map); - CHECK_NIXL_ERROR_AGENT((status > NIXL_IN_PROG), "Failed to post Xfer Req", agent1); - CHECK_NIXL_ERROR_AGENT(ret2, "Failed to get notifs", agent2); + nixl_exit_on_failure((status >= NIXL_SUCCESS), "Failed to get Xfer status", agent1); + nixl_exit_on_failure(ret2, "Failed to get notifs", agent2); n_notifs = notif_map.size(); } std::vector agent1_notifs = notif_map[agent1]; - 
CHECK_NIXL_ERROR_AGENT((agent1_notifs.size() != 1), "Incorrect notif size", agent1); - CHECK_NIXL_ERROR_AGENT( - (agent1_notifs.front() != "notification"), "Incorrect notification", agent1); + nixl_exit_on_failure((agent1_notifs.size() == 1), "Incorrect notif size", agent1); + nixl_exit_on_failure( + (agent1_notifs.front() == "notification"), "Incorrect notification", agent1); notif_map[agent1].clear(); // Redundant, for testing notif_map.clear(); @@ -233,16 +233,16 @@ main(int argc, char **argv) { std::cout << "Transfer verified\n"; ret1 = A1.releaseXferReq(req_handle); - CHECK_NIXL_ERROR_AGENT(ret1, "Failed to release Xfer Req", agent1); + nixl_exit_on_failure(ret1, "Failed to release Xfer Req", agent1); ret1 = A1.deregisterMem(dlist1, &extra_params1); ret2 = A2.deregisterMem(dlist2, &extra_params2); - CHECK_NIXL_ERROR_AGENT(ret1, "F ailed to deregister memory", agent1); - CHECK_NIXL_ERROR_AGENT(ret2, "Failed to deregister memory", agent2); + nixl_exit_on_failure(ret1, "Failed to deregister memory", agent1); + nixl_exit_on_failure(ret2, "Failed to deregister memory", agent2); //only initiator should call invalidate ret1 = A1.invalidateRemoteMD(agent2); - CHECK_NIXL_ERROR_AGENT(ret1, "Failed to invalidate remote MD", agent1); + nixl_exit_on_failure(ret1, "Failed to invalidate remote MD", agent1); free(addr1); free(addr2); diff --git a/src/infra/meson.build b/src/infra/meson.build index 6a2b4352c..ec33e6959 100644 --- a/src/infra/meson.build +++ b/src/infra/meson.build @@ -21,3 +21,6 @@ nixl_build_lib = library('nixl_build', install: true) nixl_infra = declare_dependency(link_with: nixl_build_lib) + +# Test utilities library that can depend on nixl_dep (created after nixl_dep is defined) +# This will be defined in a separate meson file to avoid circular dependencies diff --git a/src/infra/test_utils.cpp b/src/infra/test_utils.cpp new file mode 100644 index 000000000..f0b014f83 --- /dev/null +++ b/src/infra/test_utils.cpp @@ -0,0 +1,37 @@ +/* + * 
SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "test_utils.h" +#include "nixl_types.h" +#include "common/nixl_log.h" +#include + +void +nixl_exit_on_failure(nixl_status_t status, std::string_view message, std::string_view agent) { + if (status == NIXL_SUCCESS) return; + + NIXL_ERROR << message << (agent.empty() ? "" : " for agent " + std::string{agent}) << ": " + << nixlEnumStrings::statusStr(status) << " [" << status << "]"; + exit(EXIT_FAILURE); +} + +void +nixl_exit_on_failure(bool condition, std::string_view message, std::string_view agent) { + if (condition) return; + + NIXL_ERROR << message << (agent.empty() ? "" : " for agent " + std::string{agent}); + exit(EXIT_FAILURE); +} diff --git a/src/infra/test_utils.h b/src/infra/test_utils.h new file mode 100644 index 000000000..92e77175c --- /dev/null +++ b/src/infra/test_utils.h @@ -0,0 +1,49 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef NIXL_TEST_UTILS_H +#define NIXL_TEST_UTILS_H + +#include +#include "nixl_types.h" + +/** + * @brief Exit on failure utility functions for tests and examples + * + * These functions provide a convenient way to check conditions and exit + * with appropriate error messages if they fail. They are designed for + * use in tests and examples where immediate termination on error is desired. + */ + +/** + * @brief Exit if nixl_status_t indicates failure + * @param status The nixl status to check + * @param message Error message to display + * @param agent Optional agent name for context + */ +void +nixl_exit_on_failure(nixl_status_t status, std::string_view message, std::string_view agent = {}); + +/** + * @brief Exit if boolean condition is false + * @param condition The condition to check (exits if false) + * @param message Error message to display + * @param agent Optional agent name for context + */ +void +nixl_exit_on_failure(bool condition, std::string_view message, std::string_view agent = {}); + +#endif /* NIXL_TEST_UTILS_H */ diff --git a/src/meson.build b/src/meson.build index 50a6fddc8..9cc396543 100644 --- a/src/meson.build +++ b/src/meson.build @@ -21,4 +21,18 @@ subdir('utils') subdir('infra') subdir('plugins') subdir('core') + +# Test utilities library - created after nixl_dep is available to avoid circular dependencies +nixl_test_utils_lib = library('nixl_test_utils', + 'infra/test_utils.cpp', + include_directories: [ nixl_inc_dirs, utils_inc_dirs ], + dependencies: [nixl_dep, nixl_common_dep, absl_log_dep], + install: true) + 
+nixl_test_utils_dep = declare_dependency( + include_directories: include_directories('infra'), + link_with: nixl_test_utils_lib, + dependencies: [nixl_dep, nixl_common_dep] +) + subdir('bindings') diff --git a/src/utils/common/util.h b/src/utils/common/util.h index b14f759dc..bffa2efa1 100644 --- a/src/utils/common/util.h +++ b/src/utils/common/util.h @@ -21,21 +21,4 @@ #define CONCAT_0(a, b) a ## b #define UNIQUE_NAME(name) CONCAT(name, __COUNTER__) -#define CHECK_NIXL_ERROR_AGENT(result, message, agent) \ - do { \ - if (0 != result) { \ - std::cerr << "NIXL: " << message << " for agent " << agent \ - << " (Error code: " << result << ")" << std::endl; \ - exit(EXIT_FAILURE); \ - } \ - } while (0) - -#define CHECK_NIXL_ERROR(result, message) \ - do { \ - if (0 != result) { \ - std::cerr << "NIXL: " << message << " (Error code: " << result << ")" << std::endl; \ - exit(EXIT_FAILURE); \ - } \ - } while (0) - #endif /* UTIL_H */ diff --git a/test/nixl/agent_example.cpp b/test/nixl/agent_example.cpp index 319be6990..a64e715eb 100644 --- a/test/nixl/agent_example.cpp +++ b/test/nixl/agent_example.cpp @@ -20,7 +20,7 @@ #include #include "nixl.h" -#include "common/util.h" +#include "test_utils.h" std::string agent1("Agent001"); std::string agent2("Agent002"); @@ -29,7 +29,7 @@ void check_buf(void* buf, size_t len) { // Do some checks on the data. 
for(size_t i = 0; iregisterMem(mem_list1, &extra_params1); - CHECK_NIXL_ERROR_AGENT(status, "Failed to register memory", agent1); + nixl_exit_on_failure(status, "Failed to register memory", agent1); status = A2->registerMem(mem_list2, &extra_params2); - CHECK_NIXL_ERROR_AGENT(status, "Failed to register memory", agent2); + nixl_exit_on_failure(status, "Failed to register memory", agent2); std::string meta2; status = A2->getLocalMD(meta2); - CHECK_NIXL_ERROR_AGENT(status, "Failed to get local MD", agent2); - CHECK_NIXL_ERROR_AGENT((meta2.size() <= 0), "Incorrect local MD", agent2); + nixl_exit_on_failure(status, "Failed to get local MD", agent2); + nixl_exit_on_failure((meta2.size() > 0), "Incorrect local MD", agent2); std::string remote_name; status = A1->loadRemoteMD(meta2, remote_name); - CHECK_NIXL_ERROR_AGENT(status, "Failed to local remote MD", agent1); - CHECK_NIXL_ERROR_AGENT((remote_name != agent2), "Incorrect remote MD received", agent1); + nixl_exit_on_failure(status, "Failed to load remote MD", agent1); + nixl_exit_on_failure((remote_name == agent2), "Incorrect remote MD received", agent1); std::cout << "perf setup done\n"; @@ -116,9 +116,9 @@ void test_side_perf(nixlAgent* A1, nixlAgent* A2, nixlBackendH* backend, nixlBac for(int i = 0; iprepXferDlist(agent2, dst_list, dst_side[i], &extra_params1); - CHECK_NIXL_ERROR_AGENT(status, "Failed to prep Xfer Dlist for dest", agent1); + nixl_exit_on_failure(status, "Failed to prep Xfer Dlist for dest", agent1); status = A1->prepXferDlist(NIXL_INIT_AGENT, src_list, src_side[i], &extra_params1); - CHECK_NIXL_ERROR_AGENT(status, "Failed to pre Xfer Dlist for src", agent1); + nixl_exit_on_failure(status, "Failed to prep Xfer Dlist for src", agent1); } gettimeofday(&end_time, NULL); @@ -143,7 +143,7 @@ void test_side_perf(nixlAgent* A1, nixlAgent* A2, nixlBackendH* backend, nixlBac extra_params1.notifMsg = "test"; extra_params1.hasNotif = true; status = A1->makeXferReq(NIXL_WRITE, src_side[0], indices, dst_side[0], 
indices, reqh1, &extra_params1); - CHECK_NIXL_ERROR_AGENT(status, "Failed to make Xfer Req", agent1); + nixl_exit_on_failure(status, "Failed to make Xfer Req", agent1); indices.clear(); for(int i = 0; i<(n_mems*descs_per_mem); i+=2) @@ -151,24 +151,24 @@ void test_side_perf(nixlAgent* A1, nixlAgent* A2, nixlBackendH* backend, nixlBac //should print (n_mems*descs_per_mem/2) number of final descriptors status = A1->makeXferReq(NIXL_WRITE, src_side[0], indices, dst_side[0], indices, reqh2, &extra_params1); - CHECK_NIXL_ERROR_AGENT(status, "Failed to make Xfer Req", agent1); + nixl_exit_on_failure(status, "Failed to make Xfer Req", agent1); status = A1->releaseXferReq(reqh1); - CHECK_NIXL_ERROR_AGENT(status, "Failed to release Xfer Req", agent1); + nixl_exit_on_failure(status, "Failed to release Xfer Req", agent1); status = A1->releaseXferReq(reqh2); - CHECK_NIXL_ERROR_AGENT(status, "Failed to release Xfer Req2", agent1); + nixl_exit_on_failure(status, "Failed to release Xfer Req2", agent1); // Commented out to test auto deregistration // status = A1->deregisterMem(mem_list1, &extra_params1); // assert (status == NIXL_SUCCESS); status = A2->deregisterMem(mem_list2, &extra_params2); - CHECK_NIXL_ERROR_AGENT(status, "Failed to deregister memory", agent2); + nixl_exit_on_failure(status, "Failed to deregister memory", agent2); for(int i = 0; ireleasedDlistH(src_side[i]); - CHECK_NIXL_ERROR_AGENT(status, "Failed to release src Dlist handle", agent1); + nixl_exit_on_failure(status, "Failed to release src Dlist handle", agent1); status = A1->releasedDlistH(dst_side[i]); - CHECK_NIXL_ERROR_AGENT(status, "Failed to release dst Dlist handle", agent1); + nixl_exit_on_failure(status, "Failed to release dst Dlist handle", agent1); } free(src_buf); @@ -215,9 +215,9 @@ nixl_status_t partialMdTest(nixlAgent* A1, nixlAgent* A2, nixlBackendH* backend1 // Register memory for each update for (int update = 0; update < NUM_UPDATES; update++) { status = A1->registerMem(src_mem_lists[update], 
&extra_params1); - CHECK_NIXL_ERROR_AGENT(status, "Failed to register memory", agent1); + nixl_exit_on_failure(status, "Failed to register memory", agent1); status = A2->registerMem(dst_mem_lists[update], &extra_params2); - CHECK_NIXL_ERROR_AGENT(status, "Failed to register memory", agent2); + nixl_exit_on_failure(status, "Failed to register memory", agent2); } // Test metadata update with only backends and empty descriptor list @@ -226,31 +226,31 @@ nixl_status_t partialMdTest(nixlAgent* A1, nixlAgent* A2, nixlBackendH* backend1 // Agent2 might have already been previously loaded. // Invalidate it just in case but don't care either way. status = A1->invalidateRemoteMD(agent2); - CHECK_NIXL_ERROR_AGENT(status, "Failed to invalidate remote MD", agent1); + nixl_exit_on_failure(status, "Failed to invalidate remote MD", agent1); nixl_reg_dlist_t empty_dlist(DRAM_SEG); std::string partial_meta; status = A2->getLocalPartialMD(empty_dlist, partial_meta, NULL); - CHECK_NIXL_ERROR_AGENT(status, "Failed to get local partial MD", agent2); - CHECK_NIXL_ERROR_AGENT((partial_meta.size() <= 0), "Incorrect local partial MD", agent2); + nixl_exit_on_failure(status, "Failed to get local partial MD", agent2); + nixl_exit_on_failure((partial_meta.size() > 0), "Incorrect local partial MD", agent2); std::string remote_name; status = A1->loadRemoteMD(partial_meta, remote_name); - CHECK_NIXL_ERROR_AGENT(status, "Failed to get load remote MD", agent1); - CHECK_NIXL_ERROR_AGENT((remote_name != agent2), "Incorrect remote MD", agent1); + nixl_exit_on_failure(status, "Failed to get load remote MD", agent1); + nixl_exit_on_failure((remote_name == agent2), "Incorrect remote MD", agent1); // Make sure unregistered descriptors are not updated for (int update = 0; update < NUM_UPDATES; update++) { nixlDlistH *dst_side; status = A1->prepXferDlist(agent2, dst_mem_lists[update].trim(), dst_side, &extra_params1); - CHECK_NIXL_ERROR_AGENT( - (status == NIXL_SUCCESS), "Prep xfer dlist should not be 
successful", agent1); - CHECK_NIXL_ERROR_AGENT((dst_side != nullptr), "Dst side is not null", agent1); + nixl_exit_on_failure( + (status != NIXL_SUCCESS), "Prep xfer dlist should not be successful", agent1); + nixl_exit_on_failure((dst_side == nullptr), "Dst side is not null", agent1); } // Invalidate remote agent metadata to make sure we received connection info status = A1->invalidateRemoteMD(agent2); - CHECK_NIXL_ERROR_AGENT(status, "Failed to get invalidate remote MD", agent1); + nixl_exit_on_failure(status, "Failed to get invalidate remote MD", agent1); std::cout << "Metadata update - backends only completed\n"; // Main test loop - update metadata multiple times @@ -263,26 +263,26 @@ nixl_status_t partialMdTest(nixlAgent* A1, nixlAgent* A2, nixlBackendH* backend1 std::cout << "Metadata update #" << update << "\n"; // Get partial metadata from A2 status = A2->getLocalPartialMD(dst_mem_lists[update], partial_meta, &extra_params2); - CHECK_NIXL_ERROR_AGENT(status, "Failed to get local partial MD", agent2); - CHECK_NIXL_ERROR_AGENT((partial_meta.size() <= 0), "Incorrect local partial MD", agent2); + nixl_exit_on_failure(status, "Failed to get local partial MD", agent2); + nixl_exit_on_failure((partial_meta.size() > 0), "Incorrect local partial MD", agent2); // Load the partial metadata into A1 std::string remote_name; status = A1->loadRemoteMD(partial_meta, remote_name); - CHECK_NIXL_ERROR_AGENT(status, "Failed to load remote MD", agent1); - CHECK_NIXL_ERROR_AGENT((remote_name != agent2), "Incorrect remote MD", agent1); + nixl_exit_on_failure(status, "Failed to load remote MD", agent1); + nixl_exit_on_failure((remote_name == agent2), "Incorrect remote MD", agent1); // Make sure loaded descriptors are updated nixlDlistH *dst_side; status = A1->prepXferDlist(agent2, dst_mem_lists[update].trim(), dst_side, &extra_params1); - CHECK_NIXL_ERROR_AGENT(status, "Failed to prep xfer dlist", agent1); - CHECK_NIXL_ERROR_AGENT((dst_side == nullptr), "Dst side is null", 
agent1); + nixl_exit_on_failure(status, "Failed to prep xfer dlist", agent1); + nixl_exit_on_failure((dst_side != nullptr), "Dst side is null", agent1); // Make sure not-loaded descriptors are not updated for (int invalid_idx = update + 1; invalid_idx < NUM_UPDATES; invalid_idx++) { status = A1->prepXferDlist(agent2, dst_mem_lists[invalid_idx].trim(), dst_side, &extra_params1); - CHECK_NIXL_ERROR_AGENT( - (status == NIXL_SUCCESS), "Prep xfer dlist should not be successful", agent1); - CHECK_NIXL_ERROR_AGENT((dst_side != nullptr), "Dst side is not null", agent1); + nixl_exit_on_failure( + (status != NIXL_SUCCESS), "Prep xfer dlist should not be successful", agent1); + nixl_exit_on_failure((dst_side == nullptr), "Dst side is not null", agent1); } std::cout << "Metadata update #" << update << " completed\n"; } @@ -304,11 +304,10 @@ nixl_status_t partialMdTest(nixlAgent* A1, nixlAgent* A2, nixlBackendH* backend1 nixlDlistH *src_side, *dst_side; status = A1->prepXferDlist(NIXL_INIT_AGENT, src_xfer_list, src_side, &extra_params1); - CHECK_NIXL_ERROR_AGENT(status, "Failed to prep xfer dlist", agent1); - CHECK_NIXL_ERROR_AGENT((src_side == nullptr), "Src side is null", agent1); + nixl_exit_on_failure(status, "Failed to prep xfer dlist", agent1); + status = A1->prepXferDlist(agent2, dst_xfer_list, dst_side, &extra_params1); - CHECK_NIXL_ERROR_AGENT(status, "Failed to prep xfer dlist", agent1); - CHECK_NIXL_ERROR_AGENT((dst_side == nullptr), "Dst side is null", agent1); + nixl_exit_on_failure(status, "Failed to prep xfer dlist", agent1); std::cout << "Transfer preparation completed\n"; @@ -324,13 +323,13 @@ nixl_status_t partialMdTest(nixlAgent* A1, nixlAgent* A2, nixlBackendH* backend1 // Create and post the transfer request status = A1->makeXferReq(NIXL_WRITE, src_side, indices, dst_side, indices, req, &extra_params1); - CHECK_NIXL_ERROR_AGENT(status, "Failed to make xfer req", agent1); + nixl_exit_on_failure(status, "Failed to make xfer req", agent1); nixl_status_t 
xfer_status = A1->postXferReq(req); // Wait for transfer completion while (xfer_status != NIXL_SUCCESS) { if (xfer_status != NIXL_SUCCESS) xfer_status = A1->getXferStatus(req); - CHECK_NIXL_ERROR_AGENT((xfer_status > NIXL_IN_PROG), "Failed to get xfer status", agent1); + nixl_exit_on_failure((xfer_status >= 0), "Failed to get xfer status", agent1); } // Verify transfer results @@ -344,17 +343,17 @@ nixl_status_t partialMdTest(nixlAgent* A1, nixlAgent* A2, nixlBackendH* backend1 // Cleanup status = A1->releaseXferReq(req); - CHECK_NIXL_ERROR_AGENT(status, "Failed to release xfer req", agent1); + nixl_exit_on_failure(status, "Failed to release xfer req", agent1); status = A1->releasedDlistH(src_side); - CHECK_NIXL_ERROR_AGENT(status, "Failed to release xfer dlist", agent1); + nixl_exit_on_failure(status, "Failed to release xfer dlist", agent1); status = A1->releasedDlistH(dst_side); - CHECK_NIXL_ERROR_AGENT(status, "Failed to release xfer dlist", agent1); + nixl_exit_on_failure(status, "Failed to release xfer dlist", agent1); // Deregister memory for (int update = 0; update < NUM_UPDATES; update++) { status = A1->deregisterMem(src_mem_lists[update], &extra_params1); - CHECK_NIXL_ERROR_AGENT(status, "Failed to deregister memory", agent1); + nixl_exit_on_failure(status, "Failed to deregister memory", agent1); status = A2->deregisterMem(dst_mem_lists[update], &extra_params2); - CHECK_NIXL_ERROR_AGENT(status, "Failed to deregister memory", agent2); + nixl_exit_on_failure(status, "Failed to deregister memory", agent2); } // Free allocated memory @@ -379,8 +378,8 @@ nixl_status_t sideXferTest(nixlAgent* A1, nixlAgent* A2, nixlXferReqH* src_handl extra_params1.backends.push_back(src_backend); extra_params2.backends.push_back(dst_backend); - CHECK_NIXL_ERROR_AGENT(status, "Failed to query xfer backend", agent1); - CHECK_NIXL_ERROR_AGENT((src_backend == nullptr), "Incorrect src backend handle", agent1); + nixl_exit_on_failure(status, "Failed to query xfer backend", agent1); + 
nixl_exit_on_failure((src_backend != nullptr), "Incorrect src backend handle", agent1); std::cout << "Got backend\n"; @@ -415,27 +414,27 @@ nixl_status_t sideXferTest(nixlAgent* A1, nixlAgent* A2, nixlXferReqH* src_handl dst_list = mem_list2.trim(); status = A1->registerMem(mem_list1, &extra_params1); - CHECK_NIXL_ERROR_AGENT(status, "Failed to register memory", agent1); + nixl_exit_on_failure(status, "Failed to register memory", agent1); status = A2->registerMem(mem_list2, &extra_params2); - CHECK_NIXL_ERROR_AGENT(status, "Failed to register memory", agent2); + nixl_exit_on_failure(status, "Failed to register memory", agent2); std::string meta2; status = A2->getLocalMD(meta2); - CHECK_NIXL_ERROR_AGENT(status, "Failed to get local MD", agent2); - CHECK_NIXL_ERROR_AGENT((meta2.size() <= 0), "Incorrect local MD", agent2); + nixl_exit_on_failure(status, "Failed to get local MD", agent2); + nixl_exit_on_failure((meta2.size() > 0), "Incorrect local MD", agent2); std::string remote_name; status = A1->loadRemoteMD(meta2, remote_name); - CHECK_NIXL_ERROR_AGENT(status, "Failed to load remote MD", agent1); - CHECK_NIXL_ERROR_AGENT((remote_name != agent2), "Incorrect remote MD", agent1); + nixl_exit_on_failure(status, "Failed to load remote MD", agent1); + nixl_exit_on_failure((remote_name == agent2), "Incorrect remote MD", agent1); std::cout << "Ready to prepare side\n"; nixlDlistH *src_side, *dst_side; status = A1->prepXferDlist(NIXL_INIT_AGENT, src_list, src_side, &extra_params1); - CHECK_NIXL_ERROR_AGENT(status, "Failed to prep xfer dlist", agent1); - CHECK_NIXL_ERROR_AGENT((src_side == nullptr), "Src side is null", agent1); + nixl_exit_on_failure(status, "Failed to prep xfer dlist", agent1); + status = A1->prepXferDlist(remote_name, dst_list, dst_side, &extra_params1); - CHECK_NIXL_ERROR_AGENT(status, "Failed to prep xfer dlist", agent1); - CHECK_NIXL_ERROR_AGENT((dst_side == nullptr), "Dst side is null", agent1); + nixl_exit_on_failure(status, "Failed to prep xfer 
dlist", agent1); + std::cout << "prep done, starting transfers\n"; std::vector indices1, indices2; @@ -452,12 +451,14 @@ nixl_status_t sideXferTest(nixlAgent* A1, nixlAgent* A2, nixlXferReqH* src_handl //write first half of src_bufs to dst_bufs status = A1->makeXferReq(NIXL_WRITE, src_side, indices1, dst_side, indices1, req1, &extra_params1); - CHECK_NIXL_ERROR_AGENT(status, "Failed to make xfer req", agent1); + nixl_exit_on_failure(status, "Failed to make xfer req", agent1); nixl_status_t xfer_status = A1->postXferReq(req1); while (xfer_status != NIXL_SUCCESS) { if (xfer_status != NIXL_SUCCESS) xfer_status = A1->getXferStatus(req1); - CHECK_NIXL_ERROR_AGENT((xfer_status > NIXL_IN_PROG), "Failed to get xfer status", agent1); + nixl_exit_on_failure((xfer_status == NIXL_SUCCESS || xfer_status == NIXL_IN_PROG), + "Failed to get xfer status", + agent1); } for(int i = 0; i<(n_bufs/2); i++) @@ -467,12 +468,14 @@ nixl_status_t sideXferTest(nixlAgent* A1, nixlAgent* A2, nixlXferReqH* src_handl //read first half of dst_bufs back to second half of src_bufs status = A1->makeXferReq(NIXL_READ, src_side, indices2, dst_side, indices1, req2, &extra_params1); - CHECK_NIXL_ERROR_AGENT(status, "Failed to make xfer req", agent1); + nixl_exit_on_failure(status, "Failed to make xfer req", agent1); xfer_status = A1->postXferReq(req2); while (xfer_status != NIXL_SUCCESS) { if (xfer_status != NIXL_SUCCESS) xfer_status = A1->getXferStatus(req2); - CHECK_NIXL_ERROR_AGENT((xfer_status > NIXL_IN_PROG), "Failed to get xfer status", agent1); + nixl_exit_on_failure((xfer_status == NIXL_SUCCESS || xfer_status == NIXL_IN_PROG), + "Failed to get xfer status", + agent1); } for(int i = (n_bufs/2); imakeXferReq(NIXL_WRITE, src_side, indices2, dst_side, indices2, req3, &extra_params1); - CHECK_NIXL_ERROR_AGENT(status, "Failed to make xfer req", agent1); + nixl_exit_on_failure(status, "Failed to make xfer req", agent1); xfer_status = A1->postXferReq(req3); while (xfer_status != NIXL_SUCCESS) { if 
(xfer_status != NIXL_SUCCESS) xfer_status = A1->getXferStatus(req3); - CHECK_NIXL_ERROR_AGENT((xfer_status > NIXL_IN_PROG), "Failed to get xfer status", agent1); + nixl_exit_on_failure((xfer_status == NIXL_SUCCESS || xfer_status == NIXL_IN_PROG), + "Failed to get xfer status", + agent1); } for(int i = (n_bufs/2); ireleaseXferReq(req1); - CHECK_NIXL_ERROR_AGENT(status, "Failed to release xfer req", agent1); + nixl_exit_on_failure(status, "Failed to release xfer req", agent1); status = A1->releaseXferReq(req2); - CHECK_NIXL_ERROR_AGENT(status, "Failed to release xfer req2", agent1); + nixl_exit_on_failure(status, "Failed to release xfer req2", agent1); status = A1->releaseXferReq(req3); - CHECK_NIXL_ERROR_AGENT(status, "Failed to release xfer req3", agent1); + nixl_exit_on_failure(status, "Failed to release xfer req3", agent1); // Commented out to test auto deregistration // status = A1->deregisterMem(mem_list1, &extra_params1); @@ -509,9 +514,9 @@ nixl_status_t sideXferTest(nixlAgent* A1, nixlAgent* A2, nixlXferReqH* src_handl // assert (status == NIXL_SUCCESS); status = A1->releasedDlistH(src_side); - CHECK_NIXL_ERROR_AGENT(status, "Failed to release xfer src dlist", agent1); + nixl_exit_on_failure(status, "Failed to release xfer src dlist", agent1); status = A1->releasedDlistH(dst_side); - CHECK_NIXL_ERROR_AGENT(status, "Failed to release xfer dst dlist", agent1); + nixl_exit_on_failure(status, "Failed to release xfer dst dlist", agent1); for(int i = 0; i plugins; ret1 = A1.getAvailPlugins(plugins); - CHECK_NIXL_ERROR_AGENT(ret1, "Failed to get available plugins", agent1); + nixl_exit_on_failure(ret1, "Failed to get available plugins", agent1); std::cout << "Available plugins:\n"; @@ -577,8 +582,8 @@ main(int argc, char **argv) { ret1 = A1.getPluginParams("UCX", mems1, init1); ret2 = A2.getPluginParams("UCX", mems2, init2); - CHECK_NIXL_ERROR_AGENT(ret1, "Failed to get plugin params for UCX", agent1); - CHECK_NIXL_ERROR_AGENT(ret2, "Failed to get plugin params for 
UCX", agent2); + nixl_exit_on_failure(ret1, "Failed to get plugin params for UCX", agent1); + nixl_exit_on_failure(ret2, "Failed to get plugin params for UCX", agent2); std::cout << "Params before init:\n"; printParams(init1, mems1); @@ -592,13 +597,13 @@ main(int argc, char **argv) { extra_params1.backends.push_back(bknd1); extra_params2.backends.push_back(bknd2); - CHECK_NIXL_ERROR_AGENT(ret1, "Failed to create UCX backend", agent1); - CHECK_NIXL_ERROR_AGENT(ret2, "Failed to create UCX backend", agent2); + nixl_exit_on_failure(ret1, "Failed to create " + backend + " backend", agent1); + nixl_exit_on_failure(ret2, "Failed to create " + backend + " backend", agent2); ret1 = A1.getBackendParams(bknd1, mems1, init1); ret2 = A2.getBackendParams(bknd2, mems2, init2); - CHECK_NIXL_ERROR_AGENT(ret1, "Failed to get UCX backend params", agent1); - CHECK_NIXL_ERROR_AGENT(ret2, "Failed to get UCX backend params", agent2); + nixl_exit_on_failure(ret1, "Failed to get " + backend + " backend params", agent1); + nixl_exit_on_failure(ret2, "Failed to get " + backend + " backend params", agent2); std::cout << "Params after init:\n"; printParams(init1, mems1); @@ -640,23 +645,23 @@ main(int argc, char **argv) { ret1 = A1.registerMem(dlist1, &extra_params1); ret2 = A2.registerMem(dlist2, &extra_params2); - CHECK_NIXL_ERROR_AGENT(ret1, "Failed to register memory", agent1); - CHECK_NIXL_ERROR_AGENT(ret2, "Failed to register memory", agent2); + nixl_exit_on_failure(ret1, "Failed to register memory", agent1); + nixl_exit_on_failure(ret2, "Failed to register memory", agent2); std::string meta1; ret1 = A1.getLocalMD(meta1); std::string meta2; ret2 = A2.getLocalMD(meta2); - CHECK_NIXL_ERROR_AGENT(ret1, "Failed to get local MD", agent1); - CHECK_NIXL_ERROR_AGENT(ret2, "Failed to get local MD", agent2); + nixl_exit_on_failure(ret1, "Failed to get local MD", agent1); + nixl_exit_on_failure(ret2, "Failed to get local MD", agent2); std::cout << "Agent1's Metadata: " << meta1 << "\n"; std::cout 
<< "Agent2's Metadata: " << meta2 << "\n"; ret1 = A1.loadRemoteMD (meta2, ret_s1); ret2 = A2.loadRemoteMD (meta1, ret_s2); - CHECK_NIXL_ERROR_AGENT(ret1, "Failed to load remote MD", agent1); - CHECK_NIXL_ERROR_AGENT(ret2, "Failed to load remote MD", agent2); + nixl_exit_on_failure(ret1, "Failed to load remote MD", agent1); + nixl_exit_on_failure(ret2, "Failed to load remote MD", agent2); size_t req_size = 8; size_t dst_offset = 8; @@ -688,7 +693,7 @@ main(int argc, char **argv) { extra_params1.notifMsg = "notification"; extra_params1.hasNotif = true; ret1 = A1.createXferReq(NIXL_WRITE, req_src_descs, req_dst_descs, agent2, req_handle, &extra_params1); - CHECK_NIXL_ERROR_AGENT(ret1, "Failed to create Xfer Req", agent1); + nixl_exit_on_failure(ret1, "Failed to create Xfer Req", agent1); nixl_status_t status = A1.postXferReq(req_handle); @@ -700,16 +705,17 @@ main(int argc, char **argv) { while (status != NIXL_SUCCESS || n_notifs == 0) { if (status != NIXL_SUCCESS) status = A1.getXferStatus(req_handle); if (n_notifs == 0) ret2 = A2.getNotifs(notif_map); - CHECK_NIXL_ERROR_AGENT((status < 0), "Failed to post Xfer Req", agent1); - CHECK_NIXL_ERROR_AGENT(ret2, "Failed to get notifs", agent2); + nixl_exit_on_failure( + (status == NIXL_SUCCESS || status == NIXL_IN_PROG), "Failed to post Xfer Req", agent1); + nixl_exit_on_failure(ret2, "Failed to get notifs", agent2); n_notifs = notif_map.size(); } std::vector agent1_notifs = notif_map[agent1]; - CHECK_NIXL_ERROR_AGENT((agent1_notifs.size() != 1), "Incorrect notif size", agent1); - CHECK_NIXL_ERROR_AGENT( - (agent1_notifs.front() != "notification"), "Incorrect notification", agent1); + nixl_exit_on_failure((agent1_notifs.size() == 1), "Incorrect notif size", agent1); + nixl_exit_on_failure( + (agent1_notifs.front() == "notification"), "Incorrect notification", agent1); notif_map[agent1].clear(); // Redundant, for testing notif_map.clear(); n_notifs = 0; @@ -718,17 +724,17 @@ main(int argc, char **argv) { std::cout << 
"performing partialMdTest with backends " << bknd1 << " " << bknd2 << "\n"; ret1 = partialMdTest(&A1, &A2, bknd1, bknd2); - CHECK_NIXL_ERROR_AGENT(ret1, "Fail to run partialMDTest", agent1); + nixl_exit_on_failure(ret1, "Fail to run partialMDTest", agent1); std::cout << "performing sideXferTest with backends " << bknd1 << " " << bknd2 << "\n"; ret1 = sideXferTest(&A1, &A2, req_handle, bknd2); - CHECK_NIXL_ERROR_AGENT(ret1, "Fail to run sideXferTest", agent1); + nixl_exit_on_failure(ret1, "Fail to run sideXferTest", agent1); std::cout << "Performing local test\n"; extra_params1.notifMsg = "local_notif"; extra_params1.hasNotif = true; ret2 = A1.createXferReq(NIXL_WRITE, req_src_descs, req_ldst_descs, agent1, req_handle2, &extra_params1); - CHECK_NIXL_ERROR_AGENT(ret1, "Failed to create Xfer Req", agent1); + nixl_exit_on_failure(ret1, "Failed to create Xfer Req", agent1); status = A1.postXferReq(req_handle2); std::cout << "Local transfer was posted\n"; @@ -736,34 +742,34 @@ main(int argc, char **argv) { while (status != NIXL_SUCCESS || n_notifs == 0) { if (status != NIXL_SUCCESS) status = A1.getXferStatus(req_handle2); if (n_notifs == 0) ret2 = A1.getNotifs(notif_map); - CHECK_NIXL_ERROR_AGENT((status < 0), "Failed to post Xfer Req", agent1); - CHECK_NIXL_ERROR_AGENT(ret2, "Failed to get notifs", agent2); + nixl_exit_on_failure( + (status == NIXL_SUCCESS || status == NIXL_IN_PROG), "Failed to post Xfer Req", agent1); + nixl_exit_on_failure(ret2, "Failed to get notifs", agent2); n_notifs = notif_map.size(); } agent1_notifs = notif_map[agent1]; - CHECK_NIXL_ERROR_AGENT((agent1_notifs.size() != 1), "Incorrect notif size", agent1); - CHECK_NIXL_ERROR_AGENT( - (agent1_notifs.front() != "local_notif"), "Incorrect notification", agent1); - CHECK_NIXL_ERROR_AGENT( - (!equal_buf((void *)req_src.addr, (void *)req_ldst.addr, req_size) == true), - "Buffer mismatch after transfer", - agent1); + nixl_exit_on_failure((agent1_notifs.size() == 1), "Incorrect notif size", agent1); + 
nixl_exit_on_failure( + (agent1_notifs.front() == "local_notif"), "Incorrect notification", agent1); + nixl_exit_on_failure((equal_buf((void *)req_src.addr, (void *)req_ldst.addr, req_size) == true), + "Buffer mismatch after transfer", + agent1); ret1 = A1.releaseXferReq(req_handle); ret2 = A1.releaseXferReq(req_handle2); - CHECK_NIXL_ERROR_AGENT(ret1, "Failed to release Xfer Req", agent1); - CHECK_NIXL_ERROR_AGENT(ret2, "Failed to release Xfer Req2", agent1); + nixl_exit_on_failure(ret1, "Failed to release Xfer Req", agent1); + nixl_exit_on_failure(ret2, "Failed to release Xfer Req2", agent1); ret1 = A1.deregisterMem(dlist1, &extra_params1); ret2 = A2.deregisterMem(dlist2, &extra_params2); - CHECK_NIXL_ERROR_AGENT(ret1, "Failed to deregister memory", agent1); - CHECK_NIXL_ERROR_AGENT(ret2, "Failed to deregister memory", agent2); + nixl_exit_on_failure(ret1, "Failed to deregister memory", agent1); + nixl_exit_on_failure(ret2, "Failed to deregister memory", agent2); //only initiator should call invalidate ret1 = A1.invalidateRemoteMD(agent2); //A2.invalidateRemoteMD(agent1); - CHECK_NIXL_ERROR_AGENT(ret1, "Failed to invalidate remote MD", agent1); + nixl_exit_on_failure(ret1, "Failed to invalidate remote MD", agent1); free(addr1); free(addr2); diff --git a/test/nixl/desc_example.cpp b/test/nixl/desc_example.cpp index 93adebbfa..d7fe89d69 100644 --- a/test/nixl/desc_example.cpp +++ b/test/nixl/desc_example.cpp @@ -20,7 +20,7 @@ #include "nixl.h" #include "serdes/serdes.h" #include "backend/backend_aux.h" -#include "common/util.h" +#include "test_utils.h" #include @@ -39,7 +39,7 @@ void testPerf(){ gettimeofday(&end_time, NULL); - CHECK_NIXL_ERROR((dlist.descCount() != 24 * 64 * 1024), "Incorrect number of descriptors"); + nixl_exit_on_failure((dlist.descCount() == 24 * 64 * 1024), "Incorrect number of descriptors"); timersub(&end_time, &start_time, &diff_time); std::cout << "add desc mode, total time for " << 24*64*1024 << " descs: " << diff_time.tv_sec << "s " << 
diff_time.tv_usec << "us \n"; @@ -58,7 +58,7 @@ void testPerf(){ gettimeofday(&end_time, NULL); - CHECK_NIXL_ERROR((dlist.descCount() != 24 * 64 * 1024), "Incorrect number of descriptors"); + nixl_exit_on_failure((dlist.descCount() == 24 * 64 * 1024), "Incorrect number of descriptors"); timersub(&end_time, &start_time, &diff_time); std::cout << "Operator [] mode, total time for " << 24*64*1024 << " descs: " << diff_time.tv_sec << "s " << diff_time.tv_usec << "us \n"; @@ -88,27 +88,27 @@ int main() nixlBasicDesc buff8 (1010,31,0); nixlBasicDesc importDesc(buff2.serialize()); - CHECK_NIXL_ERROR(!(buff2 == importDesc), "Descriptor mismatch for buff2 and importDesc"); - - CHECK_NIXL_ERROR(!(buff3 == buff2), "Descriptor mismatch for buff3 and buff2"); - CHECK_NIXL_ERROR(!(buff4 == buff1), "Descriptor mismatch for buff4 and buff1"); - CHECK_NIXL_ERROR(!(buff3 != buff1), "Descriptor mismatch for buff3 and buff1"); - CHECK_NIXL_ERROR(!(buff8 != buff7), "Descriptor mismatch for buff8 and buff7"); - - CHECK_NIXL_ERROR(!(buff2.covers(buff3)), "Descriptor mismatch for buff2 and buff3"); - CHECK_NIXL_ERROR(!(buff4.overlaps(buff1)), "Descriptor mismatch for buff4 and buff1"); - CHECK_NIXL_ERROR((buff1.covers(buff2)), "Descriptor mismatch for buff1 and buff2"); - CHECK_NIXL_ERROR((buff1.overlaps(buff2)), "Descriptor mismatch for buff1 and buff2"); - CHECK_NIXL_ERROR((buff2.covers(buff1)), "Descriptor mismatch for buff2 and buff1"); - CHECK_NIXL_ERROR((buff2.overlaps(buff1)), "Descriptor mismatch for buff2 and buff1"); - CHECK_NIXL_ERROR(!(buff2.overlaps(buff5)), "Descriptor mismatch for buff2 and buff5"); - CHECK_NIXL_ERROR(!(buff5.overlaps(buff2)), "Descriptor mismatch for buff5 and buff2"); - CHECK_NIXL_ERROR((buff2.covers(buff5)), "Descriptor mismatch for buff2 and buff5"); - CHECK_NIXL_ERROR((buff5.covers(buff2)), "Descriptor mismatch for buff5 and buff2"); - CHECK_NIXL_ERROR((buff1.covers(buff6)), "Descriptor mismatch for buff1 and buff6"); - 
CHECK_NIXL_ERROR((buff6.covers(buff1)), "Descriptor mismatch for buff6 and buff1"); - CHECK_NIXL_ERROR(!(buff1.covers(buff7)), "Descriptor mismatch for buff1 and buff7"); - CHECK_NIXL_ERROR((buff7.covers(buff1)), "Descriptor mismatch for buff7 and buff1"); + nixl_exit_on_failure((buff2 == importDesc), "Descriptor mismatch for buff2 and importDesc"); + + nixl_exit_on_failure((buff3 == buff2), "Descriptor mismatch for buff3 and buff2"); + nixl_exit_on_failure((buff4 == buff1), "Descriptor mismatch for buff4 and buff1"); + nixl_exit_on_failure((buff3 != buff1), "Descriptor mismatch for buff3 and buff1"); + nixl_exit_on_failure((buff8 != buff7), "Descriptor mismatch for buff8 and buff7"); + + nixl_exit_on_failure((buff2.covers(buff3)), "Descriptor buff2 does not cover buff3"); + nixl_exit_on_failure((buff4.overlaps(buff1)), "Descriptor buff4 does not overlap buff1"); + nixl_exit_on_failure(!(buff1.covers(buff2)), "Descriptor buff1 does not cover buff2"); + nixl_exit_on_failure(!(buff1.overlaps(buff2)), "Descriptor buff1 does not overlap buff2"); + nixl_exit_on_failure(!(buff2.covers(buff1)), "Descriptor buff2 does not cover buff1"); + nixl_exit_on_failure(!(buff2.overlaps(buff1)), "Descriptor buff2 does not overlap buff1"); + nixl_exit_on_failure((buff2.overlaps(buff5)), "Descriptor buff2 does not overlap buff5"); + nixl_exit_on_failure((buff5.overlaps(buff2)), "Descriptor buff5 does not overlap buff2"); + nixl_exit_on_failure(!(buff2.covers(buff5)), "Descriptor buff2 does not cover buff5"); + nixl_exit_on_failure(!(buff5.covers(buff2)), "Descriptor buff5 does not cover buff2"); + nixl_exit_on_failure(!(buff1.covers(buff6)), "Descriptor buff1 does not cover buff6"); + nixl_exit_on_failure(!(buff6.covers(buff1)), "Descriptor buff6 does not cover buff1"); + nixl_exit_on_failure((buff1.covers(buff7)), "Descriptor buff1 does not cover buff7"); + nixl_exit_on_failure(!(buff7.covers(buff1)), "Descriptor buff7 does not cover buff1"); nixlBlobDesc stringd1; stringd1.addr = 
2392382; @@ -117,8 +117,8 @@ int main() stringd1.metaInfo = std::string("567"); nixlBlobDesc importStringD(stringd1.serialize()); - CHECK_NIXL_ERROR(!(stringd1 == importStringD), - "Descriptor mismatch for stringd1 and importStringD"); + nixl_exit_on_failure((stringd1 == importStringD), + "Descriptor stringd1 does not match importStringD"); std::cout << "\nSerDes Desc tests:\n"; buff2.print(""); @@ -148,8 +148,8 @@ int main() meta2.devId = 0; meta2.metadataP = nullptr; - CHECK_NIXL_ERROR(!(stringd1 != buff1), "Descriptor mismatch for stringd1 and buff1"); - CHECK_NIXL_ERROR(!(stringd2 == buff8), "Descriptor mismatch for stringd2 and buff8"); + nixl_exit_on_failure((stringd1 != buff1), "Descriptor stringd1 matches buff1"); + nixl_exit_on_failure((stringd2 == buff8), "Descriptor stringd2 does not match buff8"); nixlBasicDesc buff9 (stringd1); buff1.print(""); @@ -202,10 +202,10 @@ int main() std::cout << "Caught expected error: " << e.what() << std::endl; } dlist2.remDesc(dlist2.getIndex(meta3)); - CHECK_NIXL_ERROR(!(dlist2.getIndex(meta3) == NIXL_ERR_NOT_FOUND), - "Dlist2 descriptor not removed"); - CHECK_NIXL_ERROR(!(dlist3.getIndex(meta1) == NIXL_ERR_NOT_FOUND), - "Dlist3 descriptor not removed"); + nixl_exit_on_failure((dlist2.getIndex(meta3) == NIXL_ERR_NOT_FOUND), + "Dlist2 descriptor not removed"); + nixl_exit_on_failure((dlist3.getIndex(meta1) == NIXL_ERR_NOT_FOUND), + "Dlist3 descriptor not removed"); try { dlist3.remDesc(dlist3.getIndex(meta4)); } catch (const std::out_of_range& e) { @@ -281,13 +281,13 @@ int main() nixlSerDes* ser_des = new nixlSerDes(); nixlSerDes* ser_des2 = new nixlSerDes(); - CHECK_NIXL_ERROR((dlist10.serialize(ser_des) != 0), "Failed to serialize dlist10"); + nixl_exit_on_failure((dlist10.serialize(ser_des) == 0), "Failed to serialize dlist10"); nixl_xfer_dlist_t importList (ser_des);; - CHECK_NIXL_ERROR(!(importList == dlist10), "Descriptor mismatch for importList and dlist10"); + nixl_exit_on_failure((importList == dlist10), 
"Descriptor importList does not match dlist10"); - CHECK_NIXL_ERROR((dlist20.serialize(ser_des2) != 0), "Failed to serialize dlist20"); + nixl_exit_on_failure((dlist20.serialize(ser_des2) == 0), "Failed to serialize dlist20"); nixl_reg_dlist_t importSList (ser_des2); - CHECK_NIXL_ERROR(!(importSList == dlist20), "Descriptor mismatch for importSList and dlist20"); + nixl_exit_on_failure((importSList == dlist20), "Descriptor importSList does not match dlist20"); dlist10.print(); std::cout << "this should be a copy:\n"; diff --git a/test/nixl/meson.build b/test/nixl/meson.build index bc6a9c9ff..fe99db3a7 100644 --- a/test/nixl/meson.build +++ b/test/nixl/meson.build @@ -15,26 +15,26 @@ desc_example = executable('desc_example', 'desc_example.cpp', - dependencies: [nixl_dep, nixl_infra], + dependencies: [nixl_dep, nixl_infra, nixl_test_utils_dep], include_directories: [nixl_inc_dirs, utils_inc_dirs], link_with: [serdes_lib], install: true) agent_example = executable('agent_example', 'agent_example.cpp', - dependencies: [nixl_dep, nixl_infra], + dependencies: [nixl_dep, nixl_infra, nixl_test_utils_dep], include_directories: [nixl_inc_dirs, utils_inc_dirs], link_with: [serdes_lib], install: true) nixl_test_app = executable('nixl_test', 'nixl_test.cpp', - dependencies: [nixl_dep, nixl_infra, stream_interface, thread_dep], + dependencies: [nixl_dep, nixl_infra, stream_interface, thread_dep, nixl_test_utils_dep], include_directories: [nixl_inc_dirs, utils_inc_dirs, '../../src/utils/serdes'], link_with: [serdes_lib], install: true) plugin_test = executable('test_plugin', 'test_plugin.cpp', - dependencies: [nixl_dep, nixl_common_dep, cuda_dep], + dependencies: [nixl_dep, nixl_common_dep, cuda_dep, nixl_test_utils_dep], include_directories: [nixl_inc_dirs, utils_inc_dirs], install: true) diff --git a/test/nixl/nixl_test.cpp b/test/nixl/nixl_test.cpp index 93417fbc1..075781ff0 100644 --- a/test/nixl/nixl_test.cpp +++ b/test/nixl/nixl_test.cpp @@ -22,7 +22,7 @@ #include #include 
#include -#include "common/util.h" +#include "test_utils.h" #include "stream/metadata_stream.h" #include "serdes/serdes.h" #include @@ -84,7 +84,7 @@ static void targetThread(nixlAgent &agent, nixl_opt_args_t *extra_params, int th /** Only send desc list */ nixlSerDes serdes; nixl_status_t st = dram_for_ucx.trim().serialize(&serdes); - CHECK_NIXL_ERROR(st, "Failed to serialize registry dlist"); + nixl_exit_on_failure(st, "Failed to serialize registry dlist"); std::cout << "Thread " << thread_id << " Wait for initiator and then send xfer descs\n"; std::string message = serdes.exportStr(); @@ -144,7 +144,7 @@ static void initiatorThread(nixlAgent &agent, nixl_opt_args_t *extra_params, nixl_notifs_t notifs; nixl_status_t ret = agent.getNotifs(notifs, extra_params); - CHECK_NIXL_ERROR(ret, "Failed to get notifs"); + nixl_exit_on_failure(ret, "Failed to get notifs"); if (notifs.size() > 0) { std::lock_guard lock(shared_state.mtx); @@ -194,7 +194,7 @@ static void initiatorThread(nixlAgent &agent, nixl_opt_args_t *extra_params, while (ret != NIXL_SUCCESS) { ret = agent.getXferStatus(treq); - CHECK_NIXL_ERROR((ret > NIXL_IN_PROG), "Failed to get transfer status"); + nixl_exit_on_failure((ret >= NIXL_SUCCESS), "Failed to get transfer status"); } std::cout << "Thread " << thread_id << " Completed Sending Data using UCX backend\n"; diff --git a/test/unit/plugins/gpunetio/meson.build b/test/unit/plugins/gpunetio/meson.build index ddaf928d3..609f489a7 100644 --- a/test/unit/plugins/gpunetio/meson.build +++ b/test/unit/plugins/gpunetio/meson.build @@ -27,7 +27,7 @@ if cuda_dep.found() endif nixl_gpunetio_stream_app = executable ('nixl_gpunetio_stream_test', 'nixl_gpunetio_stream_test.cu', - dependencies: [nixl_dep, nixl_infra, stream_interface] + cuda_dep + nvtx_dep + dl_dep, + dependencies: [nixl_dep, nixl_infra, stream_interface] + cuda_dep + nvtx_dep + dl_dep, nixl_test_utils_dep include_directories: [nixl_inc_dirs, utils_inc_dirs, '../../../../src/utils/serdes'], cpp_args: 
compile_flags, cuda_args: compile_flags, diff --git a/test/unit/plugins/gpunetio/nixl_gpunetio_stream_test.cu b/test/unit/plugins/gpunetio/nixl_gpunetio_stream_test.cu index 619ae546b..c6083d68a 100644 --- a/test/unit/plugins/gpunetio/nixl_gpunetio_stream_test.cu +++ b/test/unit/plugins/gpunetio/nixl_gpunetio_stream_test.cu @@ -21,7 +21,7 @@ #include #include #include -#include "common/util.h" +#include "test_utils.h" #include "stream/metadata_stream.h" #include "serdes/serdes.h" @@ -300,10 +300,10 @@ main (int argc, char *argv[]) { /** Register memory in both initiator and target */ ret = agent.registerMem (local_vram_rdlist, &extra_params); - CHECK_NIXL_ERROR_AGENT(ret, "Failed to register memory", role); + nixl_exit_on_failure(ret, "Failed to register memory", role); local_vram = local_vram_rdlist.trim(); ret = agent.getLocalMD(metadata); - CHECK_NIXL_ERROR_AGENT(ret, "Failed to get local MD", role); + nixl_exit_on_failure(ret, "Failed to get local MD", role); std::cout << " Start Control Path metadata exchanges \n"; if (role == target) { @@ -316,16 +316,16 @@ main (int argc, char *argv[]) { std::cout << " Received checkRemoteMD from " << initiator << std::endl; data_address_ptr = (uintptr_t)data_address; - CHECK_NIXL_ERROR_AGENT(serdes->addBuf("BaseAddress", &data_address_ptr, sizeof(uintptr_t)), - "Failed to add BaseAddress", - role); - CHECK_NIXL_ERROR_AGENT(serdes->addBuf("BufferSize", &buf_size, sizeof(size_t)), - "Failed to add BufferSize", - role); - CHECK_NIXL_ERROR_AGENT(serdes->addBuf("BufferTransfer", &buf_num, sizeof(uint32_t)), - "Failed to add BufferTransfer", - role); - CHECK_NIXL_ERROR_AGENT(serdes->addStr("AgentMD", metadata), "Failed to add AgentMD", role); + nixl_exit_on_failure(serdes->addBuf("BaseAddress", &data_address_ptr, sizeof(uintptr_t)), + "Failed to add BaseAddress", + role); + nixl_exit_on_failure(serdes->addBuf("BufferSize", &buf_size, sizeof(size_t)), + "Failed to add BufferSize", + role); + 
nixl_exit_on_failure(serdes->addBuf("BufferTransfer", &buf_num, sizeof(uint32_t)), + "Failed to add BufferTransfer", + role); + nixl_exit_on_failure(serdes->addStr("AgentMD", metadata), "Failed to add AgentMD", role); std::string message = serdes->exportStr(); while (agent.genNotif (initiator, message, &extra_params) != NIXL_SUCCESS) ; @@ -419,9 +419,9 @@ main (int argc, char *argv[]) { md_extra_params.port = peer_port; ret = agent.fetchRemoteMD (target, &md_extra_params); - CHECK_NIXL_ERROR_AGENT(ret, "Failed to fetch remote MD", role); + nixl_exit_on_failure(ret, "Failed to fetch remote MD", role); ret = agent.sendLocalMD (&md_extra_params); - CHECK_NIXL_ERROR_AGENT(ret, "Failed to send local MD", role); + nixl_exit_on_failure(ret, "Failed to send local MD", role); // Not used nixl_xfer_dlist_t descs (DRAM_SEG); std::cout << initiator << " waiting checkRemoteMD from " << target << std::endl; @@ -435,19 +435,19 @@ main (int argc, char *argv[]) { for (const auto ¬if : notifs[target]) { remote_serdes->importStr (notif); - CHECK_NIXL_ERROR_AGENT( + nixl_exit_on_failure( remote_serdes->getBuf("BaseAddress", &data_address_ptr, sizeof(uintptr_t)), "Failed to get BaseAddress", role); - CHECK_NIXL_ERROR_AGENT(remote_serdes->getBuf("BufferSize", &buf_size, sizeof(size_t)), - "Failed to get BufferSize", - role); - CHECK_NIXL_ERROR_AGENT( + nixl_exit_on_failure(remote_serdes->getBuf("BufferSize", &buf_size, sizeof(size_t)), + "Failed to get BufferSize", + role); + nixl_exit_on_failure( remote_serdes->getBuf("BufferTransfer", &buf_num, sizeof(uint32_t)), "Failed to get BufferTransfer", role); remote_metadata = remote_serdes->getStr ("AgentMD"); - CHECK_NIXL_ERROR_AGENT((remote_metadata != ""), "Failed to get AgentMD", role); + nixl_exit_on_failure((remote_metadata != ""), "Failed to get AgentMD", role); agent.loadRemoteMD (remote_metadata, target); } notifs.clear(); @@ -548,16 +548,16 @@ main (int argc, char *argv[]) { std::cout << "Post the request with GPUNETIO backend 
transfer 2" << std::endl; PUSH_RANGE ("postXferReq", 3) status = agent.postXferReq (treq); - CHECK_NIXL_ERROR_AGENT(status, "Failed to post Xfer Req", role); + nixl_exit_on_failure(status, "Failed to post Xfer Req", role); POP_RANGE std::cout << "Waiting for completion\n"; PUSH_RANGE ("getXferStatus", 4) while (status != NIXL_SUCCESS) { status = agent.getXferStatus (treq); - CHECK_NIXL_ERROR_AGENT(!(status == NIXL_SUCCESS || status == NIXL_IN_PROG), - "Failed to get Xfer Status", - role); + nixl_exit_on_failure(!(status == NIXL_SUCCESS || status == NIXL_IN_PROG), + "Failed to get Xfer Status", + role); } POP_RANGE } else { @@ -570,16 +570,16 @@ main (int argc, char *argv[]) { std::cout << "Post the request with GPUNETIO backend transfer 1" << std::endl; PUSH_RANGE ("postXferReq", 3) status = agent.postXferReq (treq); - CHECK_NIXL_ERROR_AGENT(status, "Failed to post Xfer Req", role); + nixl_exit_on_failure(status, "Failed to post Xfer Req", role); POP_RANGE std::cout << "Waiting for completion\n"; PUSH_RANGE ("getXferStatus", 4) while (status != NIXL_SUCCESS) { status = agent.getXferStatus (treq); - CHECK_NIXL_ERROR_AGENT(!(status == NIXL_SUCCESS || status == NIXL_IN_PROG), - "Failed to get Xfer Status", - role); + nixl_exit_on_failure(!(status == NIXL_SUCCESS || status == NIXL_IN_PROG), + "Failed to get Xfer Status", + role); } POP_RANGE @@ -608,14 +608,14 @@ main (int argc, char *argv[]) { std::cout << "Post the request with GPUNETIO backend transfer 2" << std::endl; PUSH_RANGE ("postXferReq", 3) status = agent.postXferReq (treq); - CHECK_NIXL_ERROR_AGENT(status, "Failed to post Xfer Req", role); + nixl_exit_on_failure(status, "Failed to post Xfer Req", role); POP_RANGE std::cout << "Waiting for completion\n"; PUSH_RANGE ("getXferStatus", 4) while (status != NIXL_SUCCESS) { status = agent.getXferStatus (treq); - CHECK_NIXL_ERROR_AGENT(status, "Failed to get Xfer Status", role); + nixl_exit_on_failure(status, "Failed to get Xfer Status", role); } POP_RANGE } diff 
--git a/test/unit/plugins/ucx/meson.build b/test/unit/plugins/ucx/meson.build index 27a0f6e5d..a24ea5e8d 100644 --- a/test/unit/plugins/ucx/meson.build +++ b/test/unit/plugins/ucx/meson.build @@ -25,14 +25,14 @@ endif ucx_backend_test = executable('ucx_backend_test', 'ucx_backend_test.cpp', - dependencies: [nixl_dep, nixl_infra, nixl_common_deps, ucx_backend_dep, ucx_dep, thread_dep] + cuda_dependencies, + dependencies: [nixl_dep, nixl_infra, nixl_common_deps, ucx_backend_dep, ucx_dep, thread_dep] + cuda_dependencies + nixl_test_utils_dep, include_directories: [nixl_inc_dirs, utils_inc_dirs, '../../../../src/plugins/ucx'], cpp_args : cpp_args, install: true) ucx_backend_multi = executable('ucx_backend_multi', 'ucx_backend_multi.cpp', - dependencies: [nixl_dep, nixl_infra, nixl_common_deps, ucx_backend_dep, ucx_dep, thread_dep] + cuda_dependencies, + dependencies: [nixl_dep, nixl_infra, nixl_common_deps, ucx_backend_dep, ucx_dep, thread_dep] + cuda_dependencies + nixl_test_utils_dep, include_directories: [nixl_inc_dirs, utils_inc_dirs, '../../../../src/plugins/ucx'], cpp_args : cpp_args, install: true) diff --git a/test/unit/plugins/ucx/ucx_backend_multi.cpp b/test/unit/plugins/ucx/ucx_backend_multi.cpp index 32e387875..08d59cdff 100644 --- a/test/unit/plugins/ucx/ucx_backend_multi.cpp +++ b/test/unit/plugins/ucx/ucx_backend_multi.cpp @@ -18,7 +18,7 @@ #include #include "ucx_backend.h" -#include "common/util.h" +#include "test_utils.h" // Temporarily while fixing CI/CD pipeline @@ -61,13 +61,13 @@ void test_thread(int id) while(!ready[!id]); ret = ucx->loadRemoteConnInfo(other, conn_info[!id]); - CHECK_NIXL_ERROR_AGENT(ret, "Failed to load remote conn info", my_name); + nixl_exit_on_failure((ret == NIXL_SUCCESS), "Failed to load remote conn info", my_name); //one-sided connect if(!id) ret = ucx->connect(other); - CHECK_NIXL_ERROR_AGENT(ret, "Failed to connect", my_name); + nixl_exit_on_failure((ret == NIXL_SUCCESS), "Failed to connect", my_name); done[id] = true; 
while(!done[!id]) diff --git a/test/unit/plugins/ucx/ucx_backend_test.cpp b/test/unit/plugins/ucx/ucx_backend_test.cpp index 344b86bce..42fe4a9db 100644 --- a/test/unit/plugins/ucx/ucx_backend_test.cpp +++ b/test/unit/plugins/ucx/ucx_backend_test.cpp @@ -19,7 +19,7 @@ #include #include "ucx_backend.h" -#include "common/util.h" +#include "test_utils.h" using namespace std; @@ -64,7 +64,7 @@ class testHndlIterator { ~testHndlIterator() { /* Make sure that handler was released */ - CHECK_NIXL_ERROR(set, "Handler was not released"); + nixl_exit_on_failure(!set, "Handler was not released"); } bool needPrep() { @@ -88,7 +88,7 @@ class testHndlIterator { void setHandle(nixlBackendReqH *_handle) { - CHECK_NIXL_ERROR(set, "Handler was not released"); + nixl_exit_on_failure(!set, "Handler was not released"); handle = _handle; set = true; if (reuse) { @@ -97,12 +97,12 @@ class testHndlIterator { } void unsetHandle() { - CHECK_NIXL_ERROR(!set, "Handler was not set"); + nixl_exit_on_failure(set, "Handler was not set"); set = false; } nixlBackendReqH *&getHandle() { - CHECK_NIXL_ERROR(!set, "Handler was not set"); + nixl_exit_on_failure(set, "Handler was not set"); return handle; } }; @@ -119,7 +119,7 @@ createEngine(std::string name, bool p_thread) { init.type = "UCX"; auto ucx = nixlUcxEngine::create(init).release(); - CHECK_NIXL_ERROR(ucx->getInitErr(), "Failed to initialize worker1"); + nixl_exit_on_failure(!ucx->getInitErr(), "Failed to initialize worker1"); return ucx; } @@ -141,8 +141,9 @@ std::string memType2Str(nixl_mem_t mem_type) case FILE_SEG: return std::string("FILE"); default: - CHECK_NIXL_ERROR(1, "Unsupported memory type!"); + nixl_exit_on_failure(false, "Unsupported memory type!"); } + return std::string(""); } @@ -199,9 +200,9 @@ void allocateBuffer(nixl_mem_t mem_type, int dev_id, size_t len, void* &addr) } #endif default: - CHECK_NIXL_ERROR(1, "Unsupported memory type!"); + nixl_exit_on_failure(false, "Unsupported memory type!"); } - CHECK_NIXL_ERROR((addr 
== nullptr), "Failed to allocate buffer"); + nixl_exit_on_failure((addr != nullptr), "Failed to allocate buffer"); } void releaseBuffer(nixl_mem_t mem_type, int dev_id, void* &addr) @@ -217,7 +218,7 @@ void releaseBuffer(nixl_mem_t mem_type, int dev_id, void* &addr) break; #endif default: - CHECK_NIXL_ERROR(1, "Unsupported memory type!"); + nixl_exit_on_failure(false, "Unsupported memory type!"); } } @@ -234,7 +235,7 @@ void doMemset(nixl_mem_t mem_type, int dev_id, void *addr, char byte, size_t len break; #endif default: - CHECK_NIXL_ERROR(1, "Unsupported memory type!"); + nixl_exit_on_failure(false, "Unsupported memory type!"); } } @@ -252,8 +253,9 @@ void *getValidationPtr(nixl_mem_t mem_type, void *addr, size_t len) } #endif default: - CHECK_NIXL_ERROR(1, "Unsupported memory type!"); + nixl_exit_on_failure(false, "Unsupported memory type!"); } + return nullptr; } void *releaseValidationPtr(nixl_mem_t mem_type, void *addr) @@ -267,14 +269,14 @@ void *releaseValidationPtr(nixl_mem_t mem_type, void *addr) break; #endif default: - CHECK_NIXL_ERROR(1, "Unsupported memory type!"); + nixl_exit_on_failure(false, "Unsupported memory type!"); } - return NULL; + return nullptr; } void allocateWrongGPUTest(nixlUcxEngine *ucx, int dev_id) { - nixlBlobDesc desc = {0}; + nixlBlobDesc desc; nixlBackendMD* md; void* buf; @@ -285,7 +287,7 @@ allocateWrongGPUTest(nixlUcxEngine *ucx, int dev_id) { int ret = ucx->registerMem(desc, VRAM_SEG, md); - CHECK_NIXL_ERROR_AGENT(ret, "Failed to register memory", "test"); + nixl_exit_on_failure((ret == NIXL_ERR_NOT_SUPPORTED), "Failed to register memory", "test"); releaseBuffer(VRAM_SEG, dev_id, buf); } @@ -307,7 +309,7 @@ allocateAndRegister(nixlUcxEngine *ucx, int ret = ucx->registerMem(desc, mem_type, md); - CHECK_NIXL_ERROR(ret, "Failed to register memory"); + nixl_exit_on_failure((ret == NIXL_SUCCESS), "Failed to allocate and register memory"); } void @@ -335,11 +337,11 @@ loadRemote(nixlUcxEngine *ucx, info.devId = dev_id; 
ucx->getPublicData(lmd, info.metaInfo); - CHECK_NIXL_ERROR((info.metaInfo.size() == 0), "Failed to get public data"); + nixl_exit_on_failure((info.metaInfo.size() > 0), "Failed to get public data"); // We get the data from the cetnral location and populate the backend, and receive remote_meta int ret = ucx->loadRemoteMD(info, mem_type, agent, rmd); - CHECK_NIXL_ERROR(ret, "Failed to load remote MD"); + nixl_exit_on_failure((ret == NIXL_SUCCESS), "Failed to load remote MD"); } void populateDescs(nixl_meta_dlist_t &descs, int dev_id, void *addr, int desc_cnt, size_t desc_size, nixlBackendMD* &md) @@ -403,12 +405,12 @@ performTransfer(nixlUcxEngine *ucx1, nixlBackendReqH *new_handle = nullptr; ret3 = ucx1->prepXfer(op, req_src_descs, req_dst_descs, remote_agent, new_handle, &opt_args); - CHECK_NIXL_ERROR(ret3, "Failed to prep xfer"); + nixl_exit_on_failure(ret3, "Failed to prep xfer"); hiter.setHandle(new_handle); } nixlBackendReqH *&handle = hiter.getHandle(); ret3 = ucx1->postXfer(op, req_src_descs, req_dst_descs, remote_agent, handle, &opt_args); - CHECK_NIXL_ERROR(!((ret3 == NIXL_SUCCESS) || (ret3 == NIXL_IN_PROG)), "Failed to post xfer"); + nixl_exit_on_failure(ret3 >= NIXL_SUCCESS, "Failed to post xfer"); if (ret3 == NIXL_SUCCESS) { cout << "\t\tWARNING: Tansfer request completed immediately - no testing non-inline path" << endl; @@ -420,8 +422,7 @@ performTransfer(nixlUcxEngine *ucx1, if(progress){ ucx2->progress(); } - CHECK_NIXL_ERROR(!((ret3 == NIXL_SUCCESS) || (ret3 == NIXL_IN_PROG)), - "Failed to check xfer"); + nixl_exit_on_failure(ret3 >= NIXL_SUCCESS, "Failed to check xfer"); } } @@ -443,13 +444,15 @@ performTransfer(nixlUcxEngine *ucx1, if(progress){ ucx1->progress(); } - CHECK_NIXL_ERROR(ret3, "Failed to get notifs"); + nixl_exit_on_failure(ret3, "Failed to get notifs"); } - CHECK_NIXL_ERROR((ret2 != 1), "Incorrect number of target notifs"); - CHECK_NIXL_ERROR((target_notifs.front().first != "Agent1"), "Incorrect front notif source"); - 
CHECK_NIXL_ERROR((target_notifs.front().second != test_str), - "Incorrect front notif message"); + nixl_exit_on_failure((ret2 == 1), "Incorrect number of target notifs"); + + nixl_exit_on_failure((target_notifs.front().first == "Agent1"), + "Incorrect front notif source"); + nixl_exit_on_failure((target_notifs.front().second == test_str), + "Incorrect front notif message"); cout << "OK" << endl; } @@ -461,7 +464,7 @@ performTransfer(nixlUcxEngine *ucx1, // Perform correctness check. for (size_t i = 0; i < len; i++) { - CHECK_NIXL_ERROR((((uint8_t *)chkptr1)[i] != ((uint8_t *)chkptr2)[i]), "Data mismatch"); + nixl_exit_on_failure((((uint8_t *)chkptr1)[i] == ((uint8_t *)chkptr2)[i]), "Data mismatch"); } releaseValidationPtr(req_src_descs.getType(), chkptr1); @@ -486,14 +489,14 @@ test_intra_agent_transfer(bool p_thread, nixlUcxEngine *ucx, nixl_mem_t mem_type int iter = 10; - CHECK_NIXL_ERROR(!ucx->supportsLocal(), "Failed to get conn info"); + nixl_exit_on_failure(ucx->supportsLocal(), "Failed to get conn info"); //connection info is still a string std::string conn_info1; ret1 = ucx->getConnInfo(conn_info1); - CHECK_NIXL_ERROR(ret1, "Failed to get conn info"); + nixl_exit_on_failure((ret1 == NIXL_SUCCESS), "Failed to get conn info"); ret1 = ucx->loadRemoteConnInfo(agent1, conn_info1); - CHECK_NIXL_ERROR(ret1, "Failed to load remote conn info"); + nixl_exit_on_failure((ret1 == NIXL_SUCCESS), "Failed to load remote conn info"); std::cout << "Local connection complete\n"; @@ -511,7 +514,7 @@ test_intra_agent_transfer(bool p_thread, nixlUcxEngine *ucx, nixl_mem_t mem_type //string descs unnecessary, convert meta locally nixlBackendMD* rmd2; ret1 = ucx->loadLocalMD(lmd2, rmd2); - CHECK_NIXL_ERROR(ret1, "Failed to load local MD"); + nixl_exit_on_failure((ret1 == NIXL_SUCCESS), "Failed to load local MD"); nixl_meta_dlist_t req_src_descs (mem_type); populateDescs(req_src_descs, 0, addr1, desc_cnt, desc_size, lmd1); @@ -576,13 +579,13 @@ test_inter_agent_transfer(bool 
p_thread, // location and ask for it for a remote node std::string conn_info1, conn_info2; ret = ucx1->getConnInfo(conn_info1); - CHECK_NIXL_ERROR(ret, "Failed to get conn info"); + nixl_exit_on_failure((ret == NIXL_SUCCESS), "Failed to get conn info"); ret = ucx2->getConnInfo(conn_info2); - CHECK_NIXL_ERROR(ret, "Failed to get conn info"); + nixl_exit_on_failure((ret == NIXL_SUCCESS), "Failed to get conn info"); // We assumed we put them to central location and now receiving it on the other process ret = ucx1->loadRemoteConnInfo(agent2, conn_info2); - CHECK_NIXL_ERROR(ret, "Failed to load remote conn info"); + nixl_exit_on_failure((ret == NIXL_SUCCESS), "Failed to load remote conn info"); // TODO: Causes race condition - investigate conn management implementation // ret = ucx2->loadRemoteConnInfo (agent1, conn_info1); @@ -653,13 +656,14 @@ test_inter_agent_transfer(bool p_thread, while(ret == 0){ ret2 = ucx2->getNotifs(target_notifs); ret = target_notifs.size(); - CHECK_NIXL_ERROR(ret2, "Failed to get notifs"); + nixl_exit_on_failure((ret2 == NIXL_SUCCESS), "Failed to get notifs"); } - CHECK_NIXL_ERROR((ret != 1), "Incorrect number of target notifs"); - CHECK_NIXL_ERROR((target_notifs.front().first != "Agent1"), "Incorrect front notif source"); - CHECK_NIXL_ERROR((target_notifs.front().second != test_str), - "Incorrect front notif message"); + nixl_exit_on_failure((ret == 1), "Incorrect number of target notifs"); + nixl_exit_on_failure((target_notifs.front().first == "Agent1"), + "Incorrect front notif source"); + nixl_exit_on_failure((target_notifs.front().second == test_str), + "Incorrect front notif message"); cout << "OK" << endl; } diff --git a/test/unit/plugins/ucx_mo/meson.build b/test/unit/plugins/ucx_mo/meson.build index 9d23daaad..207c4e69a 100644 --- a/test/unit/plugins/ucx_mo/meson.build +++ b/test/unit/plugins/ucx_mo/meson.build @@ -26,7 +26,7 @@ endif ucx_backend_test = executable('ucx_mo_backend_test', 'ucx_mo_backend_test.cpp', - dependencies: 
[nixl_dep, nixl_infra, nixl_common_deps, ucx_backend_dep, ucx_mo_backend_dep, ucx_dep] + cuda_dependencies, + dependencies: [nixl_dep, nixl_infra, nixl_common_deps, ucx_backend_dep, ucx_mo_backend_dep, ucx_dep] + cuda_dependencies + nixl_test_utils_dep, include_directories: [nixl_inc_dirs, utils_inc_dirs], cpp_args : cpp_args, install: true) diff --git a/test/unit/plugins/ucx_mo/ucx_mo_backend_test.cpp b/test/unit/plugins/ucx_mo/ucx_mo_backend_test.cpp index 5ec943aab..d2098251d 100644 --- a/test/unit/plugins/ucx_mo/ucx_mo_backend_test.cpp +++ b/test/unit/plugins/ucx_mo/ucx_mo_backend_test.cpp @@ -19,7 +19,7 @@ #include #include "ucx_mo_backend.h" -#include "common/util.h" +#include "test_utils.h" using namespace std; @@ -33,9 +33,9 @@ int gpu_id = 0; static void checkCudaError(cudaError_t result, const char *message) { if (result != cudaSuccess) { - std::cerr << message << " (Error code: " << result << " - " - << cudaGetErrorString(result) << ")" << std::endl; - exit(EXIT_FAILURE); + nixl_exit_on_failure(false, + std::string(message) + " (Error code: " + std::to_string(result) + + " - " + cudaGetErrorString(result) + ")"); } } #endif @@ -67,8 +67,9 @@ std::string memType2Str(nixl_mem_t mem_type) case FILE_SEG: return std::string("FILE"); default: - CHECK_NIXL_ERROR(1, "Unsupported memory type!"); + nixl_exit_on_failure(false, "Unsupported memory type!"); } + return std::string(""); } nixlBackendEngine *createEngine(std::string name, uint32_t ndev, bool p_thread) @@ -85,7 +86,7 @@ nixlBackendEngine *createEngine(std::string name, uint32_t ndev, bool p_thread) init.type = "UCX_MO"; ucx_mo = (nixlBackendEngine *)new nixlUcxMoEngine(&init); - CHECK_NIXL_ERROR(ucx_mo->getInitErr(), "Failed to initialize worker1"); + nixl_exit_on_failure(!ucx_mo->getInitErr(), "Failed to initialize worker1"); if (ucx_mo->getInitErr()) { std::cout << "Failed to initialize worker1" << std::endl; exit(1); @@ -138,7 +139,7 @@ void allocateBuffer(nixl_mem_t mem_type, int dev_id, size_t len, 
void* &addr) case DRAM_SEG: //addr = calloc(1, len); ret = posix_memalign(&addr, 4096, len); - CHECK_NIXL_ERROR(ret, "Failed to allocate mem aligned buffer"); + nixl_exit_on_failure((ret == 0), "Failed to allocate mem aligned buffer"); break; #ifdef HAVE_CUDA case VRAM_SEG: { @@ -155,9 +156,9 @@ void allocateBuffer(nixl_mem_t mem_type, int dev_id, size_t len, void* &addr) } #endif default: - CHECK_NIXL_ERROR(1, "Unsupported memory type!"); + nixl_exit_on_failure(false, "Unsupported memory type!"); } - CHECK_NIXL_ERROR((addr == nullptr), "Failed to allocate buffer"); + nixl_exit_on_failure(addr != nullptr, "Failed to allocate buffer"); } void releaseBuffer(nixl_mem_t mem_type, int dev_id, void* &addr) @@ -173,7 +174,7 @@ void releaseBuffer(nixl_mem_t mem_type, int dev_id, void* &addr) break; #endif default: - CHECK_NIXL_ERROR(1, "Unsupported memory type!"); + nixl_exit_on_failure(false, "Unsupported memory type!"); } } @@ -190,7 +191,7 @@ void doMemset(nixl_mem_t mem_type, int dev_id, void *addr, char byte, size_t len break; #endif default: - CHECK_NIXL_ERROR(1, "Unsupported memory type!"); + nixl_exit_on_failure(false, "Unsupported memory type!"); } } @@ -208,8 +209,9 @@ void *getValidationPtr(nixl_mem_t mem_type, void *addr, size_t len) } #endif default: - CHECK_NIXL_ERROR(1, "Unsupported memory type!"); + nixl_exit_on_failure(false, "Unsupported memory type!"); } + return nullptr; } void *releaseValidationPtr(nixl_mem_t mem_type, void *addr) @@ -223,9 +225,9 @@ void *releaseValidationPtr(nixl_mem_t mem_type, void *addr) break; #endif default: - CHECK_NIXL_ERROR(1, "Unsupported memory type!"); + nixl_exit_on_failure(false, "Unsupported memory type!"); } - return NULL; + return nullptr; } typedef int dev_distr_t(int idx, int max_idx, int cnt); @@ -266,7 +268,7 @@ void createLocalDescs(nixlBackendEngine *ucx, nixl_meta_dlist_t &descs, *((nixlBasicDesc*)&desc_s) = desc; *((nixlBasicDesc*)&desc_m) = desc; int ret = ucx->registerMem(desc_s, descs.getType(), 
desc_m.metadataP); - CHECK_NIXL_ERROR(ret, "Failed to register ucx memory"); + nixl_exit_on_failure((ret == NIXL_SUCCESS), "Failed to register ucx memory"); descs.addDesc(desc_m); } } @@ -306,11 +308,11 @@ void createRemoteDescs(nixlBackendEngine *src_ucx, status = dst_ucx->loadLocalMD(src_descs[i].metadataP, desc_m.metadataP); } else { status = src_ucx->getPublicData(src_descs[i].metadataP, desc_s.metaInfo); - CHECK_NIXL_ERROR(status, "Failed to get src_ucx public data"); + nixl_exit_on_failure(status, "Failed to get src_ucx public data"); status = dst_ucx->loadRemoteMD (desc_s, src_descs.getType(), agent, desc_m.metadataP); } - CHECK_NIXL_ERROR(status, "Failed to load dst_ucx remote MD"); + nixl_exit_on_failure(status, "Failed to load dst_ucx remote MD"); dst_descs.addDesc(desc_m); } } @@ -321,7 +323,7 @@ void destroyRemoteDescs(nixlBackendEngine *dst_ucx, nixl_status_t status; for(int i = 0; i < dst_descs.descCount(); i++) { status = dst_ucx->unloadMD(dst_descs[i].metadataP); - CHECK_NIXL_ERROR(status, "Failed to unload dst_ucx MD"); + nixl_exit_on_failure(status, "Failed to unload dst_ucx MD"); } while(dst_descs.descCount()) { @@ -352,9 +354,9 @@ void performTransfer(nixlBackendEngine *ucx1, nixlBackendEngine *ucx2, // or an ID that later can be used to check the status as a new method // Also maybe we would remove the WRITE and let the backend class decide the op status = ucx1->prepXfer(op, req_src_descs, req_dst_descs, remote_agent, handle, &opt_args); - CHECK_NIXL_ERROR(status, "Failed to prep ucx1 xfer"); + nixl_exit_on_failure(status, "Failed to prep ucx1 xfer"); status = ucx1->postXfer(op, req_src_descs, req_dst_descs, remote_agent, handle, &opt_args); - CHECK_NIXL_ERROR((status > NIXL_IN_PROG), "Failed to post ucx1 xfer"); + nixl_exit_on_failure((status >= NIXL_SUCCESS), "Failed to post ucx1 xfer"); if (status == NIXL_SUCCESS) { @@ -367,8 +369,7 @@ void performTransfer(nixlBackendEngine *ucx1, nixlBackendEngine *ucx2, if(progress){ ((nixlUcxMoEngine 
*)ucx2)->progress(); } - CHECK_NIXL_ERROR(!((NIXL_SUCCESS == status) || (NIXL_IN_PROG == status)), - "Failed to check ucx1 xfer"); + nixl_exit_on_failure((status >= NIXL_SUCCESS), "Failed to check ucx1 xfer"); } ucx1->releaseReqH(handle); } @@ -381,35 +382,37 @@ void performTransfer(nixlBackendEngine *ucx1, nixlBackendEngine *ucx2, while(!target_notifs.size()){ status = ucx2->getNotifs(target_notifs); - CHECK_NIXL_ERROR(status, "Failed to get ucx2 notifs"); + nixl_exit_on_failure(status, "Failed to get ucx2 notifs"); if(progress){ ((nixlUcxMoEngine *)ucx1)->progress(); } } - CHECK_NIXL_ERROR((target_notifs.size() != 1), "Incorrect number of target notifs"); - CHECK_NIXL_ERROR((target_notifs.front().first != "Agent1"), "Incorrect front notif source"); - CHECK_NIXL_ERROR((target_notifs.front().second != test_str), - "Incorrect front notif message"); + nixl_exit_on_failure((target_notifs.size() == 1), "Incorrect number of target notifs"); + nixl_exit_on_failure((target_notifs.front().first == "Agent1"), + "Incorrect front notif source"); + nixl_exit_on_failure((target_notifs.front().second == test_str), + "Incorrect front notif message"); cout << "OK" << endl; } cout << "\t\tData verification: " << flush; - CHECK_NIXL_ERROR((req_src_descs.descCount() != req_dst_descs.descCount()), - "Data length mismatch"); + nixl_exit_on_failure((req_src_descs.descCount() == req_dst_descs.descCount()), + "Data length mismatch"); for(int i = 0; i < req_src_descs.descCount(); i++) { auto sdesc = req_src_descs[i]; auto ddesc = req_dst_descs[i]; - CHECK_NIXL_ERROR((sdesc.len != ddesc.len), "Data length mismatch"); + nixl_exit_on_failure((sdesc.len == ddesc.len), "Data length mismatch"); size_t len = ddesc.len; chkptr1 = getValidationPtr(req_src_descs.getType(), (void*)sdesc.addr, len); chkptr2 = getValidationPtr(req_dst_descs.getType(), (void*)ddesc.addr, len); // Perform correctness check. 
for (size_t i = 0; i < len; i++) { - CHECK_NIXL_ERROR((((uint8_t *)chkptr1)[i] != ((uint8_t *)chkptr2)[i]), "Data mismatch"); + nixl_exit_on_failure((((uint8_t *)chkptr1)[i] == ((uint8_t *)chkptr2)[i]), + "Data mismatch"); } releaseValidationPtr(req_src_descs.getType(), chkptr1); @@ -450,16 +453,16 @@ void test_agent_transfer(bool p_thread, // location and ask for it for a remote node std::string conn_info1; status = ucx1->getConnInfo(conn_info1); - CHECK_NIXL_ERROR(status, "Failed to get ucx1 conn info"); + nixl_exit_on_failure(status, "Failed to get ucx1 conn info"); std::string conn_info2; status = ucx2->getConnInfo(conn_info2); - CHECK_NIXL_ERROR(status, "Failed to get ucx2 conn info"); + nixl_exit_on_failure(status, "Failed to get ucx2 conn info"); // We assumed we put them to central location and now receiving it on the other process if (is_local) { agent = &agent1; } status = ucx1->loadRemoteConnInfo(*agent, conn_info2); - CHECK_NIXL_ERROR(status, "Failed to load ucx1 remote conn info"); + nixl_exit_on_failure(status, "Failed to load ucx1 remote conn info"); // TODO: Causes race condition - investigate conn management implementation // ret = ucx2->loadRemoteConnInfo (agent1, conn_info1); @@ -520,17 +523,18 @@ void test_agent_transfer(bool p_thread, while(target_notifs.size() == 0){ status = ucx2->getNotifs(target_notifs); - CHECK_NIXL_ERROR(status, "Failed to get ucx2 notifs"); + nixl_exit_on_failure(status, "Failed to get ucx2 notifs"); if (!p_thread) { /* progress UCX1 as well */ ((nixlUcxMoEngine *)ucx1)->progress(); } } - CHECK_NIXL_ERROR((target_notifs.size() != 1), "Incorrect number of target notifs"); - CHECK_NIXL_ERROR((target_notifs.front().first != "Agent1"), "Incorrect front notif source"); - CHECK_NIXL_ERROR((target_notifs.front().second != test_str), - "Incorrect front notif message"); + nixl_exit_on_failure((target_notifs.size() == 1), "Incorrect number of target notifs"); + nixl_exit_on_failure((target_notifs.front().first == "Agent1"), + 
"Incorrect front notif source"); + nixl_exit_on_failure((target_notifs.front().second == test_str), + "Incorrect front notif message"); cout << "OK" << endl; } diff --git a/test/unit/utils/common/map_perf.cpp b/test/unit/utils/common/map_perf.cpp index 308dd5473..964e7d00e 100644 --- a/test/unit/utils/common/map_perf.cpp +++ b/test/unit/utils/common/map_perf.cpp @@ -23,7 +23,7 @@ #include #include "common/str_tools.h" -#include "common/util.h" +#include "test_utils.h" std::string generate_random_string(size_t length) { const std::string characters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; @@ -118,7 +118,7 @@ void test_comparison_perf(const int n_entries, const size_t str_len) { std::cout << "custom map lookup test, total time for " << n_iters << " iters: " << diff_time.tv_sec << "s " << diff_time.tv_usec << "us \n"; - CHECK_NIXL_ERROR_AGENT((sum1 != sum2), "Test failed", "test"); + nixl_exit_on_failure((sum1 == sum2), "Test failed", "test"); gettimeofday(&start_time, NULL); for(int i = 0; i Date: Thu, 9 Oct 2025 14:54:32 -0700 Subject: [PATCH 15/17] Fix test for release build Signed-off-by: Adit Ranadive --- test/unit/plugins/ucx/ucx_backend_test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/unit/plugins/ucx/ucx_backend_test.cpp b/test/unit/plugins/ucx/ucx_backend_test.cpp index 42fe4a9db..30b207c0b 100644 --- a/test/unit/plugins/ucx/ucx_backend_test.cpp +++ b/test/unit/plugins/ucx/ucx_backend_test.cpp @@ -276,7 +276,7 @@ void *releaseValidationPtr(nixl_mem_t mem_type, void *addr) void allocateWrongGPUTest(nixlUcxEngine *ucx, int dev_id) { - nixlBlobDesc desc; + nixlBlobDesc desc = {0}; nixlBackendMD* md; void* buf; From 6948a9a7f01f18b3db8eae6d087767cfb3b9cba1 Mon Sep 17 00:00:00 2001 From: Adit Ranadive Date: Thu, 9 Oct 2025 16:06:06 -0700 Subject: [PATCH 16/17] Fix up gpunetio test Signed-off-by: Adit Ranadive --- test/unit/plugins/gpunetio/meson.build | 2 +- 
test/unit/plugins/gpunetio/nixl_gpunetio_stream_test.cu | 8 ++------ 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/test/unit/plugins/gpunetio/meson.build b/test/unit/plugins/gpunetio/meson.build index 609f489a7..960799f8b 100644 --- a/test/unit/plugins/gpunetio/meson.build +++ b/test/unit/plugins/gpunetio/meson.build @@ -27,7 +27,7 @@ if cuda_dep.found() endif nixl_gpunetio_stream_app = executable ('nixl_gpunetio_stream_test', 'nixl_gpunetio_stream_test.cu', - dependencies: [nixl_dep, nixl_infra, stream_interface] + cuda_dep + nvtx_dep + dl_dep, nixl_test_utils_dep + dependencies: [nixl_dep, nixl_infra, stream_interface] + cuda_dep + nvtx_dep + dl_dep + nixl_test_utils_dep, include_directories: [nixl_inc_dirs, utils_inc_dirs, '../../../../src/utils/serdes'], cpp_args: compile_flags, cuda_args: compile_flags, diff --git a/test/unit/plugins/gpunetio/nixl_gpunetio_stream_test.cu b/test/unit/plugins/gpunetio/nixl_gpunetio_stream_test.cu index c6083d68a..fc8cc963a 100644 --- a/test/unit/plugins/gpunetio/nixl_gpunetio_stream_test.cu +++ b/test/unit/plugins/gpunetio/nixl_gpunetio_stream_test.cu @@ -507,9 +507,7 @@ main (int argc, char *argv[]) { std::cout << "Post the request with GPUNETIO backend transfer 1" << std::endl; PUSH_RANGE ("postXferReq", 3) status = agent.postXferReq (treq); - CHECK_NIXL_ERROR_AGENT(!(status == NIXL_SUCCESS || status == NIXL_IN_PROG), - "Failed to post Xfer Req", - role); + nixl_exit_on_failure((status >= NIXL_SUCCESS), "Failed to post Xfer Req", role); POP_RANGE @@ -517,9 +515,7 @@ main (int argc, char *argv[]) { PUSH_RANGE ("getXferStatus", 4) while (status != NIXL_SUCCESS) { status = agent.getXferStatus (treq); - CHECK_NIXL_ERROR_AGENT(!(status == NIXL_SUCCESS || status == NIXL_IN_PROG), - "Failed to get Xfer Status", - role); + nixl_exit_on_failure(status >= NIXL_SUCCESS, "Failed to get Xfer Status", role); } POP_RANGE // No need for cudaStreamSyncronize as CUDA kernel and Xfer are on the same stream From f4a55645ecce323c1a69480d01295a27c5f43017 
Mon Sep 17 00:00:00 2001 From: Adit Ranadive Date: Thu, 9 Oct 2025 22:16:02 -0700 Subject: [PATCH 17/17] Fix up for two tests Signed-off-by: Adit Ranadive --- test/nixl/agent_example.cpp | 3 ++- test/nixl/nixl_test.cpp | 7 +++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/test/nixl/agent_example.cpp b/test/nixl/agent_example.cpp index a64e715eb..f02450533 100644 --- a/test/nixl/agent_example.cpp +++ b/test/nixl/agent_example.cpp @@ -276,7 +276,8 @@ nixl_status_t partialMdTest(nixlAgent* A1, nixlAgent* A2, nixlBackendH* backend1 nixlDlistH *dst_side; status = A1->prepXferDlist(agent2, dst_mem_lists[update].trim(), dst_side, &extra_params1); nixl_exit_on_failure(status, "Failed to prep xfer dlist", agent1); - nixl_exit_on_failure((dst_side == nullptr), "Dst side is null", agent1); + nixl_exit_on_failure((dst_side != nullptr), "Dst side is null", agent1); + // Make sure not-loaded descriptors are not updated for (int invalid_idx = update + 1; invalid_idx < NUM_UPDATES; invalid_idx++) { status = A1->prepXferDlist(agent2, dst_mem_lists[invalid_idx].trim(), dst_side, &extra_params1); diff --git a/test/nixl/nixl_test.cpp b/test/nixl/nixl_test.cpp index 075781ff0..9880949a1 100644 --- a/test/nixl/nixl_test.cpp +++ b/test/nixl/nixl_test.cpp @@ -119,6 +119,7 @@ static void targetThread(nixlAgent &agent, nixl_opt_args_t *extra_params, int th static void initiatorThread(nixlAgent &agent, nixl_opt_args_t *extra_params, const std::string &target_ip, int target_port, int thread_id, SharedNotificationState &shared_state) { + nixl_status_t st; nixl_reg_dlist_t dram_for_ucx(DRAM_SEG); auto addrs = initMem(agent, dram_for_ucx, extra_params, MEM_VAL); @@ -129,9 +130,11 @@ static void initiatorThread(nixlAgent &agent, nixl_opt_args_t *extra_params, md_extra_params.ipAddr = target_ip; md_extra_params.port = target_port; - agent.fetchRemoteMD(target, &md_extra_params); + st = agent.fetchRemoteMD(target, &md_extra_params); + nixl_exit_on_failure(st, "Failed to fetch 
remote MD"); - agent.sendLocalMD(&md_extra_params); + st = agent.sendLocalMD(&md_extra_params); + nixl_exit_on_failure(st, "Failed to send local MD"); // Wait for notifications and populate shared state while (true) {