From 871e13413fb0295bb49df6af0e9f5fefeab09093 Mon Sep 17 00:00:00 2001
From: Tim <43156029+AtlantaPepsi@users.noreply.github.com>
Date: Wed, 30 Oct 2024 15:13:53 -0400
Subject: [PATCH] Adjustment for UT Sendrecv (#1400)

Enable UT sendrecv to the same rank and refactor the user buffer
registration (UBR) call.
---
 test/SendRecvTests.cpp       | 127 +++++++++++++++++++++--------------
 test/common/TestBedChild.cpp |  12 ++--
 2 files changed, 83 insertions(+), 56 deletions(-)

diff --git a/test/SendRecvTests.cpp b/test/SendRecvTests.cpp
index 84dd73b01..da152c0c0 100644
--- a/test/SendRecvTests.cpp
+++ b/test/SendRecvTests.cpp
@@ -16,7 +16,6 @@ namespace RcclUnitTesting
     std::vector<int> const numElements = {1048576, 53327, 1024, 0};
     bool const inPlace = false;
     bool const useManagedMem = false;
-    int const groupCallId = 0;
 
     OptionalColArgs options;
     bool isCorrect = true;
@@ -28,7 +27,10 @@
       int ranksPerGpu = rpg == 0 ? 1 : testBed.ev.maxRanksPerGpu;
       int totalRanks = numGpus * ranksPerGpu;
       int const numProcesses = isMultiProcess ? numGpus : 1;
-      testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, numGpus, ranksPerGpu), 1);
+      testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, numGpus, ranksPerGpu),
+                        {1,2}, // two groups; the second (sendrecv to self) runs 2 collectives
+                        testBed.GetNumStreamsPerGroup(1,2),
+                        2);
 
       for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx)
       for (int numIdx = 0; numIdx < numElements.size() && isCorrect; ++numIdx)
@@ -37,6 +39,8 @@
         for (int recvRank = 0; recvRank < totalRanks; ++recvRank)
         {
           options.root = recvRank;
+          int groupCallId = sendRank == recvRank; // the self-sendrecv group has two collectives
+          int recvId = sendRank == recvRank;      // recv will be the second collective in that group
           testBed.SetCollectiveArgs(ncclCollSend,
                                     dataTypes[dataIdx],
                                     numElements[numIdx],
@@ -47,36 +51,46 @@
                                     sendRank);
           if (recvRank == 0)
           {
-            testBed.AllocateMem(inPlace, useManagedMem, groupCallId, 0, sendRank);
-            testBed.PrepareData(groupCallId, 0, sendRank);
-          }
-          if (recvRank != sendRank)
-          {
-            if (testBed.ev.showNames) // Show test names
-              INFO("%s Datatype: %s SendReceive test Rank %d -> Rank %d for %d Elements\n",
-                   isMultiProcess ? "MP" : "SP",
-                   ncclDataTypeNames[dataTypes[dataIdx]],
-                   sendRank,
-                   recvRank,
-                   numElements[numIdx]);
-
-            options.root = sendRank;
-            testBed.SetCollectiveArgs(ncclCollRecv,
+            // Set up the collArg slot so that AllocateMem is called once and correctly
+            testBed.SetCollectiveArgs(ncclCollSend,
                                       dataTypes[dataIdx],
                                       numElements[numIdx],
                                       numElements[numIdx],
                                       options,
                                       0,
-                                      groupCallId,
-                                      recvRank);
-            testBed.AllocateMem(inPlace, useManagedMem, groupCallId, 0, recvRank);
-            testBed.PrepareData(groupCallId, 0, recvRank);
-            testBed.ExecuteCollectives({sendRank, recvRank});
-            testBed.ValidateResults(isCorrect, groupCallId, 0, recvRank);
-            testBed.DeallocateMem(groupCallId, 0, recvRank);
+                                      !groupCallId,
+                                      sendRank);
+            testBed.AllocateMem(inPlace, useManagedMem, 0, 0, sendRank);
+            testBed.PrepareData(0, 0, sendRank);
+            testBed.AllocateMem(inPlace, useManagedMem, 1, 0, sendRank);
+            testBed.PrepareData(1, 0, sendRank);
           }
+
+          if (testBed.ev.showNames) // Show test names
+            INFO("%s Datatype: %s SendReceive test Rank %d -> Rank %d for %d Elements\n",
+                 isMultiProcess ? "MP" : "SP",
"MP" : "SP", + ncclDataTypeNames[dataTypes[dataIdx]], + sendRank, + recvRank, + numElements[numIdx]); + options.root = sendRank; + + testBed.SetCollectiveArgs(ncclCollRecv, + dataTypes[dataIdx], + numElements[numIdx], + numElements[numIdx], + options, + recvId, + groupCallId, + recvRank); + testBed.AllocateMem(inPlace, useManagedMem, groupCallId, recvId, recvRank); + testBed.PrepareData(groupCallId, recvId, recvRank); + testBed.ExecuteCollectives({sendRank, recvRank}, groupCallId); + testBed.ValidateResults(isCorrect, groupCallId, recvId, recvRank); + testBed.DeallocateMem(groupCallId, recvId, recvRank); } - testBed.DeallocateMem(groupCallId, 0, sendRank); + testBed.DeallocateMem(0, 0, sendRank); + testBed.DeallocateMem(1, 0, sendRank); } testBed.DestroyComms(); } @@ -94,7 +108,6 @@ namespace RcclUnitTesting bool const inPlace = false; bool const useManagedMem = false; bool const userRegistered = true; - int const groupCallId = 0; OptionalColArgs options; bool isCorrect = true; @@ -106,7 +119,10 @@ namespace RcclUnitTesting int ranksPerGpu = rpg == 0 ? 1 : testBed.ev.maxRanksPerGpu; int totalRanks = numGpus * ranksPerGpu; int const numProcesses = isMultiProcess ? numGpus : 1; - testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, numGpus, ranksPerGpu), 1); + testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, numGpus, ranksPerGpu), + {1,2}, //two group, second group sendrecv to self, has 2 coll + testBed.GetNumStreamsPerGroup(1,2), + 2); for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx) for (int numIdx = 0; numIdx < numElements.size() && isCorrect; ++numIdx) @@ -115,6 +131,8 @@ namespace RcclUnitTesting for (int recvRank = 0; recvRank < totalRanks; ++recvRank) { options.root = recvRank; + int groupCallId = sendRank == recvRank; + int recvId = sendRank == recvRank; testBed.SetCollectiveArgs(ncclCollSend, dataTypes[dataIdx], numElements[numIdx], @@ -125,36 +143,45 @@ namespace RcclUnitTesting sendRank); if (recvRank == 0) { - testBed.AllocateMem(inPlace, useManagedMem, groupCallId, 0, sendRank, userRegistered); - testBed.PrepareData(groupCallId, 0, sendRank); - } - if (recvRank != sendRank) - { - if (testBed.ev.showNames) // Show test names - INFO("%s Datatype: %s SendReceive test Rank %d -> Rank %d for %d Elements\n", - isMultiProcess ? "MP" : "SP", - ncclDataTypeNames[dataTypes[dataIdx]], - sendRank, - recvRank, - numElements[numIdx]); - - options.root = sendRank; - testBed.SetCollectiveArgs(ncclCollRecv, + testBed.SetCollectiveArgs(ncclCollSend, dataTypes[dataIdx], numElements[numIdx], numElements[numIdx], options, 0, - groupCallId, - recvRank); - testBed.AllocateMem(inPlace, useManagedMem, groupCallId, 0, recvRank, userRegistered); - testBed.PrepareData(groupCallId, 0, recvRank); - testBed.ExecuteCollectives({sendRank, recvRank}); - testBed.ValidateResults(isCorrect, groupCallId, 0, recvRank); - testBed.DeallocateMem(groupCallId, 0, recvRank); + !groupCallId, + sendRank); + testBed.AllocateMem(inPlace, useManagedMem, 0, 0, sendRank, userRegistered); + testBed.PrepareData(0, 0, sendRank); + testBed.AllocateMem(inPlace, useManagedMem, 1, 0, sendRank, userRegistered); + testBed.PrepareData(1, 0, sendRank); } + + if (testBed.ev.showNames) // Show test names + INFO("%s Datatype: %s SendReceive test Rank %d -> Rank %d for %d Elements\n", + isMultiProcess ? 
"MP" : "SP", + ncclDataTypeNames[dataTypes[dataIdx]], + sendRank, + recvRank, + numElements[numIdx]); + + options.root = sendRank; + testBed.SetCollectiveArgs(ncclCollRecv, + dataTypes[dataIdx], + numElements[numIdx], + numElements[numIdx], + options, + recvId, + groupCallId, + recvRank); + testBed.AllocateMem(inPlace, useManagedMem, groupCallId, recvId, recvRank, userRegistered); + testBed.PrepareData(groupCallId, recvId, recvRank); + testBed.ExecuteCollectives({sendRank, recvRank}, groupCallId); + testBed.ValidateResults(isCorrect, groupCallId, recvId, recvRank); + testBed.DeallocateMem(groupCallId, recvId, recvRank); } - testBed.DeallocateMem(groupCallId, 0, sendRank); + testBed.DeallocateMem(0, 0, sendRank); + testBed.DeallocateMem(1, 0, sendRank); } testBed.DestroyComms(); } diff --git a/test/common/TestBedChild.cpp b/test/common/TestBedChild.cpp index 48554537a..c9b409e6d 100644 --- a/test/common/TestBedChild.cpp +++ b/test/common/TestBedChild.cpp @@ -395,6 +395,8 @@ namespace RcclUnitTesting { CollectiveArgs& collArg = this->collArgs[groupId][localRank][collIdx]; CHECK_CALL(collArg.AllocateMem(inPlace, useManagedMem, userRegistered)); + if (collArg.userRegistered && (collArg.funcType == ncclCollSend || collArg.funcType == ncclCollRecv)) + CHILD_NCCL_CALL(ncclCommRegister(this->comms[localRank], collArg.inputGpu.ptr, collArg.numInputBytesAllocated, &(collArg.commRegHandle)),"ncclCommRegister"); if (this->verbose) INFO("Rank %d on child %d allocates memory for collective %d in group %d on device %d (%s,%s,%s) Input: %p Output %p\n", globalRank, this->childId, collIdx, groupId, this->deviceIds[localRank], inPlace ? "in-place" : "out-of-place", @@ -646,8 +648,6 @@ namespace RcclUnitTesting "ncclAllToAllv"); break; case ncclCollSend: - if (collArg.userRegistered) - CHILD_NCCL_CALL_RANK(errCode, ncclCommRegister(this->comms[localRank], collArg.inputGpu.ptr, collArg.numInputBytesAllocated, &(collArg.commRegHandle)),"ncclCommRegister"); CHILD_NCCL_CALL_RANK(errCode, ncclSend( collArg.inputGpu.ptr, collArg.numInputElements, @@ -658,8 +658,6 @@ namespace RcclUnitTesting "ncclSend"); break; case ncclCollRecv: - if (collArg.userRegistered) - CHILD_NCCL_CALL_RANK(errCode, ncclCommRegister(this->comms[localRank], collArg.outputGpu.ptr, collArg.numOutputBytesAllocated, &(collArg.commRegHandle)), "ncclCommRegister"); CHILD_NCCL_CALL_RANK(errCode, ncclRecv( collArg.outputGpu.ptr, collArg.numOutputElements, @@ -891,8 +889,6 @@ namespace RcclUnitTesting for (int collIdx = 0; collIdx < collArgs[groupId][localRank].size(); ++collIdx) { CollectiveArgs& collArg = this->collArgs[groupId][localRank][collIdx]; - if (collArg.userRegistered && (collArg.funcType == ncclCollSend || collArg.funcType == ncclCollRecv)) - CHILD_NCCL_CALL(ncclCommDeregister(this->comms[localRank], collArg.commRegHandle), "ncclCommDeregister"); if (collId == -1 || collId == collIdx) { if (this->verbose) @@ -900,6 +896,10 @@ namespace RcclUnitTesting INFO("Child %d release memory for collective %d in group %d (Input: %p Output %p\n", this->childId, collIdx, groupId, collArg.inputGpu.ptr, collArg.outputGpu.ptr); } + if (collArg.userRegistered && (collArg.funcType == ncclCollSend || collArg.funcType == ncclCollRecv)) + { + CHILD_NCCL_CALL(ncclCommDeregister(this->comms[localRank], collArg.commRegHandle), "ncclCommDeregister"); + } CHECK_CALL(collArg.DeallocateMem()); }