Merge pull request #1456 from corey-derochie-amd/CP_6.3_SWDEV-479834
Cherry-pick (Adjustment for UT Sendrecv (#1400))
vamovsik authored Dec 23, 2024
2 parents eb33738 + 871e134 commit 9a0e6a1
Showing 2 changed files with 83 additions and 56 deletions.
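
Judging from the diff below, the cherry-picked adjustment (from #1400) switches the SendRecv unit tests from a single group call to two group calls, so that the self send/recv case (sendRank == recvRank) can run as one group containing both a send and a recv collective. A minimal sketch of the new initialization, with the argument roles inferred from the diff rather than taken from the TestBed header, is:

    // Sketch only: argument roles are inferred from the diff, not from TestBed's declaration.
    testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, numGpus, ranksPerGpu),
                      {1, 2},                              // collectives per group: group 0 has 1, group 1 has 2
                      testBed.GetNumStreamsPerGroup(1, 2), // matching stream counts for the two groups
                      2);                                  // total number of group calls

Group 1 is reserved for the case where a rank sends to itself; its two collective slots hold the send and the matching recv.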
127 changes: 77 additions & 50 deletions test/SendRecvTests.cpp
@@ -16,7 +16,6 @@ namespace RcclUnitTesting
std::vector<int> const numElements = {1048576, 53327, 1024, 0};
bool const inPlace = false;
bool const useManagedMem = false;
int const groupCallId = 0;

OptionalColArgs options;
bool isCorrect = true;
@@ -28,7 +27,10 @@ namespace RcclUnitTesting
int ranksPerGpu = rpg == 0 ? 1 : testBed.ev.maxRanksPerGpu;
int totalRanks = numGpus * ranksPerGpu;
int const numProcesses = isMultiProcess ? numGpus : 1;
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, numGpus, ranksPerGpu), 1);
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, numGpus, ranksPerGpu),
                        {1,2}, // two groups; the second group (send/recv to self) has 2 collectives
testBed.GetNumStreamsPerGroup(1,2),
2);

for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx)
for (int numIdx = 0; numIdx < numElements.size() && isCorrect; ++numIdx)
@@ -37,6 +39,8 @@
for (int recvRank = 0; recvRank < totalRanks; ++recvRank)
{
options.root = recvRank;
          int groupCallId = sendRank == recvRank; // the self send/recv group has two collectives
          int recvId = sendRank == recvRank;      // and the recv is its second collective
testBed.SetCollectiveArgs(ncclCollSend,
dataTypes[dataIdx],
numElements[numIdx],
@@ -47,36 +51,46 @@
sendRank);
if (recvRank == 0)
{
testBed.AllocateMem(inPlace, useManagedMem, groupCallId, 0, sendRank);
testBed.PrepareData(groupCallId, 0, sendRank);
}
if (recvRank != sendRank)
{
if (testBed.ev.showNames) // Show test names
INFO("%s Datatype: %s SendReceive test Rank %d -> Rank %d for %d Elements\n",
isMultiProcess ? "MP" : "SP",
ncclDataTypeNames[dataTypes[dataIdx]],
sendRank,
recvRank,
numElements[numIdx]);

options.root = sendRank;
testBed.SetCollectiveArgs(ncclCollRecv,
            // set up the collArg slot so that AllocateMem is called exactly once and correctly
testBed.SetCollectiveArgs(ncclCollSend,
dataTypes[dataIdx],
numElements[numIdx],
numElements[numIdx],
options,
0,
groupCallId,
recvRank);
testBed.AllocateMem(inPlace, useManagedMem, groupCallId, 0, recvRank);
testBed.PrepareData(groupCallId, 0, recvRank);
testBed.ExecuteCollectives({sendRank, recvRank});
testBed.ValidateResults(isCorrect, groupCallId, 0, recvRank);
testBed.DeallocateMem(groupCallId, 0, recvRank);
!groupCallId,
sendRank);
testBed.AllocateMem(inPlace, useManagedMem, 0, 0, sendRank);
testBed.PrepareData(0, 0, sendRank);
testBed.AllocateMem(inPlace, useManagedMem, 1, 0, sendRank);
testBed.PrepareData(1, 0, sendRank);
}

if (testBed.ev.showNames) // Show test names
INFO("%s Datatype: %s SendReceive test Rank %d -> Rank %d for %d Elements\n",
isMultiProcess ? "MP" : "SP",
ncclDataTypeNames[dataTypes[dataIdx]],
sendRank,
recvRank,
numElements[numIdx]);
options.root = sendRank;

testBed.SetCollectiveArgs(ncclCollRecv,
dataTypes[dataIdx],
numElements[numIdx],
numElements[numIdx],
options,
recvId,
groupCallId,
recvRank);
testBed.AllocateMem(inPlace, useManagedMem, groupCallId, recvId, recvRank);
testBed.PrepareData(groupCallId, recvId, recvRank);
testBed.ExecuteCollectives({sendRank, recvRank}, groupCallId);
testBed.ValidateResults(isCorrect, groupCallId, recvId, recvRank);
testBed.DeallocateMem(groupCallId, recvId, recvRank);
}
testBed.DeallocateMem(groupCallId, 0, sendRank);
testBed.DeallocateMem(0, 0, sendRank);
testBed.DeallocateMem(1, 0, sendRank);
}
testBed.DestroyComms();
}
@@ -94,7 +108,6 @@ namespace RcclUnitTesting
bool const inPlace = false;
bool const useManagedMem = false;
bool const userRegistered = true;
int const groupCallId = 0;

OptionalColArgs options;
bool isCorrect = true;
@@ -106,7 +119,10 @@ namespace RcclUnitTesting
int ranksPerGpu = rpg == 0 ? 1 : testBed.ev.maxRanksPerGpu;
int totalRanks = numGpus * ranksPerGpu;
int const numProcesses = isMultiProcess ? numGpus : 1;
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, numGpus, ranksPerGpu), 1);
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, numGpus, ranksPerGpu),
                        {1,2}, // two groups; the second group (send/recv to self) has 2 collectives
testBed.GetNumStreamsPerGroup(1,2),
2);

for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx)
for (int numIdx = 0; numIdx < numElements.size() && isCorrect; ++numIdx)
@@ -115,6 +131,8 @@
for (int recvRank = 0; recvRank < totalRanks; ++recvRank)
{
options.root = recvRank;
int groupCallId = sendRank == recvRank;
int recvId = sendRank == recvRank;
testBed.SetCollectiveArgs(ncclCollSend,
dataTypes[dataIdx],
numElements[numIdx],
@@ -125,36 +143,45 @@
sendRank);
if (recvRank == 0)
{
testBed.AllocateMem(inPlace, useManagedMem, groupCallId, 0, sendRank, userRegistered);
testBed.PrepareData(groupCallId, 0, sendRank);
}
if (recvRank != sendRank)
{
if (testBed.ev.showNames) // Show test names
INFO("%s Datatype: %s SendReceive test Rank %d -> Rank %d for %d Elements\n",
isMultiProcess ? "MP" : "SP",
ncclDataTypeNames[dataTypes[dataIdx]],
sendRank,
recvRank,
numElements[numIdx]);

options.root = sendRank;
testBed.SetCollectiveArgs(ncclCollRecv,
testBed.SetCollectiveArgs(ncclCollSend,
dataTypes[dataIdx],
numElements[numIdx],
numElements[numIdx],
options,
0,
groupCallId,
recvRank);
testBed.AllocateMem(inPlace, useManagedMem, groupCallId, 0, recvRank, userRegistered);
testBed.PrepareData(groupCallId, 0, recvRank);
testBed.ExecuteCollectives({sendRank, recvRank});
testBed.ValidateResults(isCorrect, groupCallId, 0, recvRank);
testBed.DeallocateMem(groupCallId, 0, recvRank);
!groupCallId,
sendRank);
testBed.AllocateMem(inPlace, useManagedMem, 0, 0, sendRank, userRegistered);
testBed.PrepareData(0, 0, sendRank);
testBed.AllocateMem(inPlace, useManagedMem, 1, 0, sendRank, userRegistered);
testBed.PrepareData(1, 0, sendRank);
}

if (testBed.ev.showNames) // Show test names
INFO("%s Datatype: %s SendReceive test Rank %d -> Rank %d for %d Elements\n",
isMultiProcess ? "MP" : "SP",
ncclDataTypeNames[dataTypes[dataIdx]],
sendRank,
recvRank,
numElements[numIdx]);

options.root = sendRank;
testBed.SetCollectiveArgs(ncclCollRecv,
dataTypes[dataIdx],
numElements[numIdx],
numElements[numIdx],
options,
recvId,
groupCallId,
recvRank);
testBed.AllocateMem(inPlace, useManagedMem, groupCallId, recvId, recvRank, userRegistered);
testBed.PrepareData(groupCallId, recvId, recvRank);
testBed.ExecuteCollectives({sendRank, recvRank}, groupCallId);
testBed.ValidateResults(isCorrect, groupCallId, recvId, recvRank);
testBed.DeallocateMem(groupCallId, recvId, recvRank);
}
testBed.DeallocateMem(groupCallId, 0, sendRank);
testBed.DeallocateMem(0, 0, sendRank);
testBed.DeallocateMem(1, 0, sendRank);
}
testBed.DestroyComms();
}
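The heart of the SendRecvTests.cpp change is how the group and collective indices are chosen once a (sendRank, recvRank) pair is fixed. The fragment below is a simplified sketch of that selection, assuming the RcclUnitTesting TestBed harness used above; it is illustrative rather than a verbatim excerpt of the test:

    // Sketch, assuming the TestBed harness above (not a verbatim excerpt of the test).
    bool const selfSendRecv = (sendRank == recvRank);
    int  const groupCallId  = selfSendRecv ? 1 : 0; // self send/recv pairs run in the second group call
    int  const recvId       = selfSendRecv ? 1 : 0; // there, the recv occupies the second collective slot

    options.root = recvRank;
    testBed.SetCollectiveArgs(ncclCollSend, dataTypes[dataIdx], numElements[numIdx], numElements[numIdx],
                              options, 0 /*collId*/, groupCallId, sendRank);

    options.root = sendRank;
    testBed.SetCollectiveArgs(ncclCollRecv, dataTypes[dataIdx], numElements[numIdx], numElements[numIdx],
                              options, recvId, groupCallId, recvRank);

    testBed.AllocateMem(inPlace, useManagedMem, groupCallId, recvId, recvRank);
    testBed.PrepareData(groupCallId, recvId, recvRank);
    testBed.ExecuteCollectives({sendRank, recvRank}, groupCallId); // the group call is now passed explicitly
    testBed.ValidateResults(isCorrect, groupCallId, recvId, recvRank);
    testBed.DeallocateMem(groupCallId, recvId, recvRank);

Because each sendRank needs buffers in both groups, the new code also allocates and prepares the send-side memory for group 0 and group 1 once per sendRank (at recvRank == 0) and releases both after the recvRank loop.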
12 changes: 6 additions & 6 deletions test/common/TestBedChild.cpp
@@ -395,6 +395,8 @@ namespace RcclUnitTesting
{
CollectiveArgs& collArg = this->collArgs[groupId][localRank][collIdx];
CHECK_CALL(collArg.AllocateMem(inPlace, useManagedMem, userRegistered));
if (collArg.userRegistered && (collArg.funcType == ncclCollSend || collArg.funcType == ncclCollRecv))
CHILD_NCCL_CALL(ncclCommRegister(this->comms[localRank], collArg.inputGpu.ptr, collArg.numInputBytesAllocated, &(collArg.commRegHandle)),"ncclCommRegister");
if (this->verbose) INFO("Rank %d on child %d allocates memory for collective %d in group %d on device %d (%s,%s,%s) Input: %p Output %p\n",
globalRank, this->childId, collIdx, groupId, this->deviceIds[localRank],
inPlace ? "in-place" : "out-of-place",
@@ -646,8 +648,6 @@ namespace RcclUnitTesting
"ncclAllToAllv");
break;
case ncclCollSend:
if (collArg.userRegistered)
CHILD_NCCL_CALL_RANK(errCode, ncclCommRegister(this->comms[localRank], collArg.inputGpu.ptr, collArg.numInputBytesAllocated, &(collArg.commRegHandle)),"ncclCommRegister");
CHILD_NCCL_CALL_RANK(errCode, ncclSend(
collArg.inputGpu.ptr,
collArg.numInputElements,
@@ -658,8 +658,6 @@
"ncclSend");
break;
case ncclCollRecv:
if (collArg.userRegistered)
CHILD_NCCL_CALL_RANK(errCode, ncclCommRegister(this->comms[localRank], collArg.outputGpu.ptr, collArg.numOutputBytesAllocated, &(collArg.commRegHandle)), "ncclCommRegister");
CHILD_NCCL_CALL_RANK(errCode, ncclRecv(
collArg.outputGpu.ptr,
collArg.numOutputElements,
@@ -891,15 +889,17 @@ namespace RcclUnitTesting
for (int collIdx = 0; collIdx < collArgs[groupId][localRank].size(); ++collIdx)
{
CollectiveArgs& collArg = this->collArgs[groupId][localRank][collIdx];
if (collArg.userRegistered && (collArg.funcType == ncclCollSend || collArg.funcType == ncclCollRecv))
CHILD_NCCL_CALL(ncclCommDeregister(this->comms[localRank], collArg.commRegHandle), "ncclCommDeregister");
if (collId == -1 || collId == collIdx)
{
if (this->verbose)
{
INFO("Child %d release memory for collective %d in group %d (Input: %p Output %p\n",
this->childId, collIdx, groupId, collArg.inputGpu.ptr, collArg.outputGpu.ptr);
}
if (collArg.userRegistered && (collArg.funcType == ncclCollSend || collArg.funcType == ncclCollRecv))
{
CHILD_NCCL_CALL(ncclCommDeregister(this->comms[localRank], collArg.commRegHandle), "ncclCommDeregister");
}

CHECK_CALL(collArg.DeallocateMem());
}
Expand Down
