Skip to content

Commit 2ef070e

Browse files
authored
Fix nccl-test failure issue (#421) (#429)
1 parent 882cf68 commit 2ef070e

File tree

2 files changed

+19
-10
lines changed

2 files changed

+19
-10
lines changed

src/executor/execution_plan.cc

+18-9
Original file line number | Diff line number | Diff line change
@@ -510,8 +510,9 @@ void ExecutionPlan::Impl::setupOperations(const json& gpus, size_t constSrcOffse
510510
}
511511
}
512512

513-
std::pair<size_t, u_int32_t> ExecutionPlan::Impl::calcSizePerRank(int rank, size_t inputSize, size_t outputSize) const {
514-
std::pair<size_t, u_int32_t> sizePerRank;
513+
std::pair<size_t, uint32_t> ExecutionPlan::Impl::getSizeAndChunksForRank(int rank, size_t inputSize,
514+
size_t outputSize) const {
515+
std::pair<size_t, uint32_t> sizePerRank;
515516
if (this->inputChunks.at(rank) == 0 && this->outputChunks.at(rank) == 0) {
516517
throw mscclpp::Error("Output or Input chunks must be greater than 0", mscclpp::ErrorCode::ExecutorError);
517518
} else if (this->inputChunks.at(rank) != 0 && this->outputChunks.at(rank) != 0) {
@@ -534,15 +535,15 @@ size_t ExecutionPlan::Impl::getOffset(int rank, size_t inputSize, size_t outputS
534535
}
535536

536537
const int nGroups = this->chunkGroups.at(rank);
537-
auto sizePerRank = calcSizePerRank(rank, inputSize, outputSize);
538-
uint32_t nInputChunks = sizePerRank.second;
539-
uint32_t nelems = sizePerRank.first / (alignment * sizeof(uint8_t));
538+
auto rankSizeAndChunks = getSizeAndChunksForRank(rank, inputSize, outputSize);
539+
uint32_t nChunks = rankSizeAndChunks.second;
540+
uint32_t nelems = rankSizeAndChunks.first / (alignment * sizeof(uint8_t));
540541
if (nelems % nGroups != 0) {
541542
throw Error("Input size must be a multiple of nGroups", ErrorCode::ExecutorError);
542543
}
543544

544545
int nelemsPerGroup = nelems / nGroups;
545-
int nChunksPerGroup = nInputChunks / nGroups;
546+
int nChunksPerGroup = nChunks / nGroups;
546547
uint32_t minNelems = nelemsPerGroup / nChunksPerGroup;
547548
uint32_t remainder = nelemsPerGroup % nChunksPerGroup;
548549
uint32_t groupIdx = chunkIndex / nChunksPerGroup;
@@ -568,9 +569,17 @@ size_t ExecutionPlan::Impl::getNChunkSize(int rank, size_t inputSize, size_t out
568569
}
569570

570571
size_t ExecutionPlan::Impl::getUpperBoundChunkSize(int rank, size_t inputSize, size_t outputSize) const {
571-
auto sizePerRank = calcSizePerRank(rank, inputSize, outputSize);
572-
uint32_t nChunks = sizePerRank.second;
573-
return (sizePerRank.first + nChunks - 1) / nChunks;
572+
size_t nInputChunks = this->inputChunks.at(rank);
573+
size_t nOutputChunks = this->outputChunks.at(rank);
574+
size_t inputChunkSize = 0;
575+
size_t outputChunkSize = 0;
576+
if (nInputChunks != 0) {
577+
inputChunkSize = inputSize / nInputChunks;
578+
}
579+
if (nOutputChunks != 0) {
580+
outputChunkSize = outputSize / nOutputChunks;
581+
}
582+
return std::max(inputChunkSize, outputChunkSize);
574583
}
575584

576585
void ExecutionPlan::Impl::reset() {

src/include/execution_plan.hpp

+1-1
Original file line number | Diff line number | Diff line change
@@ -113,7 +113,7 @@ struct ExecutionPlan::Impl {
113113
bool isInPlace;
114114

115115
private:
116-
std::pair<size_t, u_int32_t> calcSizePerRank(int rank, size_t inputSize, size_t outputSize) const;
116+
std::pair<size_t, uint32_t> getSizeAndChunksForRank(int rank, size_t inputSize, size_t outputSize) const;
117117
size_t getOffset(int rank, size_t inputSize, size_t outputSize, uint32_t chunkIndex, uint32_t alignment = 16) const;
118118
size_t getNChunkSize(int rank, size_t inputSize, size_t outputSize, uint32_t nChunks,
119119
const std::vector<uint32_t> offsets) const;

0 commit comments

Comments
 (0)