@@ -510,8 +510,9 @@ void ExecutionPlan::Impl::setupOperations(const json& gpus, size_t constSrcOffse
510
510
}
511
511
}
512
512
513
- std::pair<size_t , u_int32_t > ExecutionPlan::Impl::calcSizePerRank (int rank, size_t inputSize, size_t outputSize) const {
514
- std::pair<size_t , u_int32_t > sizePerRank;
513
+ std::pair<size_t , uint32_t > ExecutionPlan::Impl::getSizeAndChunksForRank (int rank, size_t inputSize,
514
+ size_t outputSize) const {
515
+ std::pair<size_t , uint32_t > sizePerRank;
515
516
if (this ->inputChunks .at (rank) == 0 && this ->outputChunks .at (rank) == 0 ) {
516
517
throw mscclpp::Error (" Output or Input chunks must be greater than 0" , mscclpp::ErrorCode::ExecutorError);
517
518
} else if (this ->inputChunks .at (rank) != 0 && this ->outputChunks .at (rank) != 0 ) {
@@ -534,15 +535,15 @@ size_t ExecutionPlan::Impl::getOffset(int rank, size_t inputSize, size_t outputS
534
535
}
535
536
536
537
const int nGroups = this ->chunkGroups .at (rank);
537
- auto sizePerRank = calcSizePerRank (rank, inputSize, outputSize);
538
- uint32_t nInputChunks = sizePerRank .second ;
539
- uint32_t nelems = sizePerRank .first / (alignment * sizeof (uint8_t ));
538
+ auto rankSizeAndChunks = getSizeAndChunksForRank (rank, inputSize, outputSize);
539
+ uint32_t nChunks = rankSizeAndChunks .second ;
540
+ uint32_t nelems = rankSizeAndChunks .first / (alignment * sizeof (uint8_t ));
540
541
if (nelems % nGroups != 0 ) {
541
542
throw Error (" Input size must be a multiple of nGroups" , ErrorCode::ExecutorError);
542
543
}
543
544
544
545
int nelemsPerGroup = nelems / nGroups;
545
- int nChunksPerGroup = nInputChunks / nGroups;
546
+ int nChunksPerGroup = nChunks / nGroups;
546
547
uint32_t minNelems = nelemsPerGroup / nChunksPerGroup;
547
548
uint32_t remainder = nelemsPerGroup % nChunksPerGroup;
548
549
uint32_t groupIdx = chunkIndex / nChunksPerGroup;
@@ -568,9 +569,17 @@ size_t ExecutionPlan::Impl::getNChunkSize(int rank, size_t inputSize, size_t out
568
569
}
569
570
570
571
size_t ExecutionPlan::Impl::getUpperBoundChunkSize (int rank, size_t inputSize, size_t outputSize) const {
571
- auto sizePerRank = calcSizePerRank (rank, inputSize, outputSize);
572
- uint32_t nChunks = sizePerRank.second ;
573
- return (sizePerRank.first + nChunks - 1 ) / nChunks;
572
+ size_t nInputChunks = this ->inputChunks .at (rank);
573
+ size_t nOutputChunks = this ->outputChunks .at (rank);
574
+ size_t inputChunkSize = 0 ;
575
+ size_t outputChunkSize = 0 ;
576
+ if (nInputChunks != 0 ) {
577
+ inputChunkSize = inputSize / nInputChunks;
578
+ }
579
+ if (nOutputChunks != 0 ) {
580
+ outputChunkSize = outputSize / nOutputChunks;
581
+ }
582
+ return std::max (inputChunkSize, outputChunkSize);
574
583
}
575
584
576
585
void ExecutionPlan::Impl::reset () {
0 commit comments