microsoft · Binyang2014 · Dec 13, 2024 · Dec 7, 2024 · Dec 7, 2024 · Dec 7, 2024
diff --git a/src/executor/execution_plan.cc b/src/executor/execution_plan.cc
@@ -165,20 +165,41 @@ std::vector<BufferType> ExecutionPlan::Impl::getConnectedBufferTypes(int rank) c
   }
   return std::vector<BufferType>(bufferTypes.begin(), bufferTypes.end());
 }
-size_t ExecutionPlan::Impl::getScratchBufferSize(int rank, size_t inputSize, size_t outputSize) const {
-  size_t sizePerRank;
-  if (this->inputChunks.at(rank) != 0)
-    sizePerRank = inputSize / this->inputChunks.at(rank);
-  else if (this->outputChunks.at(rank) != 0)
-    sizePerRank = outputSize / this->outputChunks.at(rank);
-  else
-    throw mscclpp::Error("Output or Input chunks must be greater than 0", mscclpp::ErrorCode::ExecutorError);
-
-  if (this->isUsingPacket) {
-    return sizePerRank * this->scratchChunks.at(rank) * 2 /* data + flag*/ * 2 /*double buffer*/;
-  }
-  return sizePerRank * this->scratchChunks.at(rank);
-}
+
+// Returns the scratch buffer size needed on `rank` for an input of `inputSize`.
+//
+// The input is divided into inputChunks.at(rank) chunks, rounding the chunk size up, and the scratch buffer holds
+// scratchChunks.at(rank) chunks of that size (four times as much when the plan uses packets). The result saturates
+// at the largest size_t value instead of wrapping around.
+//
+// Throws mscclpp::Error (ExecutorError) if `rank` has no input chunks.
+size_t ExecutionPlan::Impl::getScratchBufferSize(int rank, size_t inputSize) const {
+  const size_t numInputChunks = this->inputChunks.at(rank);
+  if (numInputChunks == 0)
+    throw mscclpp::Error("Input chunks must be greater than 0", mscclpp::ErrorCode::ExecutorError);
+
+  // Ceiling division in a form that cannot wrap, unlike (inputSize + numInputChunks - 1) / numInputChunks.
+  const size_t sizePerChunk = inputSize / numInputChunks + (inputSize % numInputChunks != 0 ? 1 : 0);
+  const size_t numScratchChunks = this->scratchChunks.at(rank);
+  const size_t multiplier = this->isUsingPacket ? 2 /* data + flag*/ * 2 /*double buffer*/ : 1;
+
+  // Saturate rather than overflow: a wrapped product would look like a small, valid size to the caller.
+  constexpr size_t kMaxSize = static_cast<size_t>(-1);
+  if (numScratchChunks != 0 && sizePerChunk > kMaxSize / multiplier / numScratchChunks) return kMaxSize;
+  return sizePerChunk * numScratchChunks * multiplier;
+}
+
+// Returns the scratch buffer size needed on `rank` for an input of maxMessageSize.
+//
+// getScratchBufferSize() already rounds the per-chunk size up, so maxMessageSize is passed through as is: rounding
+// it up to a multiple of the chunk count first gives the same result, but that intermediate value wraps around
+// when maxMessageSize is close to the largest size_t value.
+//
+// Throws mscclpp::Error (ExecutorError) if `rank` has no input chunks.
+size_t ExecutionPlan::Impl::getMaxScratchBufferSize(int rank) const {
+  return this->getScratchBufferSize(rank, this->maxMessageSize);
+}
+
 std::vector<Operation> ExecutionPlan::Impl::getOperations(int rank, int threadblock) const {
   return this->operations.at(rank)[threadblock];
 }

diff --git a/src/executor/executor.cc b/src/executor/executor.cc
@@ -140,8 +140,8 @@ struct Executor::Impl {
 
   ExecutionContext setupExecutionContext(int rank, void* sendbuff, void* recvbuff, size_t inputMessageSize,
                                          size_t outputMessageSize, size_t constSrcOffset, size_t constDstOffset,
-                                         size_t sendBufferSize, size_t recvBufferSize, const ExecutionPlan& plan) {
-    ExecutionContextKey key = {sendbuff, recvbuff, sendBufferSize, recvBufferSize, plan.impl_->name};
+                                         size_t sendMemRange, size_t recvMemRange, const ExecutionPlan& plan) {
+    ExecutionContextKey key = {sendbuff, recvbuff, sendMemRange, recvMemRange, plan.impl_->name};
     DeviceExecutionPlanKey devicePlanKey = {inputMessageSize, outputMessageSize, constSrcOffset, constDstOffset};
     if (this->contexts.find(key) != this->contexts.end()) {
       auto& devicePlans = this->contexts[key].deviceExecutionPlans;
@@ -167,7 +167,8 @@ struct Executor::Impl {
     plan.impl_->loadExecutionPlan(inputMessageSize, outputMessageSize, constSrcOffset, constDstOffset);
 
     ExecutionContext context;
-    size_t scratchBufferSize = plan.impl_->getScratchBufferSize(rank, sendBufferSize, recvBufferSize);
+    size_t maxScratchBufferSize = plan.impl_->getMaxScratchBufferSize(rank);
+    size_t scratchBufferSize = std::min(plan.impl_->getScratchBufferSize(rank, sendMemRange), maxScratchBufferSize);
     std::shared_ptr<char> scratchBuffer;
     if (isNvlsSupported()) {
       scratchBuffer = allocSharedPhysicalCuda<char>(scratchBufferSize);
@@ -179,8 +180,8 @@ struct Executor::Impl {
     context.proxyService = std::make_shared<ProxyService>();
     context.nthreadsPerBlock = plan.impl_->getNThreadsPerBlock();
     this->setupConnections(context, rank, plan);
-    this->setupRegisteredMemories(context, sendbuff, recvbuff, sendBufferSize, recvBufferSize, rank, plan);
-    this->setupChannels(context, sendbuff, recvbuff, sendBufferSize, recvBufferSize, rank, plan);
+    this->setupRegisteredMemories(context, sendbuff, recvbuff, sendMemRange, recvMemRange, rank, plan);
+    this->setupChannels(context, sendbuff, recvbuff, sendMemRange, recvMemRange, rank, plan);
     this->setupNvlsChannels(context, sendbuff, recvbuff, rank, plan);
     this->setupDeviceExecutionPlan(context, devicePlanKey, rank, plan);
     context.deviceExecutionPlansBuffers[devicePlanKey] =
@@ -433,16 +434,24 @@ Executor::Executor(std::shared_ptr<Communicator> comm) : impl_(std::make_unique<
+// Prepares the execution context for `plan` and hands it to launchKernel().
+//
+// `sendbuff` and `recvbuff` may point into the middle of a larger allocation: the context is set up for the
+// enclosing allocations (base address and size as reported by cuMemGetAddressRange), and the offsets of the two
+// pointers within them are passed along separately. `sendBuffSize` / `recvBuffSize` are forwarded as the input /
+// output message sizes.
 void Executor::execute(int rank, void* sendbuff, void* recvbuff, size_t sendBuffSize,
                        [[maybe_unused]] size_t recvBuffSize, DataType dataType, const ExecutionPlan& plan,
                        cudaStream_t stream, PacketType packetType) {
-  size_t sendBytes, recvBytes;
+  // Base address and size of the allocations that contain sendbuff / recvbuff.
+  size_t sendMemRange, recvMemRange;
   CUdeviceptr sendBasePtr, recvBasePtr;
-  MSCCLPP_CUTHROW(cuMemGetAddressRange(&sendBasePtr, &sendBytes, (CUdeviceptr)sendbuff));
-  MSCCLPP_CUTHROW(cuMemGetAddressRange(&recvBasePtr, &recvBytes, (CUdeviceptr)recvbuff));
+  MSCCLPP_CUTHROW(cuMemGetAddressRange(&sendBasePtr, &sendMemRange, (CUdeviceptr)sendbuff));
+  MSCCLPP_CUTHROW(cuMemGetAddressRange(&recvBasePtr, &recvMemRange, (CUdeviceptr)recvbuff));
+  // Offsets of the caller's pointers from the start of their allocations.
   size_t offsetIn = (char*)sendbuff - (char*)sendBasePtr;
   size_t offsetOut = (char*)recvbuff - (char*)recvBasePtr;
 
   ExecutionContext context =
       this->impl_->setupExecutionContext(rank, (void*)sendBasePtr, (void*)recvBasePtr, sendBuffSize, recvBuffSize,
-                                         offsetIn, offsetOut, sendBytes, recvBytes, plan);
+                                         offsetIn, offsetOut, sendMemRange, recvMemRange, plan);
   this->impl_->launchKernel(context, rank, sendbuff, recvbuff, dataType, stream, packetType);
 }
 

diff --git a/src/include/execution_plan.hpp b/src/include/execution_plan.hpp
@@ -72,7 +72,8 @@ struct ExecutionPlan::Impl {
   std::vector<NvlsInfo> getNvlsInfos(int rank) const;
   std::vector<int> getConnectedPeers(int rank) const;
   std::vector<BufferType> getConnectedBufferTypes(int rank) const;
-  size_t getScratchBufferSize(int rank, size_t inputSize, size_t outputSize) const;
+  size_t getScratchBufferSize(int rank, size_t inputSize) const;
+  size_t getMaxScratchBufferSize(int rank) const;
   std::vector<Operation> getOperations(int rank, int threadblock) const;
   int getThreadblockCount(int rank) const;
   int getNThreadsPerBlock() const;