Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ class MoeDistributeCombineV2Layered
// Class-scope compile-time constants (fragment of the MoeDistributeCombineV2Layered class body).
constexpr static uint32_t WEIGHT_VALUE_NUM = 16U;
constexpr static uint64_t GM2IPC_SYNC_FLAG = 12345ULL;
constexpr static uint64_t RDMA_TOKEN_ARRIVED_FLAG = 123ULL;
// 400 * 1024 * 1024 = 419430400, which fits in uint32_t without overflow.
// This amount is subtracted from winSize when IPC_DATA_SIZE is derived and is added to the
// minimum window size (winSizeMin) checked against winContext_->winSize.
// NOTE(review): presumably a byte count (400 MiB) reserved for notify data — confirm the unit.
constexpr static uint32_t NOTIFY_DATA_SIZE = 400U * 1024U * 1024U;
constexpr static uint64_t RDMA_TOKEN_END_FLAG = 321ULL;
constexpr static uint32_t MAX_BS_NUM = 512U; // adapted for bs=512
constexpr static uint32_t FLAG_SINGLE_CNT = 4;
Expand Down Expand Up @@ -402,7 +403,7 @@ __aicore__ inline void MoeDistributeCombineV2Layered<TemplateMC2TypeA2layeredFun
qp_info_ = (__gm__ HcclAiRMAInfo *)(((__gm__ HcclA2CombineOpParam *)contextGM)->aiRMAInfo);

halfWinSize_ = RDMA_DATA_SIZE / 2U;
IPC_DATA_SIZE = winContext_->winSize - RDMA_DATA_SIZE - IPC_DATA_OFFSET;
IPC_DATA_SIZE = winContext_->winSize - RDMA_DATA_SIZE - IPC_DATA_OFFSET - NOTIFY_DATA_SIZE;
dataSpaceSize_ = halfWinSize_ - STATE_SPACE_SIZE;
windowInGM_ = hccl_.GetWindowsInAddr(rankId_);
bufferIdGlobal_.SetGlobalBuffer((__gm__ uint32_t *)(windowInGM_ + dataSpaceSize_ + worldSize_ * STATE_OFFSET));
Expand Down Expand Up @@ -433,7 +434,7 @@ __aicore__ inline void MoeDistributeCombineV2Layered<TemplateMC2TypeA2layeredFun

uint64_t winSizeMin =
moeExpertNum_ * axisBS_ * (axisHExpandXTypeSize_ + EXTRA_TOKEN_INFO_NUM * axisK_ * sizeof(uint32_t)) +
IPC_DATA_OFFSET + RDMA_DATA_SIZE; // 考虑负载极其不均衡时,HCCL BUFFSIZE需要开的大小
IPC_DATA_OFFSET + RDMA_DATA_SIZE + NOTIFY_DATA_SIZE; // 考虑负载极其不均衡时,HCCL BUFFSIZE需要开的大小

assert(winContext_->winSize >= winSizeMin,
"The HCCL_BUFFSIZE is %lluMB, the min value should be %lluMB. \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ class MoeDistributeDispatchV2Layered
// Class-scope compile-time constants (fragment of the MoeDistributeDispatchV2Layered class body).
constexpr static uint32_t ARRIVAL_STATUS = 2;
constexpr static uint32_t SKIP_STATUS = 3;
constexpr static uint32_t EXTRA_TOKEN_INFO_NUM = 4U; // expert info, weight info, quantization scale, arrival flag
// 400 * 1024 * 1024 = 419430400, which fits in uint32_t without overflow.
// This amount is added to the minimum window size (winSizeMin) and subtracted from the space
// divided among ranks when RANK_SIZE_ON_IPC is computed.
// NOTE(review): presumably a byte count (400 MiB) reserved for notify data — confirm the unit.
constexpr static uint32_t NOTIFY_DATA_SIZE = 400U * 1024U * 1024U;

template <AscendC::HardEvent event>
__aicore__ inline void SyncFunc()
Expand Down Expand Up @@ -246,7 +247,7 @@ __aicore__ inline void MoeDistributeDispatchV2Layered<TemplateMC2TypeA2layeredFu

uint64_t winSizeMin =
moeExpertNum_ * axisBS_ * (axisH_ * sizeof(XType) + EXTRA_TOKEN_INFO_NUM * alignK_ * sizeof(uint32_t)) +
IPC_DATA_OFFSET + RDMA_DATA_SIZE; // 考虑负载极其不均衡时,HCCL BUFFSIZE需要开的大小
IPC_DATA_OFFSET + RDMA_DATA_SIZE + NOTIFY_DATA_SIZE; // 考虑负载极其不均衡时,HCCL BUFFSIZE需要开的大小
assert(winContext_->winSize >= winSizeMin,
"The HCCL_BUFFSIZE is %lluMB, the min value should be %lluMB. \
epWorldSize:%u, epRankId:%u, moeExpertNum:%u, quantMode:%u, globalBs:%u, bs:%u, k:%u, h:%u, aivNum:%u, \
Expand All @@ -264,7 +265,8 @@ __aicore__ inline void MoeDistributeDispatchV2Layered<TemplateMC2TypeA2layeredFu
bufferId_ = bufferChosenGlobal_(0);
windowInGM_ = windowInGM_ + halfWinSize_ * bufferId_;
windowOutGM_ = windowOutGM_ + halfWinSize_ * bufferId_;
RANK_SIZE_ON_IPC = (totalSize_ - totalWinSize_ - IPC_DATA_OFFSET) / (localMoeExpertNum_ * worldSize_);
RANK_SIZE_ON_IPC =
(totalSize_ - totalWinSize_ - IPC_DATA_OFFSET - NOTIFY_DATA_SIZE) / (localMoeExpertNum_ * worldSize_);
RANK_SIZE_ON_IPC = (RANK_SIZE_ON_IPC / IPC_BUFF_ALIGN) * IPC_BUFF_ALIGN;

// IPC buffer init
Expand Down