diff --git a/csrc/deepep/ops/op_host/cam_moe_combine_normal_tiling.cc b/csrc/deepep/ops/op_host/cam_moe_combine_normal_tiling.cc index 34071f44d..86952cfd1 100644 --- a/csrc/deepep/ops/op_host/cam_moe_combine_normal_tiling.cc +++ b/csrc/deepep/ops/op_host/cam_moe_combine_normal_tiling.cc @@ -530,19 +530,20 @@ static ge::graphStatus CamMoeCombineNormalA3TilingFuncImpl(gert::TilingContext * uint64_t epWorldSize = static_cast(tilingData->camMoeCombineNormalInfo.epWorldSize); uint64_t k = static_cast(tilingData->camMoeCombineNormalInfo.k); uint64_t perRoundTokens = tilingData->camMoeCombineNormalInfo.perRoundTokens; + uint64_t realMaxBs = tilingData->camMoeCombineNormalInfo.realMaxBs; + uint64_t realBs = std::min(perRoundTokens, realMaxBs); // combine数据区 token首地址对齐512 uint64_t tokenNeedSizeCombine = ((h * MAX_OUT_DTYPE_SIZE + WIN_ADDR_ALIGN - 1UL) / WIN_ADDR_ALIGN) * WIN_ADDR_ALIGN; - uint64_t actualSize = - (perRoundTokens * k * tokenNeedSizeCombine + COMBINE_STATE_WIN_OFFSET + NOTIFY_DISPATCH_WIN_OFFSET) * - DOUBLE_DATA_BUFFER; + uint64_t actualSize = (realBs * k * tokenNeedSizeCombine + COMBINE_STATE_WIN_OFFSET + NOTIFY_DISPATCH_WIN_OFFSET) * + DOUBLE_DATA_BUFFER; OP_TILING_CHECK( (actualSize > maxWindowSize), OP_LOGE(nodeName, - "HCCL_BUFFSIZE is too SMALL, perRoundTokens = %lu, h = %lu, epWorldSize = %lu, localMoeExpertNum = %u," + "HCCL_BUFFSIZE is too SMALL, realBs = %lu, h = %lu, epWorldSize = %lu, localMoeExpertNum = %u," " tokenNeedSizeCombine = %lu, k = %lu, NEEDED_HCCL_BUFFSIZE(" "((perRoundTokens * k * tokenNeedSizeCombine)) + 8MB + 102MB) * 2) = %luMB, " "HCCL_BUFFSIZE=%luMB.", - perRoundTokens, h, epWorldSize, localMoeExpertNum, tokenNeedSizeCombine, k, actualSize / MB_SIZE + 1UL, + realBs, h, epWorldSize, localMoeExpertNum, tokenNeedSizeCombine, k, actualSize / MB_SIZE + 1UL, maxWindowSize / MB_SIZE), return ge::GRAPH_FAILED); tilingData->camMoeCombineNormalInfo.totalWinSize = maxWindowSize;