Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion csrc/build_aclnn.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ elif [[ "$SOC_VERSION" =~ ^ascend910b ]]; then
ABSOLUTE_CATLASS_PATH=$(cd "${CATLASS_PATH}" && pwd)
export CPATH=${ABSOLUTE_CATLASS_PATH}:${CPATH}

CUSTOM_OPS="grouped_matmul_swiglu_quant_weight_nz_tensor_list;lightning_indexer;sparse_flash_attention;matmul_allreduce_add_rmsnorm"
CUSTOM_OPS="grouped_matmul_swiglu_quant_weight_nz_tensor_list;lightning_indexer;sparse_flash_attention;matmul_allreduce_add_rmsnorm;moe_init_routing_custom"
SOC_ARG="ascend910b"
elif [[ "$SOC_VERSION" =~ ^ascend910_93 ]]; then
# ASCEND910C (A3) series
Expand Down Expand Up @@ -69,6 +69,7 @@ elif [[ "$SOC_VERSION" =~ ^ascend910_93 ]]; then
"moe_dispatch_normal"
"dispatch_layout"
"notify_dispatch"
"moe_init_routing_custom"
)
CUSTOM_OPS=$(IFS=';'; echo "${CUSTOM_OPS_ARRAY[*]}")
SOC_ARG="ascend910_93"
Expand Down
55 changes: 55 additions & 0 deletions csrc/moe_init_routing_custom/op_host/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd.
# This file is a part of the CANN Open Software.
# Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
# Please refer to the License for details. You may not use this file except in compliance with the License.
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
# See LICENSE in the root of the software repository for the full text of the License.
# ======================================================================================================================

# Host-side build rules for the MoeInitRoutingCustom operator.

add_ops_compile_options(
    OP_NAME MoeInitRoutingCustom
    OPTIONS --cce-auto-sync=on
            -Wno-deprecated-declarations
            -Werror
)

# Operator definition registered with the host aclnnExc library.
target_sources(op_host_aclnnExc PRIVATE
    moe_init_routing_custom_def.cpp
)

# Level-0 launcher and level-2 aclnn entry points.
target_sources(opapi PRIVATE
    moe_init_routing_custom.cpp
    aclnn_moe_init_routing_custom.cpp
)

if (NOT BUILD_OPEN_PROJECT)
    target_sources(aclnn_ops_train PRIVATE
        moe_init_routing_custom.cpp
        aclnn_moe_init_routing_custom.cpp
    )

    target_sources(aclnn_ops_infer PRIVATE
        moe_init_routing_custom.cpp
        aclnn_moe_init_routing_custom.cpp
    )
endif ()

# Tiling implementation.
target_sources(optiling PRIVATE
    moe_init_routing_custom_tiling_base.cpp
    moe_init_routing_custom_tiling.cpp
)

target_include_directories(optiling PRIVATE
    ${CMAKE_CURRENT_SOURCE_DIR}
)

# InferShape implementation.
target_sources(opsproto PRIVATE
    moe_init_routing_custom_infershape.cpp
)

# Install the public aclnn header. The target is a single known file, so it is
# named directly instead of via file(GLOB), which is not re-evaluated at build
# time and previously used a variable name (_GMM_Aclnn_header) copy-pasted from
# the GroupedMatmul operator.
install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/aclnn_moe_init_routing_custom.h
    DESTINATION ${ACLNN_INC_INSTALL_DIR} OPTIONAL
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
/**
 * This program is free software, you can redistribute it and/or modify.
 * Copyright (c) 2025 Huawei Technologies Co., Ltd.
* This file is a part of the CANN Open Software.
* Licensed under CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/

#include <algorithm>
#include <tuple>
#include <cstddef>
#include "opdev/make_op_executor.h"
#include "aclnn_kernels/contiguous.h"
#include "opdev/tensor_view_utils.h"
#include "aclnn_kernels/common/op_error_check.h"
#include "opdev/op_log.h"
#include "aclnn_kernels/cast.h"
#include "opdev/common_types.h"
#include "moe_init_routing_custom.h"
#include "aclnn_moe_init_routing_custom.h"

using namespace op;

#ifdef __cplusplus
extern "C" {
#endif

namespace {
// Dimension-count constants. NOTE(review): unreferenced anywhere in this
// translation unit — presumably intended for shape validation; wire them into
// checks or remove them. TODO confirm with the author.
static const int64_t MOE_DIM_2 = 2;
static const int64_t MOE_DIM_1 = 1;
}

// Per-tensor dtype allowlists for the operator's inputs and outputs.
// NOTE(review): none of these lists is consulted in this file — no
// CheckDtypeValid-style validation runs before launch. Confirm whether dtype
// checking was intended in aclnnMoeInitRoutingCustomGetWorkspaceSize.
static const std::initializer_list<DataType> DTYPE_SUPPORT_LIST_X= {DataType::DT_FLOAT16, DataType::DT_BF16, DataType::DT_FLOAT, DataType::DT_INT8};
static const std::initializer_list<DataType> DTYPE_SUPPORT_LIST_EXPERT_IDX = {DataType::DT_INT32};
static const std::initializer_list<DataType> DTYPE_SUPPORT_LIST_SCALE = {DataType::DT_FLOAT};
static const std::initializer_list<DataType> DTYPE_SUPPORT_LIST_OFFSET= {DataType::DT_FLOAT};
static const std::initializer_list<DataType> DTYPE_SUPPORT_LIST_EXPANDED_X_OUT = {DataType::DT_FLOAT16, DataType::DT_BF16, DataType::DT_FLOAT, DataType::DT_INT8};
static const std::initializer_list<DataType> DTYPE_SUPPORT_LIST_EXPANDED_ROW_IDX_OUT = {DataType::DT_INT32};
static const std::initializer_list<DataType> DTYPE_SUPPORT_LIST_EXPERT_TOKENS_COUNT_OR_CUMSUMOUT = {DataType::DT_INT64};
static const std::initializer_list<DataType> DTYPE_SUPPORT_LIST_EXPANDED_SCALE_OUT = {DataType::DT_FLOAT};

// Validates that every mandatory tensor argument is non-null.
// The optional inputs (scaleOptional/offsetOptional) are deliberately not
// checked here — nullptr is a legal value for them.
// Returns false on the first null argument (OP_CHECK_NULL is expected to log
// the offending argument name — macro behavior not visible here, confirm).
static inline bool CheckNotNull(const aclTensor *x,
const aclTensor *expertIdx,
const aclTensor *expandedXOut,
const aclTensor *expandedRowIdxOut,
const aclTensor *expertTokensCountOrCumsumOut,
const aclTensor *expandedScaleOut) {
OP_CHECK_NULL(x, return false);
OP_CHECK_NULL(expertIdx, return false);
OP_CHECK_NULL(expandedXOut, return false);
OP_CHECK_NULL(expandedRowIdxOut, return false);
OP_CHECK_NULL(expertTokensCountOrCumsumOut, return false);
OP_CHECK_NULL(expandedScaleOut, return false);

return true;
}

/**
 * Phase-1 entry of the two-phase aclnn call: validates arguments, records the
 * MoeInitRoutingCustom task graph on an executor, and reports the workspace
 * size required for execution in phase 2 (aclnnMoeInitRoutingCustom).
 *
 * Mandatory: x, expertIdx and the four output tensors; scaleOptional and
 * offsetOptional may be nullptr. Attribute semantics (activeNum, dropPadMode,
 * quantMode, ...) are defined by the MoeInitRoutingCustom kernel.
 *
 * @return ACLNN_SUCCESS on success; ACLNN_ERR_PARAM_NULLPTR if a mandatory
 *         pointer is null; ACLNN_ERR_INNER_* on internal failures.
 */
aclnnStatus aclnnMoeInitRoutingCustomGetWorkspaceSize(const aclTensor *x,
const aclTensor *expertIdx,
const aclTensor *scaleOptional,
const aclTensor *offsetOptional,
int64_t activeNum,
int64_t expertCapacity,
int64_t expertNum,
int64_t dropPadMode,
int64_t expertTokensNumType,
bool expertTokensNumFlag,
int64_t quantMode,
const aclIntArray *activeExpertRangeOptional,
int64_t rowIdxType,
const aclTensor *expandedXOut,
const aclTensor *expandedRowIdxOut,
const aclTensor *expertTokensCountOrCumsumOut,
const aclTensor *expandedScaleOut,
uint64_t *workspaceSize,
aclOpExecutor **executor)
{
L2_DFX_PHASE_1(aclnnMoeInitRoutingCustom,
DFX_IN(x, expertIdx, scaleOptional, offsetOptional,
activeNum, expertCapacity, expertNum, dropPadMode,
expertTokensNumType, expertTokensNumFlag, quantMode, activeExpertRangeOptional, rowIdxType),
DFX_OUT(expandedXOut, expandedRowIdxOut, expertTokensCountOrCumsumOut, expandedScaleOut));

// Both out-parameters are written unconditionally below; reject null early
// instead of dereferencing them.
CHECK_RET(workspaceSize != nullptr && executor != nullptr, ACLNN_ERR_PARAM_NULLPTR);

CHECK_RET(CheckNotNull(x, expertIdx, expandedXOut, expandedRowIdxOut,
expertTokensCountOrCumsumOut, expandedScaleOut), ACLNN_ERR_PARAM_NULLPTR);

auto uniqueExecutor = CREATE_EXECUTOR();
CHECK_RET(uniqueExecutor.get() != nullptr, ACLNN_ERR_INNER_CREATE_EXECUTOR);

// Kernels require contiguous memory; insert copies for strided views.
auto xContiguous = l0op::Contiguous(x, uniqueExecutor.get());
CHECK_RET(xContiguous != nullptr, ACLNN_ERR_INNER_CREATE_EXECUTOR);
auto expertIdxContiguous = l0op::Contiguous(expertIdx, uniqueExecutor.get());
CHECK_RET(expertIdxContiguous != nullptr, ACLNN_ERR_INNER_CREATE_EXECUTOR);

// Optional inputs stay nullptr when absent; the level-0 op accepts that.
const aclTensor* scaleContiguous = nullptr;
if (scaleOptional != nullptr) {
scaleContiguous = l0op::Contiguous(scaleOptional, uniqueExecutor.get());
CHECK_RET(scaleContiguous != nullptr, ACLNN_ERR_INNER_CREATE_EXECUTOR);
}

const aclTensor* offsetContiguous = nullptr;
if (offsetOptional != nullptr) {
offsetContiguous = l0op::Contiguous(offsetOptional, uniqueExecutor.get());
CHECK_RET(offsetContiguous != nullptr, ACLNN_ERR_INNER_CREATE_EXECUTOR);
}

// Record the level-0 op; it allocates the four internal result tensors
// (the caller's output tensors act as shape/dtype templates).
auto [expandedXRes, expandedRowIdxRes, expertTokensRes, expandedScaleRes] =
l0op::MoeInitRoutingCustom(xContiguous, expertIdxContiguous, scaleContiguous, offsetContiguous,
activeNum, expertCapacity, expertNum, dropPadMode, expertTokensNumType, expertTokensNumFlag,
quantMode, activeExpertRangeOptional, rowIdxType, expandedXOut, expandedRowIdxOut,
expertTokensCountOrCumsumOut, expandedScaleOut, uniqueExecutor.get());
CHECK_RET(expandedXRes != nullptr && expandedRowIdxRes != nullptr &&
expertTokensRes != nullptr && expandedScaleRes != nullptr, ACLNN_ERR_INNER_NULLPTR);

// Copy internal results back into the caller-provided output views.
CHECK_RET(l0op::ViewCopy(expandedXRes, expandedXOut, uniqueExecutor.get()) != nullptr, ACLNN_ERR_INNER_NULLPTR);
CHECK_RET(l0op::ViewCopy(expandedRowIdxRes, expandedRowIdxOut, uniqueExecutor.get()) != nullptr, ACLNN_ERR_INNER_NULLPTR);
CHECK_RET(l0op::ViewCopy(expertTokensRes, expertTokensCountOrCumsumOut, uniqueExecutor.get()) != nullptr, ACLNN_ERR_INNER_NULLPTR);
CHECK_RET(l0op::ViewCopy(expandedScaleRes, expandedScaleOut, uniqueExecutor.get()) != nullptr, ACLNN_ERR_INNER_NULLPTR);

*workspaceSize = uniqueExecutor->GetWorkspaceSize();
uniqueExecutor.ReleaseTo(executor);
return ACLNN_SUCCESS;
}
/**
 * Phase-2 entry of the two-phase aclnn call: executes the task graph recorded
 * by aclnnMoeInitRoutingCustomGetWorkspaceSize.
 *
 * @param workspace     Device buffer of at least the size reported in phase 1.
 * @param workspaceSize Size of `workspace` in bytes (the phase-1 value).
 * @param executor      Executor handed back via ReleaseTo() in phase 1.
 * @param stream        Runtime stream the kernels are launched on.
 * @return The status of CommonOpExecutorRun (ACLNN_SUCCESS on success).
 */
aclnnStatus aclnnMoeInitRoutingCustom(void* workspace, uint64_t workspaceSize, aclOpExecutor* executor,
aclrtStream stream)
{
L2_DFX_PHASE_2(aclnnMoeInitRoutingCustom);
return CommonOpExecutorRun(workspace, workspaceSize, executor, stream);
}

#ifdef __cplusplus
}
#endif
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
/**
 * This program is free software, you can redistribute it and/or modify.
 * Copyright (c) 2025 Huawei Technologies Co., Ltd.
* This file is a part of the CANN Open Software.
* Licensed under CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/

#ifndef OP_API_INC_MOE_INIT_ROUTING_CUSTOM_H_
#define OP_API_INC_MOE_INIT_ROUTING_CUSTOM_H_

#include "aclnn/aclnn_base.h"

#ifdef __cplusplus
extern "C" {
#endif

/**
 * Phase 1 of the two-phase aclnn protocol for MoeInitRoutingCustom:
 * validates arguments, records the task graph, and reports the device
 * workspace size needed by the phase-2 call.
 *
 * x and expertIdx are the mandatory inputs; scaleOptional and offsetOptional
 * may be nullptr. The four *Out tensors are mandatory outputs. The int64_t/
 * bool attributes configure the kernel (semantics defined by the operator).
 *
 * @param workspaceSize [out] Required workspace size in bytes; must be non-null.
 * @param executor      [out] Executor to pass to aclnnMoeInitRoutingCustom; must be non-null.
 * @return ACLNN_SUCCESS on success, an ACLNN_ERR_* code otherwise.
 */
__attribute__((visibility("default"))) aclnnStatus aclnnMoeInitRoutingCustomGetWorkspaceSize(const aclTensor *x,
const aclTensor *expertIdx,
const aclTensor *scaleOptional,
const aclTensor *offsetOptional,
int64_t activeNum,
int64_t expertCapacity,
int64_t expertNum,
int64_t dropPadMode,
int64_t expertTokensNumType,
bool expertTokensNumFlag,
int64_t quantMode,
const aclIntArray *activeExpertRangeOptional,
int64_t rowIdxType,
const aclTensor *expandedXOut,
const aclTensor *expandedRowIdxOut,
const aclTensor *expertTokensCountOrCumsumOut,
const aclTensor *expandedScaleOut,
uint64_t *workspaceSize,
aclOpExecutor **executor);

/**
 * Phase 2 of the two-phase aclnn protocol: launches the recorded task graph.
 *
 * @param workspace     Device buffer of at least the phase-1 reported size.
 * @param workspaceSize Size of `workspace` in bytes.
 * @param executor      Executor returned by the phase-1 call.
 * @param stream        Runtime stream to launch on.
 * @return ACLNN_SUCCESS on success, an ACLNN_ERR_* code otherwise.
 */
__attribute__((visibility("default"))) aclnnStatus aclnnMoeInitRoutingCustom(void* workspace, uint64_t workspaceSize, aclOpExecutor* executor,
aclrtStream stream);

#ifdef __cplusplus
}
#endif

#endif
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
/**
* This program is free software, you can redistribute it and/or modify.
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
* This file is a part of the CANN Open Software.
* Licensed under CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/

#include <tuple>
#include "moe_init_routing_custom.h"
#include "opdev/make_op_executor.h"
#include "opdev/op_def.h"
#include "opdev/op_dfx.h"
#include "opdev/op_executor.h"
#include "opdev/op_log.h"
#include "opdev/shape_utils.h"
#include "aclnn_kernels/common/op_error_check.h"

using namespace op;

namespace l0op {
OP_TYPE_REGISTER(MoeInitRoutingCustom);

// Level-0 launcher for MoeInitRoutingCustom.
// Allocates four result tensors on the executor — each mirrors the view shape
// and dtype of the corresponding caller-provided output tensor, in ND format —
// then records an AI Core launch with the inputs, outputs, and attributes.
// Returns (expandedX, expandedRowIdx, expertTokensCountOrCumsum, expandedScale),
// or four nullptrs if any allocation fails.
std::tuple<aclTensor*, aclTensor*, aclTensor*, aclTensor*> MoeInitRoutingCustom(const aclTensor *x, const aclTensor *expertIdx, const aclTensor *scale,
const aclTensor *offset, int64_t activeNum, int64_t expertCapacity,
int64_t expertNum, int64_t dropPadMode, int64_t expertTokensNumType,
bool expertTokensNumFlag, int64_t quantMode, const aclIntArray *activeExpertRange,
int64_t rowIdxType, const aclTensor *expandedX, const aclTensor *expandedRowIdx,
const aclTensor *expertTokensCountOrCumsum, const aclTensor *expandedScale, aclOpExecutor *executor)
{
L0_DFX(MoeInitRoutingCustom, x, expertIdx, scale, offset, activeNum, expertCapacity, expertNum, dropPadMode, expertTokensNumType, expertTokensNumFlag,
quantMode, activeExpertRange, rowIdxType, expandedX, expandedRowIdx, expertTokensCountOrCumsum, expandedScale);

// The caller's output tensors serve only as shape/dtype templates here;
// fresh tensors are allocated on the executor for the kernel to write.
auto expandedXOut = executor->AllocTensor(expandedX->GetViewShape(), expandedX->GetDataType(), Format::FORMAT_ND);
auto expandedRowIdxOut = executor->AllocTensor(expandedRowIdx->GetViewShape(), expandedRowIdx->GetDataType(), Format::FORMAT_ND);
auto expertTokensCountOrCumsumOut = executor->AllocTensor(expertTokensCountOrCumsum->GetViewShape(), expertTokensCountOrCumsum->GetDataType(), Format::FORMAT_ND);
auto expandedScaleOut = executor->AllocTensor(expandedScale->GetViewShape(), expandedScale->GetDataType(), Format::FORMAT_ND);
if (expandedXOut == nullptr || expandedRowIdxOut == nullptr || expertTokensCountOrCumsumOut == nullptr || expandedScaleOut == nullptr) {
OP_LOGE(ACLNN_ERR_INNER_NULLPTR, "alloc expandedXOut or expandedRowIdxOut or expertTokensCountOrCumsumOut or expandedScaleOut tensor failed.");
return std::tuple<aclTensor*, aclTensor*, aclTensor*, aclTensor*>(nullptr, nullptr, nullptr, nullptr);
}

// NOTE(review): the result of ADD_TO_LAUNCHER_LIST_AICORE is ignored —
// confirm whether the macro yields a status that should be checked here,
// as other ops typically do.
ADD_TO_LAUNCHER_LIST_AICORE(
MoeInitRoutingCustom, OP_INPUT(x, expertIdx, scale, offset), OP_OUTPUT(expandedXOut, expandedRowIdxOut, expertTokensCountOrCumsumOut, expandedScaleOut), OP_ATTR(activeNum, expertCapacity, expertNum, dropPadMode, expertTokensNumType, expertTokensNumFlag, quantMode, activeExpertRange, rowIdxType));
return std::tuple<aclTensor*, aclTensor*, aclTensor*, aclTensor*>(expandedXOut, expandedRowIdxOut, expertTokensCountOrCumsumOut, expandedScaleOut); // freshly allocated kernel outputs
}

} // namespace l0op
25 changes: 25 additions & 0 deletions csrc/moe_init_routing_custom/op_host/moe_init_routing_custom.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
/**
 * This program is free software, you can redistribute it and/or modify.
 * Copyright (c) 2025 Huawei Technologies Co., Ltd.
* This file is a part of the CANN Open Software.
* Licensed under CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/

#ifndef OP_API_INC_LEVEL0_MOE_INIT_ROUTING_CUSTOM_H
#define OP_API_INC_LEVEL0_MOE_INIT_ROUTING_CUSTOM_H

#include <tuple>
#include "opdev/op_executor.h"

namespace l0op {
// Level-0 launcher for the MoeInitRoutingCustom AI Core kernel.
// x/expertIdx are mandatory inputs; scale/offset may be nullptr. The
// expandedX/expandedRowIdx/expertTokensCountOrCumsum/expandedScale tensors
// supply the shapes/dtypes for the internally allocated results.
// Returns (expandedX, expandedRowIdx, expertTokensCountOrCumsum, expandedScale)
// tensors allocated on `executor`, or four nullptrs on allocation failure.
std::tuple<aclTensor*, aclTensor*, aclTensor*, aclTensor*> MoeInitRoutingCustom(const aclTensor *x, const aclTensor *expertIdx, const aclTensor *scale,
const aclTensor *offset, int64_t activeNum, int64_t expertCapacity,
int64_t expertNum, int64_t dropPadMode, int64_t expertTokensNumType,
bool expertTokensNumFlag, int64_t quantMode, const aclIntArray *activeExpertRange,
int64_t rowIdxType, const aclTensor *expandedX, const aclTensor *expandedRowIdx,
const aclTensor *expertTokensCountOrCumsum, const aclTensor *expandedScale, aclOpExecutor *executor);
} // namespace l0op
#endif // OP_API_INC_LEVEL0_MOE_INIT_ROUTING_CUSTOM_H
Loading
Loading