Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Third-party catlass dependency, pinned to its stable v1 branch;
# fetched via `git submodule init && git submodule update` in build_aclnn.sh.
[submodule "csrc/third_party/catlass"]
	path = csrc/third_party/catlass
	url = https://gitcode.com/cann/catlass.git
	branch = catlass-v1-stable
30 changes: 28 additions & 2 deletions csrc/build_aclnn.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
ROOT_DIR=$1
SOC_VERSION=$2

git config --global --add safe.directory "$ROOT_DIR"

if [[ "$SOC_VERSION" =~ ^ascend310 ]]; then
# ASCEND310P series
# currently, no custom aclnn ops for ASCEND310 series
Expand All @@ -11,18 +13,42 @@ if [[ "$SOC_VERSION" =~ ^ascend310 ]]; then
exit 0
elif [[ "$SOC_VERSION" =~ ^ascend910b ]]; then
# ASCEND910B (A2) series
CUSTOM_OPS="grouped_matmul_swiglu_quant_weight_nz_tensor_list;lightning_indexer;sparse_flash_attention"
CUSTOM_OPS="grouped_matmul_swiglu_quant_weight_nz_tensor_list;lightning_indexer;sparse_flash_attention;dispatch_ffn_combine"
SOC_ARG="ascend910b"
elif [[ "$SOC_VERSION" =~ ^ascend910_93 ]]; then
# ASCEND910C (A3) series
CUSTOM_OPS="grouped_matmul_swiglu_quant_weight_nz_tensor_list;lightning_indexer;sparse_flash_attention"
CUSTOM_OPS="grouped_matmul_swiglu_quant_weight_nz_tensor_list;lightning_indexer;sparse_flash_attention;dispatch_ffn_combine"
SOC_ARG="ascend910_93"
else
# others
# currently, no custom aclnn ops for other series
exit 0
fi

git submodule init
git submodule update


# For compatibility between CANN 8.5 and CANN 8.3: copy moe_distribute_base.h
# out of the installed toolkit and rename its structs so the local copy can
# coexist with the toolkit-provided definitions.
file_path=$(find /usr/local/Ascend/ascend-toolkit -name "moe_distribute_base.h" 2>/dev/null | head -n1)
if [ -z "$file_path" ]; then
    echo "cannot find moe_distribute_base.h file in CANN env"
    exit 1
fi

SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
TARGET_DIR="$SCRIPT_DIR/dispatch_ffn_combine/op_kernel/utils"
TARGET_FILE="$TARGET_DIR/$(basename "$file_path")"

# Ensure the destination exists before copying (a fresh checkout may lack it).
mkdir -p "$TARGET_DIR"
echo "*************************************"
echo "$file_path"
echo "$TARGET_DIR"
cp "$file_path" "$TARGET_DIR/"

# Rename the structs in the copied header to avoid redefinition clashes with
# the header still shipped inside the CANN toolkit.
sed -i 's/struct HcclOpResParam {/struct HcclOpResParamCustom {/g' "$TARGET_FILE"
sed -i 's/struct HcclRankRelationResV2 {/struct HcclRankRelationResV2Custom {/g' "$TARGET_FILE"


# build custom ops
cd csrc
rm -rf build output
Expand Down
2 changes: 1 addition & 1 deletion csrc/cmake/func.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -282,7 +282,7 @@ function(add_ops_src_copy)
set(_BUILD_FLAG ${SRC_COPY_DST}/${SRC_COPY_TARGET_NAME}.done)
add_custom_command(OUTPUT ${_BUILD_FLAG}
COMMAND mkdir -p ${SRC_COPY_DST}
COMMAND cp -rf ${SRC_COPY_SRC}/op_kernel/*.* ${SRC_COPY_DST}
COMMAND cp -rf ${SRC_COPY_SRC}/op_kernel/* ${SRC_COPY_DST}
COMMAND touch ${_BUILD_FLAG}
)

Expand Down
66 changes: 66 additions & 0 deletions csrc/dispatch_ffn_combine/op_host/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd.
# This file is a part of the CANN Open Software.
# Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
# Please refer to the License for details. You may not use this file except in compliance with the License.
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
# See LICENSE in the root of the software repository for the full text of the License.
# ======================================================================================================================

# Locate the AscendC public headers. Toolkit installs differ in the name of the
# architecture directory, so probe the known candidates in priority order and
# keep the first one that exists (matching the original aarch64 -> arm64 ->
# ${CMAKE_SYSTEM_PROCESSOR} preference).
set(_DISPATCH_FFN_INC_OPTS)
foreach(_dispatch_ffn_arch aarch64 arm64 ${CMAKE_SYSTEM_PROCESSOR})
    set(_dispatch_ffn_ascendc_inc ${ASCEND_CANN_PACKAGE_PATH}/${_dispatch_ffn_arch}-linux/ascendc/include)
    if (EXISTS ${_dispatch_ffn_ascendc_inc})
        list(APPEND _DISPATCH_FFN_INC_OPTS -I${_dispatch_ffn_ascendc_inc})
        break()
    endif()
endforeach()

# Optional catlass submodule headers (present once the submodule is checked out).
if (EXISTS ${CMAKE_SOURCE_DIR}/third_party/catlass/include)
    list(APPEND _DISPATCH_FFN_INC_OPTS -I${CMAKE_SOURCE_DIR}/third_party/catlass/include)
endif()

add_ops_compile_options(
    OP_NAME DispatchFFNCombine
    OPTIONS --cce-auto-sync=on
            -Wno-deprecated-declarations
            -Werror
            -DHCCL_COMM
            ${_DISPATCH_FFN_INC_OPTS}
)

# Op definition registration for the aclnnInner host path.
target_sources(op_host_aclnnInner PRIVATE
    dispatch_ffn_combine_def.cpp
)

# Public two-phase aclnn API entry points.
target_sources(opapi PRIVATE
    aclnn_dispatch_ffn_combine.cpp
)

if (NOT BUILD_OPEN_PROJECT)
    target_sources(aclnn_ops_train PRIVATE
        aclnn_dispatch_ffn_combine.cpp
    )

    target_sources(aclnn_ops_infer PRIVATE
        aclnn_dispatch_ffn_combine.cpp
    )
endif()

# Tiling implementation; needs the kernel directory for headers shared
# between host tiling and the device kernel.
target_sources(optiling PRIVATE
    dispatch_ffn_combine_tiling.cpp
)

target_include_directories(optiling PRIVATE
    ${CMAKE_CURRENT_SOURCE_DIR}
    ${CMAKE_CURRENT_SOURCE_DIR}/../op_kernel
)

# Shape/dtype inference registration.
target_sources(opsproto PRIVATE
    dispatch_ffn_combine_proto.cpp
)

# Install the public aclnn header. The file name is known, so install it
# directly instead of globbing (the previous glob also used a misleading
# _GMM_-prefixed variable copy-pasted from another op).
install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/aclnn_dispatch_ffn_combine.h
    DESTINATION ${ACLNN_INC_INSTALL_DIR} OPTIONAL
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
/**
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
* This file is a part of the CANN Open Software.
* Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
#include "aclnn_dispatch_ffn_combine.h"
#include <algorithm>
// #include "aclnn_kernels/common/op_error_check.h"
// #include "opdev/op_log.h"
// #include "opdev/common_types.h"
// #include "opdev/platform.h"
// #include "ophost/matmul_util.h"
#include <unistd.h>
#include <vector>
#include <string>
#include <iostream>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/file.h>
#include <climits>
#include "../op_host/error_log.h"
// using namespace op;

// using namespace op;

#ifdef __cplusplus
extern "C" {
#endif

// Selects which engine services the HCCL communication tasks issued by the op.
enum NnopbaseHcclServerType {
    NNOPBASE_HCCL_SERVER_TYPE_AICPU = 0,
    NNOPBASE_HCCL_SERVER_TYPE_MTE,
    NNOPBASE_HCCL_SERVER_TYPE_END
};

// Generated inner entry points, implemented by the compiled op package.
extern aclnnStatus aclnnInnerDispatchFFNCombineGetWorkspaceSize(
    const aclTensor* x, const aclTensor* weight1, const aclTensor* weight2,
    const aclTensor* expertId, const aclTensor* scale1, const aclTensor* scale2,
    const aclTensor* probs,
    const char* group, int64_t maxOutputSize,
    bool transB, bool weightNz,
    const aclTensor* out,
    uint64_t* workspaceSize, aclOpExecutor** executor);
extern aclnnStatus aclnnInnerDispatchFFNCombine(void* workspace, uint64_t workspaceSize,
                                                aclOpExecutor* executor, aclrtStream stream);
// Weak symbol: may be absent on older runtimes, so it must be null-checked before use.
extern "C" void __attribute__((weak)) NnopbaseSetHcclServerType(void* executor, NnopbaseHcclServerType sType);

/**
 * First-phase API of aclnnDispatchFFNCombine: computes the workspace size.
 * Thin wrapper over the generated inner entry point; it pins the attributes
 * that the public signature deliberately hides (transB = false, weightNz = true).
 * @return aclnnStatus status code from the inner implementation.
 */
aclnnStatus aclnnDispatchFFNCombineGetWorkspaceSize(const aclTensor* x, const aclTensor* weight1, const aclTensor* weight2,
                                                    const aclTensor* expertId, const aclTensor* scale1, const aclTensor* scale2,
                                                    const aclTensor* probs,
                                                    const char* group, int64_t maxOutputSize,
                                                    const aclTensor* out,
                                                    uint64_t* workspaceSize, aclOpExecutor** executor)
{
    // Fixed attribute values for the public API; callers cannot override them.
    const bool transB = false;
    const bool weightNz = true;

    return aclnnInnerDispatchFFNCombineGetWorkspaceSize(x, weight1, weight2, expertId, scale1, scale2, probs, group,
                                                        maxOutputSize, transB, weightNz,
                                                        out, workspaceSize, executor);
}

/**
 * Second-phase API of aclnnDispatchFFNCombine: launches the op on the stream.
 * Routes HCCL servicing to the MTE engine when the runtime exports
 * NnopbaseSetHcclServerType (weak symbol; skipped when unavailable).
 * @return aclnnStatus status code from the inner implementation.
 */
aclnnStatus aclnnDispatchFFNCombine(void* workspace, uint64_t workspaceSize, aclOpExecutor* executor, aclrtStream stream)
{
    if (NnopbaseSetHcclServerType != nullptr) {
        NnopbaseSetHcclServerType(executor, NNOPBASE_HCCL_SERVER_TYPE_MTE);
    }
    return aclnnInnerDispatchFFNCombine(workspace, workspaceSize, executor, stream);
}
#ifdef __cplusplus
}
#endif
61 changes: 61 additions & 0 deletions csrc/dispatch_ffn_combine/op_host/aclnn_dispatch_ffn_combine.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
/**
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
* This file is a part of the CANN Open Software.
* Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/

#ifndef OP_API_INC_DISPATCH_FFN_COMBINE_
#define OP_API_INC_DISPATCH_FFN_COMBINE_

#include <string>

#include "aclnn/aclnn_base.h"
#include "hccl/hccl.h"
#include "hccl/hccl_types.h"

#ifdef __cplusplus
extern "C" {
#endif

/**
 * Op function: fuses the distributed MoE pipeline from InitRouting through Unpermute
 * (token dispatch, expert FFN, result combine) into a single operator.
 * @brief First-phase API of aclnnDispatchFFNCombine: computes the workspace size for the calculation.
 * @domain aclnn_ops_infer
 * @param [in] x: input activations; dtype float16/bf16 per the op definition.
 * @param [in] weight1: first FFN weight matrix (int8 per the op definition).
 * @param [in] weight2: second FFN weight matrix (int8 per the op definition).
 * @param [in] expertId: per-token expert indices (int32 per the op definition).
 * @param [in] scale1: quantization scale for weight1 (int64 per the op definition).
 * @param [in] scale2: quantization scale for weight2 (int64 per the op definition).
 * @param [in] probs: per-token routing probabilities (float per the op definition).
 * @param [in] group: string naming the HCCL communication group.
 * @param [in] maxOutputSize: output size upper bound — NOTE(review): exact semantics not
 *             visible in this header; confirm against the tiling implementation.
 * @param [out] out: result tensor; dtype matches x.
 * @param [out] workspaceSize: returns the workspace size to allocate on the NPU device.
 * @param [out] executor: returns the op executor holding the computation flow.
 * @return aclnnStatus: status code.
 */
__attribute__((visibility("default"))) aclnnStatus aclnnDispatchFFNCombineGetWorkspaceSize(const aclTensor* x, const aclTensor* weight1, const aclTensor* weight2,
                                   const aclTensor* expertId, const aclTensor* scale1, const aclTensor* scale2,
                                   const aclTensor* probs,
                                   const char* group, int64_t maxOutputSize,
                                   const aclTensor* out,
                                   uint64_t* workspaceSize, aclOpExecutor** executor);

/**
 * @brief Second-phase API of aclnnDispatchFFNCombine: performs the computation.
 * @param [in] workspace: start address of the workspace allocated on the NPU device.
 * @param [in] workspaceSize: workspace size obtained from the first-phase API,
 *             aclnnDispatchFFNCombineGetWorkspaceSize.
 * @param [in] executor: op executor holding the computation flow.
 * @param [in] stream: acl stream to launch on.
 * @return aclnnStatus: status code.
 */
__attribute__((visibility("default"))) aclnnStatus aclnnDispatchFFNCombine(void* workspace, uint64_t workspaceSize, aclOpExecutor* executor,
                                   aclrtStream stream);

#ifdef __cplusplus
}
#endif

#endif  // OP_API_INC_DISPATCH_FFN_COMBINE_
88 changes: 88 additions & 0 deletions csrc/dispatch_ffn_combine/op_host/dispatch_ffn_combine_def.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
/**
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
* This file is a part of the CANN Open Software.
* Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/

/*!
* \file dispatch_ffn_combine_def.cpp
* \brief
*/
#include "register/op_def_registry.h"

namespace ops {
// Operator definition for DispatchFFNCombine: registers inputs, outputs,
// attributes and AI Core configuration with the op registry.
// Each DataType()/Format() list enumerates the supported dtype/format
// combinations position-by-position, consistently across all tensors.
class DispatchFFNCombine : public OpDef {
public:
    explicit DispatchFFNCombine(const char *name) : OpDef(name) {
        // Input activations (fp16 / bf16 / bf16 across the three variants).
        this->Input("a")
            .ParamType(REQUIRED)
            .DataType({ge::DT_FLOAT16, ge::DT_BF16, ge::DT_BF16})
            .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND})
            .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND});
        // First FFN weight: int8; FRACTAL_NZ layout in the third variant.
        this->Input("w1")
            .ParamType(REQUIRED)
            .DataType({ge::DT_INT8, ge::DT_INT8, ge::DT_INT8})
            .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_FRACTAL_NZ})
            .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_FRACTAL_NZ})
            .IgnoreContiguous();
        // Second FFN weight: int8; FRACTAL_NZ layout in the third variant.
        this->Input("w2")
            .ParamType(REQUIRED)
            .DataType({ge::DT_INT8, ge::DT_INT8, ge::DT_INT8})
            .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_FRACTAL_NZ})
            .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_FRACTAL_NZ})
            .IgnoreContiguous();
        // Per-token expert indices.
        this->Input("expertIdx")
            .ParamType(REQUIRED)
            .DataType({ge::DT_INT32, ge::DT_INT32, ge::DT_INT32})
            .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND})
            .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND});
        // Quantization scale for w1.
        this->Input("scale1")
            .ParamType(REQUIRED)
            .DataType({ge::DT_INT64, ge::DT_INT64, ge::DT_INT64})
            .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND})
            .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND});
        // Quantization scale for w2.
        this->Input("scale2")
            .ParamType(REQUIRED)
            .DataType({ge::DT_INT64, ge::DT_INT64, ge::DT_INT64})
            .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND})
            .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND});
        // Per-token routing probabilities.
        this->Input("probs")
            .ParamType(REQUIRED)
            .DataType({ge::DT_FLOAT, ge::DT_FLOAT, ge::DT_FLOAT})
            .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND})
            .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND});

        // Output: dtype tracks the input activations position-by-position.
        this->Output("out")
            .ParamType(REQUIRED)
            .DataType({ge::DT_FLOAT16, ge::DT_BF16, ge::DT_BF16})
            .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND})
            .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND,ge::FORMAT_ND});

        this->Attr("group").AttrType(REQUIRED).String();   // HCCL communication group name
        this->Attr("M").AttrType(OPTIONAL).Int();
        this->Attr("transB").AttrType(OPTIONAL).Bool(false);
        this->Attr("weightNz").AttrType(OPTIONAL).Bool(false);

        OpAICoreConfig aicore_config;
        aicore_config.DynamicCompileStaticFlag(true)
            .DynamicFormatFlag(true)
            .DynamicRankSupportFlag(true)
            .DynamicShapeSupportFlag(true)
            .NeedCheckSupportFlag(false)
            .PrecisionReduceFlag(true)
            .ExtendCfgInfo("aclnnSupport.value", "support_aclnn")
            .ExtendCfgInfo("jitCompile.flag", "static_false")
            .ExtendCfgInfo("multiKernelSupportDynamicGraph.value", "multi_kernel");
        // Same AI Core configuration for both supported SoC generations.
        this->AICore().AddConfig("ascend910_93", aicore_config);
        this->AICore().AddConfig("ascend910b", aicore_config);
        // Declares which attribute carries the HCCL group name for MC2.
        this->MC2().HcclGroup("group");
    }
};

OP_ADD(DispatchFFNCombine);
} // namespace ops
Loading
Loading