diff --git a/CMakeLists.txt b/CMakeLists.txt index 9e3564e3aeb..811ad5f86e9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -61,7 +61,7 @@ set(VLLM_ASCEND_CUSTOM_OP ${CMAKE_CURRENT_SOURCE_DIR}/csrc/batch_matmul_transpose/op_kernel/batch_matmul_transpose_kernel.cpp ) -set(VLLM_ASCEND_CUSTOM_OP_EXCLUDE +set(VLLM_ASCEND_CUSTOM_OP_EXCLUDE_310P ${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernels/bgmv_expand.cpp ${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernels/bgmv_shrink.cpp ${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernels/sgmv_expand.cpp @@ -70,10 +70,21 @@ set(VLLM_ASCEND_CUSTOM_OP_EXCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/csrc/batch_matmul_transpose/op_kernel/batch_matmul_transpose_kernel.cpp ) +set(VLLM_ASCEND_CUSTOM_OP_EXCLUDE_ASCEND950 + ${CMAKE_CURRENT_SOURCE_DIR}/csrc/mla_preprocess/op_kernel/mla_preprocess_kernel.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/csrc/batch_matmul_transpose/op_kernel/batch_matmul_transpose_kernel.cpp +) + if(SOC_VERSION MATCHES "ascend310p.*") message(STATUS "310P hardware detected: disabling MLAPO operators") message(STATUS "310P hardware detected: excluding batch_matmul_transpose operators") - list(REMOVE_ITEM VLLM_ASCEND_CUSTOM_OP ${VLLM_ASCEND_CUSTOM_OP_EXCLUDE}) + list(REMOVE_ITEM VLLM_ASCEND_CUSTOM_OP ${VLLM_ASCEND_CUSTOM_OP_EXCLUDE_310P}) +endif() + +if(SOC_VERSION MATCHES "ascend950") + message(STATUS "A5 hardware detected: disabling MLAPO operators") + message(STATUS "A5 hardware detected: excluding batch_matmul_transpose operators") + list(REMOVE_ITEM VLLM_ASCEND_CUSTOM_OP ${VLLM_ASCEND_CUSTOM_OP_EXCLUDE_ASCEND950}) endif() ascendc_library(vllm_ascend_kernels SHARED diff --git a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_fullload_dynamic_quant.h b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_fullload_dynamic_quant.h index 9d77c5e2e44..dfff54cc7b9 100644 --- a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_fullload_dynamic_quant.h +++ b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_fullload_dynamic_quant.h @@ -99,9 +99,9 @@ __aicore__ inline void MoeV2FullLoadDynamicQuant::SortCompute() { LocalTensor expertIdxLocal = inLocal[0]; LocalTensor expertIdxLocalFp32 = expertIdxLocal.ReinterpretCast(); Cast(expertIdxLocalFp32, expertIdxLocal, RoundMode::CAST_ROUND, this->totalLength); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Muls(expertIdxLocalFp32, expertIdxLocalFp32, (float)-1, this->totalLength); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); int64_t duplicateNum = this->totalLength % ONE_REPEAT_SORT_NUM; if (duplicateNum > 0) { int duplicateIndex = this->totalLength - duplicateNum; @@ -110,38 +110,38 @@ __aicore__ inline void MoeV2FullLoadDynamicQuant::SortCompute() { mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM); uint64_t mask[2] = {mask0, 0}; Duplicate(expertIdxLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); } LocalTensor concatLocal; LocalTensor tempTensor = tempBuffer.Get(GetSortLen(this->sortNum_)); Concat(concatLocal, expertIdxLocalFp32, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); LocalTensor rowIdxLocal = inLocal[this->sortNum_].template ReinterpretCast(); LocalTensor sortedLocal = sortedBuffer.Get(GetSortLen(this->sortNum_)); Sort(sortedLocal, concatLocal, rowIdxLocal, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); LocalTensor expandedExpertIdxLocal = expandedExpertIdxCopyOutQueue_.AllocTensor(); expandDstToSrcRowLocal = expandDstToSrcRowQueue_.AllocTensor(); LocalTensor expandDstToSrcRowLocalFp32 = expandDstToSrcRowLocal.ReinterpretCast(); Extract(expandedExpertIdxLocal, expandDstToSrcRowLocal, sortedLocal, this->sortNum_ / ONE_REPEAT_SORT_NUM); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Cast(expandDstToSrcRowLocalFp32, expandDstToSrcRowLocal.ReinterpretCast(), RoundMode::CAST_ROUND, this->totalLength); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Muls(expandedExpertIdxLocal, expandedExpertIdxLocal, (float)-1, this->totalLength); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); LocalTensor expandedExpertIdxLocalInt32; expandedExpertIdxLocalInt32 = expandedExpertIdxLocal.ReinterpretCast(); Cast(expandedExpertIdxLocalInt32, expandedExpertIdxLocal, RoundMode::CAST_ROUND, this->totalLength); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); expandedExpertIdxCopyOutQueue_.EnQue(expandedExpertIdxLocalInt32); LocalTensor expandedRowIdx = expandedRowIdxCopyOutQueue_.AllocTensor(); LocalTensor expandedRowIdxU32 = expandedRowIdx.ReinterpretCast(); Muls(expandDstToSrcRowLocalFp32, expandDstToSrcRowLocalFp32, (float)-1, this->totalLength); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); ArithProgression(inLocal[this->sortNum_], 0, 1, this->totalLength); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); if (duplicateNum > 0) { int duplicateIndex = this->totalLength - duplicateNum; uint64_t mask0 = UINT64_MAX; @@ -149,14 +149,14 @@ __aicore__ inline void MoeV2FullLoadDynamicQuant::SortCompute() { mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM); uint64_t mask[2] = {mask0, 0}; Duplicate(expandDstToSrcRowLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); } Concat(concatLocal, expandDstToSrcRowLocalFp32, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Sort(sortedLocal, concatLocal, rowIdxLocal, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Extract(tempTensor, expandedRowIdxU32, sortedLocal, this->sortNum_ / ONE_REPEAT_SORT_NUM); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); expandedRowIdxCopyOutQueue_.EnQue(expandedRowIdx); sortDataCopyInQueue.FreeTensor(inLocal); } @@ -227,31 +227,31 @@ __aicore__ inline void MoeV2FullLoadDynamicQuant::Compute(LocalTensor& if constexpr (!IsSameType::value) { Cast(inLocal, inLocal.ReinterpretCast()[colsAlign], RoundMode::CAST_NONE, this->cols_); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); } if (smoothType != 0) { Mul(inLocal, inLocal, smoothLocal, this->cols_); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); } Abs(tempLocal, inLocal, this->cols_); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); ReduceMax(dynamicQuantLocal, tempLocal, tempLocal, this->cols_); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); float maxValue = dynamicQuantLocal.GetValue(0) / 127.0f; Duplicate(dynamicQuantLocal, maxValue, 8); Duplicate(tempLocal, maxValue, this->cols_); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Div(tempLocal, inLocal, tempLocal, this->cols_); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Cast(tempLocal.ReinterpretCast(), tempLocal, RoundMode::CAST_TRUNC, this->cols_); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Cast(outLocal, tempLocal.ReinterpretCast(), RoundMode::CAST_ROUND, this->cols_); diff --git a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_fullload_quant.h b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_fullload_quant.h index 4d83d6642f1..8884889ac00 100644 --- a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_fullload_quant.h +++ b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_fullload_quant.h @@ -56,34 +56,34 @@ __aicore__ inline void MoeV2FullLoadQuant::Compute(int64_t xLocalLength) { uint32_t elements = Align(this->cols, sizeof(int8_t)) * xLocalLength; if constexpr (IsSameType::value) { Cast(floatLocal, inLocal, RoundMode::CAST_NONE, elements); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Cast(halfLocal, floatLocal, RoundMode::CAST_NONE, elements); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Muls(halfLocal, halfLocal, static_cast(this->scale), elements); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Adds(halfLocal, halfLocal, static_cast(this->offset), elements); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); LocalTensor intLocal = floatLocal.ReinterpretCast(); Cast(intLocal, halfLocal, RoundMode::CAST_RINT, elements); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); SetDeqScale((half)1.000000e+00f); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Cast(halfLocal, intLocal, RoundMode::CAST_RINT, elements); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Cast(outLocal, halfLocal, RoundMode::CAST_RINT, elements); } else if constexpr (IsSameType::value) { Cast(halfLocal, inLocal, RoundMode::CAST_NONE, elements); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Muls(halfLocal, halfLocal, static_cast(this->scale), elements); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Adds(halfLocal, halfLocal, static_cast(this->offset), elements); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Cast(outLocal, halfLocal, RoundMode::CAST_RINT, elements); } else { Muls(inLocal, inLocal, static_cast(this->scale), elements); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Adds(inLocal, inLocal, static_cast(this->offset), elements); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Cast(outLocal, inLocal, RoundMode::CAST_RINT, elements); } inputXCopyOutQueue.EnQue(outLocal); diff --git a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_fullload_quant_base.h b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_fullload_quant_base.h index 8e8195c995a..7cc61fda894 100644 --- a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_fullload_quant_base.h +++ b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_fullload_quant_base.h @@ -94,9 +94,9 @@ __aicore__ inline void MoeV2FullLoadQuantBase::SortCompute() { LocalTensor expertIdxLocal = inLocal[0]; LocalTensor expertIdxLocalFp32 = expertIdxLocal.ReinterpretCast(); Cast(expertIdxLocalFp32, expertIdxLocal, RoundMode::CAST_ROUND, this->totalLength); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Muls(expertIdxLocalFp32, expertIdxLocalFp32, (float)-1, this->totalLength); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); int64_t duplicateNum = this->totalLength % ONE_REPEAT_SORT_NUM; if (duplicateNum > 0) { int duplicateIndex = this->totalLength - duplicateNum; @@ -105,38 +105,38 @@ __aicore__ inline void MoeV2FullLoadQuantBase::SortCompute() { mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM); uint64_t mask[2] = {mask0, 0}; Duplicate(expertIdxLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); } LocalTensor concatLocal; LocalTensor tempTensor = tempBuffer.Get(GetSortLen(this->sortNum)); Concat(concatLocal, expertIdxLocalFp32, tempTensor, this->sortNum / ONE_REPEAT_SORT_NUM); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); LocalTensor rowIdxLocal = inLocal[this->sortNum].template ReinterpretCast(); LocalTensor sortedLocal = sortedBuffer.Get(GetSortLen(this->sortNum)); Sort(sortedLocal, concatLocal, rowIdxLocal, tempTensor, this->sortNum / ONE_REPEAT_SORT_NUM); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); LocalTensor expandedExpertIdxLocal = expandedExpertIdxCopyOutQueue.AllocTensor(); LocalTensor expandDstToSrcRowLocal = expandDstToSrcRowQueue.AllocTensor(); LocalTensor expandDstToSrcRowLocalFp32 = expandDstToSrcRowLocal.ReinterpretCast(); Extract(expandedExpertIdxLocal, expandDstToSrcRowLocal, sortedLocal, this->sortNum / ONE_REPEAT_SORT_NUM); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Cast(expandDstToSrcRowLocalFp32, expandDstToSrcRowLocal.ReinterpretCast(), RoundMode::CAST_ROUND, this->totalLength); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Muls(expandedExpertIdxLocal, expandedExpertIdxLocal, (float)-1, this->totalLength); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); LocalTensor expandedExpertIdxLocalInt32; expandedExpertIdxLocalInt32 = expandedExpertIdxLocal.ReinterpretCast(); Cast(expandedExpertIdxLocalInt32, expandedExpertIdxLocal, RoundMode::CAST_ROUND, this->totalLength); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); expandedExpertIdxCopyOutQueue.EnQue(expandedExpertIdxLocalInt32); LocalTensor expandedRowIdx = expandedRowIdxCopyOutQueue.AllocTensor(); LocalTensor expandedRowIdxU32 = expandedRowIdx.ReinterpretCast(); Muls(expandDstToSrcRowLocalFp32, expandDstToSrcRowLocalFp32, (float)-1, this->totalLength); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); ArithProgression(inLocal[this->sortNum], 0, 1, this->totalLength); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); if (duplicateNum > 0) { int duplicateIndex = this->totalLength - duplicateNum; uint64_t mask0 = UINT64_MAX; @@ -144,14 +144,14 @@ __aicore__ inline void MoeV2FullLoadQuantBase::SortCompute() { mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM); uint64_t mask[2] = {mask0, 0}; Duplicate(expandDstToSrcRowLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); } Concat(concatLocal, expandDstToSrcRowLocalFp32, tempTensor, this->sortNum / ONE_REPEAT_SORT_NUM); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Sort(sortedLocal, concatLocal, rowIdxLocal, tempTensor, this->sortNum / ONE_REPEAT_SORT_NUM); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Extract(tempTensor, expandedRowIdxU32, sortedLocal, this->sortNum / ONE_REPEAT_SORT_NUM); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); expandedRowIdxCopyOutQueue.EnQue(expandedRowIdx); sortDataCopyInQueue.FreeTensor(inLocal); diff --git a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_gather_dynamic_quant.h b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_gather_dynamic_quant.h index 64852f31315..6eced5e9ded 100644 --- a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_gather_dynamic_quant.h +++ b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_gather_dynamic_quant.h @@ -122,31 +122,31 @@ __aicore__ inline void MoeV2GatherDynamicQuant::Compute(LocalTensor& s if constexpr (!IsSameType::value) { Cast(inLocal, inLocal.ReinterpretCast()[perLoopColsAlign], RoundMode::CAST_NONE, this->cols); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); } if (smoothType != 0) { Mul(inLocal, inLocal, smoothLocal, this->cols); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); } Abs(tempLocal, inLocal, this->cols); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); ReduceMax(dynamicQuantLocal, tempLocal, tempLocal, this->cols); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); float maxValue = dynamicQuantLocal.GetValue(0) / 127.0f; Duplicate(dynamicQuantLocal, maxValue, 8); Duplicate(tempLocal, maxValue, this->cols); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Div(tempLocal, inLocal, tempLocal, this->cols); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Cast(tempLocal.ReinterpretCast(), tempLocal, RoundMode::CAST_TRUNC, this->cols); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Cast(outLocal, tempLocal.ReinterpretCast(), RoundMode::CAST_ROUND, this->cols); @@ -285,16 +285,16 @@ __aicore__ inline float MoeV2GatherDynamicQuant::ComputeMax(LocalTensor::value) { Cast(inLocal, inLocal.ReinterpretCast()[perLoopColsAlign], RoundMode::CAST_NONE, colsTileLength); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); } if (smoothType != 0) { Mul(inLocal, inLocal, smoothLocal, colsTileLength); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); } Abs(tempLocal, inLocal, colsTileLength); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); ReduceMax(dynamicQuantLocal[8], tempLocal, tempLocal, colsTileLength); @@ -319,13 +319,13 @@ __aicore__ inline void MoeV2GatherDynamicQuant::ComputeScale(LocalTensor(); Duplicate(tempLocal, scaleTemp, colsTileLength); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Div(tempLocal, inLocal, tempLocal, colsTileLength); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Cast(tempLocal.ReinterpretCast(), tempLocal, RoundMode::CAST_TRUNC, colsTileLength); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Cast(outLocal, tempLocal.ReinterpretCast(), RoundMode::CAST_ROUND, colsTileLength); diff --git a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_gather_quant.h b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_gather_quant.h index 68b7c927afb..f938fac94da 100644 --- a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_gather_quant.h +++ b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_gather_quant.h @@ -95,34 +95,34 @@ __aicore__ inline void MoeV2GatherQuant::Compute() { uint32_t elements = Align(this->colsTileLength, sizeof(T)); if constexpr (IsSameType::value) { Cast(floatLocal, inLocal, RoundMode::CAST_NONE, elements); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Cast(halfLocal, floatLocal, RoundMode::CAST_NONE, elements); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Muls(halfLocal, halfLocal, static_cast(this->scale), elements); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Adds(halfLocal, halfLocal, static_cast(this->offset), elements); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); LocalTensor intLocal = floatLocal.ReinterpretCast(); Cast(intLocal, halfLocal, RoundMode::CAST_RINT, elements); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); SetDeqScale((half)1.000000e+00f); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Cast(halfLocal, intLocal, RoundMode::CAST_RINT, elements); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Cast(outLocal, halfLocal, RoundMode::CAST_RINT, elements); } else if constexpr (IsSameType::value) { Cast(halfLocal, inLocal, RoundMode::CAST_NONE, elements); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Muls(halfLocal, halfLocal, static_cast(this->scale), elements); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Adds(halfLocal, halfLocal, static_cast(this->offset), elements); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Cast(outLocal, halfLocal, RoundMode::CAST_RINT, elements); } else { Muls(inLocal, inLocal, static_cast(this->scale), elements); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Adds(inLocal, inLocal, static_cast(this->offset), elements); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Cast(outLocal, inLocal, RoundMode::CAST_RINT, elements); } inputXCopyOutQueue.EnQue(outLocal); diff --git a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_init_routing_fullload.h b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_init_routing_fullload.h index 539bc69c58f..31709e06aa6 100644 --- a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_init_routing_fullload.h +++ b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_init_routing_fullload.h @@ -88,9 +88,9 @@ __aicore__ inline void MoeV2FullLoad::SortCompute() { LocalTensor expertIdxLocal = inLocal[0]; LocalTensor expertIdxLocalFp32 = expertIdxLocal.ReinterpretCast(); Cast(expertIdxLocalFp32, expertIdxLocal, RoundMode::CAST_ROUND, this->totalLength); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Muls(expertIdxLocalFp32, expertIdxLocalFp32, (float)-1, this->totalLength); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); int64_t duplicateNum = this->totalLength % ONE_REPEAT_SORT_NUM; if (duplicateNum > 0) { int duplicateIndex = this->totalLength - duplicateNum; @@ -99,38 +99,38 @@ __aicore__ inline void MoeV2FullLoad::SortCompute() { mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM); uint64_t mask[2] = {mask0, 0}; Duplicate(expertIdxLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); } LocalTensor concatLocal; LocalTensor tempTensor = tempBuffer.Get(GetSortLen(this->sortNum_)); Concat(concatLocal, expertIdxLocalFp32, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); LocalTensor rowIdxLocal = inLocal[this->sortNum_].template ReinterpretCast(); LocalTensor sortedLocal = sortedBuffer.Get(GetSortLen(this->sortNum_)); Sort(sortedLocal, concatLocal, rowIdxLocal, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); LocalTensor expandedExpertIdxLocal = expandedExpertIdxCopyOutQueue_.AllocTensor(); LocalTensor expandDstToSrcRowLocal = expandDstToSrcRowQueue_.AllocTensor(); LocalTensor expandDstToSrcRowLocalFp32 = expandDstToSrcRowLocal.ReinterpretCast(); Extract(expandedExpertIdxLocal, expandDstToSrcRowLocal, sortedLocal, this->sortNum_ / ONE_REPEAT_SORT_NUM); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Cast(expandDstToSrcRowLocalFp32, expandDstToSrcRowLocal.ReinterpretCast(), RoundMode::CAST_ROUND, this->totalLength); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Muls(expandedExpertIdxLocal, expandedExpertIdxLocal, (float)-1, this->totalLength); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); LocalTensor expandedExpertIdxLocalInt32; expandedExpertIdxLocalInt32 = expandedExpertIdxLocal.ReinterpretCast(); Cast(expandedExpertIdxLocalInt32, expandedExpertIdxLocal, RoundMode::CAST_ROUND, this->totalLength); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); expandedExpertIdxCopyOutQueue_.EnQue(expandedExpertIdxLocalInt32); LocalTensor expandedRowIdx = expandedRowIdxCopyOutQueue_.AllocTensor(); LocalTensor expandedRowIdxU32 = expandedRowIdx.ReinterpretCast(); Muls(expandDstToSrcRowLocalFp32, expandDstToSrcRowLocalFp32, (float)-1, this->totalLength); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); ArithProgression(inLocal[this->sortNum_], 0, 1, this->totalLength); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); if (duplicateNum > 0) { int duplicateIndex = this->totalLength - duplicateNum; uint64_t mask0 = UINT64_MAX; @@ -138,14 +138,14 @@ __aicore__ inline void MoeV2FullLoad::SortCompute() { mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM); uint64_t mask[2] = {mask0, 0}; Duplicate(expandDstToSrcRowLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); } Concat(concatLocal, expandDstToSrcRowLocalFp32, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Sort(sortedLocal, concatLocal, rowIdxLocal, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Extract(tempTensor, expandedRowIdxU32, sortedLocal, this->sortNum_ / ONE_REPEAT_SORT_NUM); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); expandedRowIdxCopyOutQueue_.EnQue(expandedRowIdx); sortDataCopyInQueue.FreeTensor(inLocal); diff --git a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_mrgsort_out.h b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_mrgsort_out.h index f08e56de0db..4c6ee76e7ea 100644 --- a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_mrgsort_out.h +++ b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_mrgsort_out.h @@ -168,9 +168,9 @@ __aicore__ inline void MoeV2MrgsortOut::UpdateSortInfo() { __aicore__ inline void MoeV2MrgsortOut::Extract() { AscendC::Extract(this->ubOutput1, this->ubOutput2, this->tempBuffer, Ceil(curLoopSortedNum, ONE_REPEAT_SORT_NUM)); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Muls(this->ubOutput1, this->ubOutput1, (float)-1, Align(curLoopSortedNum, sizeof(float))); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Cast(this->ubOutputInt1, this->ubOutput1, RoundMode::CAST_ROUND, Align(curLoopSortedNum, sizeof(float))); } diff --git a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_sort_multi_core.h b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_sort_multi_core.h index 8484e837a3e..8db224ef12c 100644 --- a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_sort_multi_core.h +++ b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_sort_multi_core.h @@ -106,9 +106,9 @@ __aicore__ inline void MoeV2SortMultiCore::UBSortCompute(int64_t progress, int64 expertForSourceRowLocalFp32 = expertForSourceRowLocal.ReinterpretCast(); Cast(expertForSourceRowLocalFp32, expertForSourceRowLocal, RoundMode::CAST_ROUND, sortNum); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Muls(expertForSourceRowLocalFp32, expertForSourceRowLocalFp32, (float)-1, sortNum); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); int64_t duplicateNum = size % ONE_REPEAT_SORT_NUM; if (duplicateNum > 0) { @@ -118,7 +118,7 @@ __aicore__ inline void MoeV2SortMultiCore::UBSortCompute(int64_t progress, int64 mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM); uint64_t mask[2] = {mask0, 0}; Duplicate(expertForSourceRowLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); } LocalTensor concatLocal = expertForSourceRowLocalFp32; diff --git a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_sort_one_core.h b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_sort_one_core.h index 0778308d944..c370ef0e21d 100644 --- a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_sort_one_core.h +++ b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_sort_one_core.h @@ -56,9 +56,9 @@ __aicore__ inline void MoeV2SortOneCore::SortCompute() { LocalTensor expertForSourceRowLocal = inLocal[0]; LocalTensor expertForSourceRowLocalFp32 = expertForSourceRowLocal.ReinterpretCast(); Cast(expertForSourceRowLocalFp32, expertForSourceRowLocal, RoundMode::CAST_ROUND, this->tileLength); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Muls(expertForSourceRowLocalFp32, expertForSourceRowLocalFp32, (float)-1, this->tileLength); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); int64_t duplicateNum = this->totalLength % ONE_REPEAT_SORT_NUM; if (duplicateNum > 0) { @@ -68,28 +68,28 @@ __aicore__ inline void MoeV2SortOneCore::SortCompute() { mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM); uint64_t mask[2] = {mask0, 0}; Duplicate(expertForSourceRowLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); } LocalTensor concatLocal; LocalTensor tempTensor = tempBuffer.Get(GetSortLen(this->sortNum)); Concat(concatLocal, expertForSourceRowLocalFp32, tempTensor, this->sortNum / ONE_REPEAT_SORT_NUM); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); LocalTensor sortedLocal = sortedBuffer.Get(GetSortLen(this->sortNum)); LocalTensor sourceRowLocal; sourceRowLocal = inLocal[this->sortNum].ReinterpretCast(); Sort(sortedLocal, concatLocal, sourceRowLocal, tempTensor, this->sortNum / ONE_REPEAT_SORT_NUM); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); LocalTensor outLocal = sortDataCopyOutQueue.AllocTensor(); LocalTensor sortedExpertForSourceRowLocal = outLocal[0]; LocalTensor expandDstToSrcRowLocal; expandDstToSrcRowLocal = outLocal[this->sortNum].ReinterpretCast(); Extract(sortedExpertForSourceRowLocal, expandDstToSrcRowLocal, sortedLocal, this->sortNum / ONE_REPEAT_SORT_NUM); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Muls(sortedExpertForSourceRowLocal, sortedExpertForSourceRowLocal, (float)-1, this->tileLength); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); LocalTensor expertForSourceRowLocalInt32; expertForSourceRowLocalInt32 = sortedExpertForSourceRowLocal.ReinterpretCast(); diff --git a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_src_to_dst_and_gather.h b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_src_to_dst_and_gather.h index 2fb99194ce1..a327eb4eb91 100644 --- a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_src_to_dst_and_gather.h +++ b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_src_to_dst_and_gather.h @@ -164,31 +164,31 @@ __aicore__ inline void MoeV2SrcToDstAndGather::Compute(int32_t sr if constexpr (!IsSameType::value) { Cast(inLocal, inLocal.template ReinterpretCast()[perLoopColsAlign], RoundMode::CAST_NONE, this->cols); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); } if (smoothType != 0) { Mul(inLocal, inLocal, smoothLocal, this->cols); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); } Abs(tempLocal, inLocal, this->cols); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); ReduceMax(dynamicQuantLocal, tempLocal, tempLocal, this->cols); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); float maxValue = dynamicQuantLocal.GetValue(0) / 127.0f; Duplicate(dynamicQuantLocal, maxValue, 8); Duplicate(tempLocal, maxValue, this->cols); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Div(tempLocal, inLocal, tempLocal, this->cols); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Cast(tempLocal.ReinterpretCast(), tempLocal, RoundMode::CAST_TRUNC, this->cols); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Cast(outLocal, tempLocal.ReinterpretCast(), RoundMode::CAST_ROUND, this->cols); @@ -274,7 +274,7 @@ __aicore__ inline float MoeV2SrcToDstAndGather::ComputeMax(LocalT if constexpr (!IsSameType::value) { Cast(inLocal, inLocal.ReinterpretCast()[perLoopColsAlign], RoundMode::CAST_NONE, colsTileLength); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); } if (smoothType != 0) { @@ -284,11 +284,11 @@ __aicore__ inline float MoeV2SrcToDstAndGather::ComputeMax(LocalT smoothLocal = smoothInQueue.DeQue(); Mul(inLocal, inLocal, smoothLocal, colsTileLength); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); } Abs(tempLocal, inLocal, colsTileLength); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); ReduceMax(dynamicQuantLocal[8], tempLocal, tempLocal, colsTileLength); @@ -314,13 +314,13 @@ __aicore__ inline void MoeV2SrcToDstAndGather::ComputeScale(Local inLocal = inputXInQueue.DeQue(); Duplicate(tempLocal, scaleTemp, colsTileLength); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Div(tempLocal, inLocal, tempLocal, colsTileLength); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Cast(tempLocal.ReinterpretCast(), tempLocal, RoundMode::CAST_TRUNC, colsTileLength); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Cast(outLocal, tempLocal.ReinterpretCast(), RoundMode::CAST_ROUND, colsTileLength); diff --git a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_src_to_dst_op.h b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_src_to_dst_op.h index 521a032e9b2..657378b4d53 100644 --- a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_src_to_dst_op.h +++ b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_src_to_dst_op.h @@ -75,13 +75,13 @@ __aicore__ inline void MoeV2SrcToDstOp::Compute(int64_t progress) { LocalTensor outLocal = copyOutQueue.AllocTensor(); LocalTensor assistTensor = assistBuffer.Get(ASSIST_NUM); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); int64_t loops = Ceil(currentLoopRows, ASSIST_INDEX_NUM); for (int64_t i = 0; i < loops; i++) { Adds(outLocal[i * ASSIST_NUM], assistTensor, static_cast(this->perLoopRows * progress + i * ASSIST_INDEX_NUM), ASSIST_NUM); } - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); copyOutQueue.EnQue(outLocal); } diff --git a/csrc/dispatch_ffn_combine/op_kernel/utils/hccl_shmem.hpp b/csrc/dispatch_ffn_combine/op_kernel/utils/hccl_shmem.hpp index 93d4c9e7d4e..1adc5e3a683 100644 --- a/csrc/dispatch_ffn_combine/op_kernel/utils/hccl_shmem.hpp +++ b/csrc/dispatch_ffn_combine/op_kernel/utils/hccl_shmem.hpp @@ -226,7 +226,7 @@ class HcclShmem { AscendC::CrossCoreSetFlag<0x0, PIPE_MTE3>(RECV_SYNC_EVENT_ID); AscendC::CrossCoreSetFlag<0x0, PIPE_MTE3>(SEND_SYNC_EVENT_ID); AscendC::CrossCoreWaitFlag(SEND_SYNC_EVENT_ID); - pipe_barrier(PIPE_ALL); + AscendC::PipeBarrier(); ctrBuffer.SetValue(0, epStateValue_); AscendC::SetFlag(EVENT_ID0); diff --git a/csrc/dispatch_gmm_combine_decode/op_kernel/dispatch_gmm_combine_decode/raw_distributed/cam_moe_distribute_combine.h b/csrc/dispatch_gmm_combine_decode/op_kernel/dispatch_gmm_combine_decode/raw_distributed/cam_moe_distribute_combine.h index c80678e4674..9fb468bb795 100644 --- a/csrc/dispatch_gmm_combine_decode/op_kernel/dispatch_gmm_combine_decode/raw_distributed/cam_moe_distribute_combine.h +++ b/csrc/dispatch_gmm_combine_decode/op_kernel/dispatch_gmm_combine_decode/raw_distributed/cam_moe_distribute_combine.h @@ -264,7 +264,7 @@ __aicore__ inline void CamMoeDistributeCombine::Init( DataCacheCleanAndInvalid( selfDataStatusTensor[coreIdx_ * UB_ALIGN]); __asm__ __volatile__(""); - pipe_barrier(PIPE_ALL); + AscendC::PipeBarrier(); workspaceGM_ = workspaceGM; expandXGM_.SetGlobalBuffer((__gm__ ExpandXType *)expandX); @@ -480,13 +480,13 @@ __aicore__ inline void CamMoeDistributeCombine::ReduceScatt template __aicore__ inline void CamMoeDistributeCombine::SetWaitTpStatusAndDisPatch() { - pipe_barrier(PIPE_ALL); + AscendC::PipeBarrier(); if (startRankId_ >= epWorldSize_) { return; } if constexpr (IsNeedReduceScatter) { uint32_t tpToRankId = 1 - tpRankId_; - pipe_barrier(PIPE_ALL); + AscendC::PipeBarrier(); LocalTensor statusFlagUb = readStateBuf_.Get(); statusFlagUb(0) = sumTarget_; SyncFunc(); @@ -604,9 +604,9 @@ __aicore__ inline void CamMoeDistributeCombine::CustomAdd(L if constexpr (AscendC::IsSameType::value) { Cast(winTpSendCountFloatTensor_, src0, RoundMode::CAST_NONE, dataCnt); Cast(gmTpSendCountFloatTensor_, src1, RoundMode::CAST_NONE, dataCnt); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Add(winTpSendCountFloatTensor_, winTpSendCountFloatTensor_, gmTpSendCountFloatTensor_, dataCnt); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Cast(dst, winTpSendCountFloatTensor_, RoundMode::CAST_ROUND, dataCnt); } else { Add(dst, src0, src1, dataCnt); @@ -616,7 +616,7 @@ __aicore__ inline void CamMoeDistributeCombine::CustomAdd(L template __aicore__ inline void CamMoeDistributeCombine::SetStatus() { - pipe_barrier(PIPE_ALL); + AscendC::PipeBarrier(); if (startRankId_ >= epWorldSize_) { return; } diff --git a/csrc/dispatch_gmm_combine_decode/op_kernel/dispatch_gmm_combine_decode/raw_distributed/cam_moe_distribute_dispatch.h b/csrc/dispatch_gmm_combine_decode/op_kernel/dispatch_gmm_combine_decode/raw_distributed/cam_moe_distribute_dispatch.h index 1cc430bdeb5..20660075d02 100644 --- a/csrc/dispatch_gmm_combine_decode/op_kernel/dispatch_gmm_combine_decode/raw_distributed/cam_moe_distribute_dispatch.h +++ b/csrc/dispatch_gmm_combine_decode/op_kernel/dispatch_gmm_combine_decode/raw_distributed/cam_moe_distribute_dispatch.h @@ -253,7 +253,7 @@ __aicore__ inline void CamMoeDistributeDispatch::Init( DataCacheCleanAndInvalid( selfDataStatusTensor[aivId_ * UB_ALIGN]); __asm__ __volatile__(""); - pipe_barrier(PIPE_ALL); + AscendC::PipeBarrier(); axisBS_ = tilingData->disGmmDeqSwigluQuantGmmDeqComInfo.bs; axisH_ = tilingData->disGmmDeqSwigluQuantGmmDeqComInfo.h; epWorldSize_ = tilingData->disGmmDeqSwigluQuantGmmDeqComInfo.epRankSize; @@ -568,7 +568,7 @@ __aicore__ inline void CamMoeDistributeDispatch::Allto } tableLocalTensor_((tokenIndex / axisK_ + 1) * moeExpertRankNumAligned_ + expertId) = 1; } - pipe_barrier(PIPE_ALL); + AscendC::PipeBarrier(); uint32_t sendTokenNum = expertIdsCnt / moeUsedAivNum_; uint32_t remainderTokenNum = expertIdsCnt % moeUsedAivNum_; @@ -587,7 +587,7 @@ __aicore__ inline void CamMoeDistributeDispatch::Allto Add(tableInt16LocalTensor_[row * moeExpertRankNumInt16Aligned_], tableInt16LocalTensor_[row * moeExpertRankNumInt16Aligned_], tableInt16LocalTensor_[(row - 1) * moeExpertRankNumInt16Aligned_], moeExpertRankNumInt16Aligned_); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); } // row-i of tableLocalTensor_ is index of token @@ -655,7 +655,7 @@ __aicore__ inline void CamMoeDistributeDispatch::Allto template __aicore__ inline void CamMoeDistributeDispatch::SetStatus() { - pipe_barrier(PIPE_ALL); + AscendC::PipeBarrier(); SyncAll(); totalExpertNum_ = sharedExpertRankNum_ + moeExpertNum_; sendExpertNum_ = totalExpertNum_ / aivNum_; @@ -695,7 +695,7 @@ __aicore__ inline void CamMoeDistributeDispatch::Quant floatLocalTemp = receiveDataCastFloatBuf_.Get(); Cast(floatLocalTemp, xInTensor_, RoundMode::CAST_NONE, axisH_); xInQueue_.FreeTensor(xInTensor_); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); if constexpr (IsSmoothScaleExist) { if constexpr (DynamicQuant) { SyncFunc(); @@ -703,28 +703,28 @@ __aicore__ inline void CamMoeDistributeDispatch::Quant DataCopy(smoothScalesTensor_, scalesGMTensor_[expertIndex * axisH_], axisH_); SyncFunc(); Mul(floatLocalTemp, floatLocalTemp, smoothScalesTensor_, axisH_); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); } if constexpr (DynamicQuant) { LocalTensor floatLocalAbsTemp = smoothScalesBuf_.Get(); rowMaxTensor_ = rowMaxBuf_.Get(); Abs(floatLocalAbsTemp, floatLocalTemp, axisH_); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); ReduceMax(rowMaxTensor_, floatLocalAbsTemp, floatLocalAbsTemp, axisH_, false); SyncFunc(); dynamicScale = float(127.0) / rowMaxTensor_.GetValue(0); SyncFunc(); Muls(floatLocalTemp, floatLocalTemp, dynamicScale, axisH_); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); } LocalTensor halfLocalTemp = floatLocalTemp.ReinterpretCast(); LocalTensor int32LocalTemp = floatLocalTemp.ReinterpretCast(); Cast(int32LocalTemp, floatLocalTemp, RoundMode::CAST_RINT, axisH_); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); SetDeqScale((half)1.000000e+00f); PipeBarrier(); Cast(halfLocalTemp, int32LocalTemp, RoundMode::CAST_ROUND, axisH_); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Cast(xOutTensor_, halfLocalTemp, RoundMode::CAST_TRUNC, axisH_); floatLocalTemp = xOutTensor_.template ReinterpretCast(); floatLocalTemp.SetValue(axisH_ / sizeof(float), float(1.0) / dynamicScale); // int8->float32 @@ -742,10 +742,10 @@ __aicore__ inline void CamMoeDistributeDispatch::Local xQueue_.EnQue(xTmpTensor_); xTmpTensor_ = xQueue_.DeQue(); if constexpr (DynamicQuant || StaticQuant) { - pipe_barrier(PIPE_ALL); + AscendC::PipeBarrier(); xOutFp32Tensor_ = xTmpTensor_.template ReinterpretCast(); dynamicScalesTensor_.SetValue(dynamicScalesLocalIdx++, xOutFp32Tensor_.GetValue(axisH_ / sizeof(float))); - pipe_barrier(PIPE_ALL); + AscendC::PipeBarrier(); } if constexpr (IsNeedAllgater) { DataCopy(winTpGatherOutGMTensor_[tokenOffset * axisH_], xTmpTensor_, axisH_); @@ -791,7 +791,7 @@ __aicore__ inline void CamMoeDistributeDispatch::WaitD SyncFunc(); GatherMask(gatherMaskOutTensor, statusFp32Tensor_, gatherTmpTensor, true, mask, {1, (uint16_t)recStatusNumPerCore, 1, 0}, rsvdCnt); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Sum(statusSumOutTensor, gatherMaskOutTensor, sumParams); SyncFunc(); sumOfFlag = statusSumOutTensor.GetValue(0); @@ -929,11 +929,11 @@ __aicore__ inline void CamMoeDistributeDispatch::Local xQueue_.EnQue(xTmpTensor_); xTmpTensor_ = xQueue_.DeQue(); if constexpr (DynamicQuant || StaticQuant) { - pipe_barrier(PIPE_ALL); + AscendC::PipeBarrier(); xOutFp32Tensor_ = xTmpTensor_.template ReinterpretCast(); DataCopyPad(dynamicScalesOutGMTensor_[beginIdx + j], xOutFp32Tensor_[axisH_ / sizeof(float)], dataCopyParamsFloat); - pipe_barrier(PIPE_ALL); + AscendC::PipeBarrier(); } if constexpr (IsNeedAllgater) { DataCopy(winTpGatherOutGMTensor_[(beginIdx + j) * axisHCommu_], xTmpTensor_, axisHCommu_); @@ -963,7 +963,7 @@ __aicore__ inline void CamMoeDistributeDispatch::Local template __aicore__ inline void CamMoeDistributeDispatch::AllGatherSetStatusAndWait() { - pipe_barrier(PIPE_ALL); + AscendC::PipeBarrier(); if (startExpertId_ >= totalExpertNum_) { return; } diff --git a/csrc/kernels/bgmv_expand.cpp b/csrc/kernels/bgmv_expand.cpp index 8afcd92d21e..4f3a99fdf45 100644 --- a/csrc/kernels/bgmv_expand.cpp +++ b/csrc/kernels/bgmv_expand.cpp @@ -187,7 +187,7 @@ class BGMVExpand { } } else { Cast(xDup, xLocal, AscendC::RoundMode::CAST_NONE, maxLoRARank_); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); for (int32_t i = maxLoRARank_; i < NUM_ELEMENTS_PER_REPEAT; i += maxLoRARank_) { for (int32_t j = 0; j < maxLoRARank_; j++) { @@ -219,15 +219,15 @@ class BGMVExpand { AscendC::LocalTensor yInLocal = inQueueY_.DeQue(); AscendC::LocalTensor yInLocalFP32 = inBufferY_.Get(); Cast(yInLocalFP32, yInLocal, AscendC::RoundMode::CAST_NONE, numElements); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); inQueueY_.FreeTensor(yInLocal); Add(yLocal, yLocal, yInLocalFP32, numElements); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); AscendC::LocalTensor yOutLocal = outQueueY_.AllocTensor(); Cast(yOutLocal, yLocal, AscendC::RoundMode::CAST_RINT, numElements); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); outQueueY_.EnQue(yOutLocal); } @@ -243,40 +243,40 @@ class BGMVExpand { AscendC::LocalTensor wTmpTensor = tmpBufferW_.Get(); Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, MASK_COUNT, blockReduceRepeatCount, castParams_); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); inQueueW_.FreeTensor(wLocal); Mul(wTmpTensor, xDup, wTmpTensor, MASK_COUNT, blockReduceRepeatCount, dotProductParams_); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); if (maxLoRARank_ == LORA_RANK_8) { BlockReduceSum(yLocal[progress], wTmpTensor, blockReduceRepeatCount, MASK_COUNT, reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); } else if (maxLoRARank_ == LORA_RANK_16) { BlockReduceSum(wTmpTensor, wTmpTensor, blockReduceRepeatCount, MASK_COUNT, reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); PairReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat16, MASK_COUNT, reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); } else if (maxLoRARank_ == LORA_RANK_32) { BlockReduceSum(wTmpTensor, wTmpTensor, blockReduceRepeatCount, MASK_COUNT, reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); PairReduceSum(wTmpTensor, wTmpTensor, pairReduceRepeat16, MASK_COUNT, reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); PairReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat32, MASK_COUNT, reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); } else if (maxLoRARank_ == LORA_RANK_64) { BlockReduceSum(wTmpTensor, wTmpTensor, blockReduceRepeatCount, MASK_COUNT, reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); BlockReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat16, MASK_COUNT, reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); } } diff --git a/csrc/kernels/bgmv_shrink.cpp b/csrc/kernels/bgmv_shrink.cpp index 0dea13e8e88..1a62a1898e5 100644 --- a/csrc/kernels/bgmv_shrink.cpp +++ b/csrc/kernels/bgmv_shrink.cpp @@ -92,7 +92,7 @@ class BGMVShrink { AscendC::LocalTensor xTmpTensor = tmpBufferX_.Get(); AscendC::LocalTensor xLocal = inQueueX_.DeQue(); Cast(xTmpTensor, xLocal, AscendC::RoundMode::CAST_NONE, inputHiddenDim_); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); inQueueX_.FreeTensor(xLocal); } @@ -141,20 +141,20 @@ class BGMVShrink { AscendC::LocalTensor xLocal = inQueueX_.DeQue(); Cast(xTmpTensor, xLocal, AscendC::RoundMode::CAST_NONE, numElements); Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, numElements); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); inQueueX_.FreeTensor(xLocal); inQueueW_.FreeTensor(wLocal); } else { Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, numElements); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); inQueueW_.FreeTensor(wLocal); } // dot product of the one tile of X and W Mul(wTmpTensor, xTmpTensor, wTmpTensor, numElements); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); // reduce sum generate one number, which is the summation of all the dot product ReduceSum(wTmpTensor, wTmpTensor, wTmpTensor, numElements); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); acc += wTmpTensor.GetValue(0); } @@ -180,7 +180,7 @@ class BGMVShrink { AscendC::LocalTensor yOutLocal = outQueueY_.AllocTensor(); Muls(yOutLocal, yLocal, scale_, maxLoRARank_); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); outQueueY_.EnQue(yOutLocal); } diff --git a/csrc/kernels/sgmv_expand.cpp b/csrc/kernels/sgmv_expand.cpp index 65c32f96298..2ab05722976 100644 --- a/csrc/kernels/sgmv_expand.cpp +++ b/csrc/kernels/sgmv_expand.cpp @@ -198,7 +198,7 @@ class SGMVExpand { } } else { Cast(xDup, xLocal, AscendC::RoundMode::CAST_NONE, maxLoRARank_); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); for (int32_t i = maxLoRARank_; i < NUM_ELEMENTS_PER_REPEAT; i += maxLoRARank_) { for (int32_t j = 0; j < maxLoRARank_; j++) { @@ -230,15 +230,15 @@ class SGMVExpand { AscendC::LocalTensor yInLocal = inQueueY_.DeQue(); AscendC::LocalTensor yInLocalFP32 = inBufferY_.Get(); Cast(yInLocalFP32, yInLocal, AscendC::RoundMode::CAST_NONE, numElements); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); inQueueY_.FreeTensor(yInLocal); Add(yLocal, yLocal, yInLocalFP32, numElements); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); AscendC::LocalTensor yOutLocal = outQueueY_.AllocTensor(); Cast(yOutLocal, yLocal, AscendC::RoundMode::CAST_RINT, numElements); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); outQueueY_.EnQue(yOutLocal); } @@ -254,40 +254,40 @@ class SGMVExpand { AscendC::LocalTensor wTmpTensor = tmpBufferW_.Get(); Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, MASK_COUNT, blockReduceRepeatCount, castParams_); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); inQueueW_.FreeTensor(wLocal); Mul(wTmpTensor, xDup, wTmpTensor, MASK_COUNT, blockReduceRepeatCount, dotProductParams_); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); if (maxLoRARank_ == LORA_RANK_8) { BlockReduceSum(yLocal[progress], wTmpTensor, blockReduceRepeatCount, MASK_COUNT, reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); } else if (maxLoRARank_ == LORA_RANK_16) { BlockReduceSum(wTmpTensor, wTmpTensor, blockReduceRepeatCount, MASK_COUNT, reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); PairReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat16, MASK_COUNT, reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); } else if (maxLoRARank_ == LORA_RANK_32) { BlockReduceSum(wTmpTensor, wTmpTensor, blockReduceRepeatCount, MASK_COUNT, reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); PairReduceSum(wTmpTensor, wTmpTensor, pairReduceRepeat16, MASK_COUNT, reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); PairReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat32, MASK_COUNT, reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); } else if (maxLoRARank_ == LORA_RANK_64) { BlockReduceSum(wTmpTensor, wTmpTensor, blockReduceRepeatCount, MASK_COUNT, reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); BlockReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat16, MASK_COUNT, reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); } } diff --git a/csrc/kernels/sgmv_shrink.cpp b/csrc/kernels/sgmv_shrink.cpp index 49f21e0b9d4..3d241ce887a 100644 --- a/csrc/kernels/sgmv_shrink.cpp +++ b/csrc/kernels/sgmv_shrink.cpp @@ -94,7 +94,7 @@ class SGMVShrink { AscendC::LocalTensor xTmpTensor = tmpBufferX_.Get(); AscendC::LocalTensor xLocal = inQueueX_.DeQue(); Cast(xTmpTensor, xLocal, AscendC::RoundMode::CAST_NONE, inputHiddenDim_); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); inQueueX_.FreeTensor(xLocal); } @@ -153,20 +153,20 @@ class SGMVShrink { AscendC::LocalTensor xLocal = inQueueX_.DeQue(); Cast(xTmpTensor, xLocal, AscendC::RoundMode::CAST_NONE, numElements); Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, numElements); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); inQueueX_.FreeTensor(xLocal); inQueueW_.FreeTensor(wLocal); } else { Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, numElements); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); inQueueW_.FreeTensor(wLocal); } // dot product of the one tile of X and W Mul(wTmpTensor, xTmpTensor, wTmpTensor, numElements); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); // reduce sum generate one number, which is the summation of all the dot product ReduceSum(wTmpTensor, wTmpTensor, wTmpTensor, numElements); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); acc += wTmpTensor.GetValue(0); } @@ -192,7 +192,7 @@ class SGMVShrink { AscendC::LocalTensor yOutLocal = outQueueY_.AllocTensor(); Muls(yOutLocal, yLocal, scale_, maxLoRARank_); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); outQueueY_.EnQue(yOutLocal); } diff --git a/csrc/notify_dispatch/op_kernel/notify_dispatch.h b/csrc/notify_dispatch/op_kernel/notify_dispatch.h index 1952a6c304a..4f3c5ca1338 100644 --- a/csrc/notify_dispatch/op_kernel/notify_dispatch.h +++ b/csrc/notify_dispatch/op_kernel/notify_dispatch.h @@ -159,7 +159,7 @@ class NotifyDispatch { for (int i = copyOffset; i < copyOffset + copyLen; ++i) { CpUB2GM((__gm__ int64_t *)(shareAddrs[i]) + rank * FLAG_UNIT_INT_NUM, inputUB, sizeof(int64_t)); } - pipe_barrier(PIPE_ALL); + AscendC::PipeBarrier(); } } diff --git a/csrc/sparse_flash_attention/op_kernel/sparse_flash_attention_service_vector_mla.h b/csrc/sparse_flash_attention/op_kernel/sparse_flash_attention_service_vector_mla.h index 79b49e0a513..d0204d97f10 100644 --- a/csrc/sparse_flash_attention/op_kernel/sparse_flash_attention_service_vector_mla.h +++ b/csrc/sparse_flash_attention/op_kernel/sparse_flash_attention_service_vector_mla.h @@ -458,7 +458,7 @@ __aicore__ inline void SFAVectorService::SoftmaxFlashV2Compute( } else { uint32_t dealRowCountAlign = SFAAlign(dealRowCount, FP32_BLOCK_ELEMENT_NUM); DataCopy(softmaxSumUb[softmaxOutOffset], inSumTensor, dealRowCountAlign); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); DataCopy(softmaxMaxUb[softmaxOutOffset], inMaxTensor, dealRowCountAlign); } } @@ -477,9 +477,9 @@ __aicore__ inline void SFAVectorService::AmlaVecCompute( LocalTensor nUpdateTmp = nTmp[SOFTMAX_TMP_BUFFER_OFFSET / sizeof(T)]; Muls(nTmp, softmaxMaxUb[softmaxOutOffset], ((T)(-1.0)) * RECIP_OF_LN2, calCount); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Cast(nTmp, nTmp, RoundMode::CAST_ROUND, calCount); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); uint32_t prOutIdx = (info.loop - 1) % (constInfo.preLoadNum); uint32_t PreSoftmaxOutOffset = prOutIdx * SOFTMAX_TMP_BUFFER_OFFSET / sizeof(T) + baseOffset; @@ -489,10 +489,10 @@ __aicore__ inline void SFAVectorService::AmlaVecCompute( } else { Sub(nUpdateTmp, nTmp, nValueUb[PreSoftmaxOutOffset], calCount); } - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); // update n(i), DataCopy not support when calCount is not align 32B, so use Adds Adds(nValueUb[softmaxOutOffset], nTmp, ConstInfo::FLOAT_ZERO, calCount); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); // update softmax res LocalTensor nUpdateTmp2 = nTmp[2 * SOFTMAX_TMP_BUFFER_OFFSET / sizeof(T)]; @@ -500,17 +500,17 @@ __aicore__ inline void SFAVectorService::AmlaVecCompute( LocalTensor tmpCofUb = nTmp[4 * SOFTMAX_TMP_BUFFER_OFFSET / sizeof(T)]; LocalTensor epsUb = nTmp[5 * SOFTMAX_TMP_BUFFER_OFFSET / sizeof(T)]; Muls(nUpdateTmp2, softmaxMaxUb[softmaxOutOffset], RECIP_OF_LN2, calCount); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Add(nTmp, nUpdateTmp2, nTmp, calCount); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Muls(nTmp, nTmp, LN2, calCount); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Exp(nTmp, nTmp, calCount); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Cast(nTmp_KvT, nTmp, RoundMode::CAST_ROUND, calCount); // fp32->fp16/bf16 - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Cast(nUpdateTmp2, nTmp_KvT, RoundMode::CAST_NONE, calCount); // fp16/bf16->fp32 - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); if (info.s2Idx + 1 == info.curSInnerLoopTimes) { Mul(aMlaSumUb[softmaxOutOffset], softmaxSumUb[softmaxOutOffset], nUpdateTmp2, calCount); } @@ -521,33 +521,33 @@ __aicore__ inline void SFAVectorService::AmlaVecCompute( } LocalTensor nTmp3 = nTmp[6 * SOFTMAX_TMP_BUFFER_OFFSET / sizeof(T)]; Brcb(nTmp3, nUpdateTmp2, (dealRowCount + 7) / 8, {1, 8}); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); RowMuls(mmResUb, mmResUb, nTmp3, dealRowCount, columnCount, actualColumnCount); Div(tmpCofUb, nTmp, nUpdateTmp2, calCount); // cof(i)=tmpS32/tmpS16 if (info.isFirstSInnerLoop) { Duplicate(cofValueUb[softmaxOutOffset], (T)1.0, calCount); // cof_0=1 - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Div(epsUb, cofValueUb[softmaxOutOffset], tmpCofUb, calCount); // 1 / cof(i) } else { - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Div(epsUb, cofValueUb[PreSoftmaxOutOffset], tmpCofUb, calCount); // cof(i - 1) / cof(i) } - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Adds(cofValueUb[softmaxOutOffset], tmpCofUb, ConstInfo::FLOAT_ZERO, calCount); // store cof(i) Adds(epsUb, epsUb, (T)(-1.0), calCount); // cof(i - 1) / cof(i) - 1 - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Muls(epsUb, epsUb, (T)1.5, calCount); // (cof(i - 1) - cof(i)) / cof(i) * 1.5 Maxs(nUpdateTmp, nUpdateTmp, (T)(-30.0), calCount); // N = max(n(i) - n(i-1), -30) - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Adds(epsUb, epsUb, (T)(0.000001), calCount); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Add(nUpdateTmp, nUpdateTmp, epsUb, calCount); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Muls(nUpdateTmp, nUpdateTmp, FLOAT_E_SCALAR, calCount); // N = N * pow(2, 23) - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); // nUpdate int32 out LocalTensor tmQue = outputBuff2.Get(); @@ -555,7 +555,7 @@ __aicore__ inline void SFAVectorService::AmlaVecCompute( LocalTensor nInt32Out = tmQue[startRow]; Cast(nInt32Out, nUpdateTmp, RoundMode::CAST_ROUND, dealRowCount); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); SetFlag(SYNC_OUTPUT_BUF2_FLAG); } @@ -583,18 +583,18 @@ __aicore__ inline void SFAVectorService::DealBmm1ResBaseBlock( ElewiseCompute(info, mmResUb, dealRowCount, columnCount); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); LocalTensor tmpAFloorUb = tmpBuff1.Get(); LocalTensor softmaxTmpUb = tmpAFloorUb.template ReinterpretCast(); SoftmaxFlashV2Compute(info, mSplitInfo, mmResUb, softmaxTmpUb, startRow, dealRowCount, columnCount, info.actualSingleProcessSInnerSize); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); AmlaVecCompute(info, mSplitInfo, mmResUb, softmaxTmpUb, startRow, dealRowCount, columnCount, info.actualSingleProcessSInnerSize); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); LocalTensor tmpMMResCastTensor = outputBuff1.Get(); WaitFlag(SYNC_OUTPUT_BUF1_FLAG); @@ -1197,20 +1197,20 @@ SFAVectorService::DealBmm2ResBaseBlock(const RunInfo &info, const MSplitIn bmm2ResUb.SetSize(vec2ComputeSize); LocalTensor absBmm2ResUb = bmm2ResUb.template ReinterpretCast(); Abs(absBmm2ResUb, tmpBmm2ResUb, vec2ComputeSize); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); LocalTensor cmpMaskUb = absBmm2ResUb.template ReinterpretCast(); CompareScalar(cmpMaskUb, absBmm2ResUb, (T)1e10, CMPMODE::LE, vec2ComputeSize); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Select(tmpBmm2ResUb, cmpMaskUb, tmpBmm2ResUb, ConstInfo::FLOAT_ZERO, SELMODE::VSEL_TENSOR_SCALAR_MODE, vec2ComputeSize); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); uint32_t baseOffset = mSplitInfo.nBufferStartM / 2 + startRow; uint32_t idx = info.loop % (constInfo.preLoadNum); LocalTensor tmpSumUb = v0ValidSizeBuff.Get()[384]; Brcb(tmpSumUb, aMlaSumUb[idx * SOFTMAX_TMP_BUFFER_OFFSET / sizeof(T) + baseOffset], (dealRowCount + 7) / 8, {1, 8}); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); RowDivs(bmm2ResUb, tmpBmm2ResUb, tmpSumUb, dealRowCount, columnCount, actualColumnCount); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); SetFlag(SYNC_INPUT_BUF1_FLAG + pingpongFlag); Bmm2ResCopyOut(info, bmm2ResUb, mStart, dealRowCount, columnCount, actualColumnCount); } diff --git a/csrc/utils/inc/kernel/pse.h b/csrc/utils/inc/kernel/pse.h index e6cd8e7b1f0..582ad3e536c 100644 --- a/csrc/utils/inc/kernel/pse.h +++ b/csrc/utils/inc/kernel/pse.h @@ -336,7 +336,7 @@ __aicore__ inline void PseSlopeCopyIn(LocalTensor &dstTensor, LocalTensor(); int64_t s1Offset = pseInfo.s1oIdx * pseInfo.s1BaseSize + pseInfo.vecCoreOffset + pseInfo.loopIdx * pseInfo.vec1S1BaseSize; @@ -345,16 +345,16 @@ __aicore__ inline void PseSlopeCopyIn(LocalTensor &dstTensor, LocalTensor(); Abs(dstTensor, dstTensor, computeSize); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); float slopes = ((__gm__ T *)pseSlope)[offset] * -1; if (pseInfo.pseType == (uint32_t)PseTypeEnum::PSE_INNER_MUL_ADD_SQRT_TYPE) { Sqrt(dstTensor, dstTensor, computeSize); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); } Muls(dstTensor, dstTensor, slopes, computeSize); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); } } } @@ -373,7 +373,7 @@ __aicore__ inline void PseSlopeCast(LocalTensor &dstTensor, LocalTensor int64_t offset = bOffset + n2Offset + gOffset; int64_t computeSize = pseInfo.vec1S1RealSize * pseInfo.s2AlignedSize; Cast(dstTensor, helpTensor, RoundMode::CAST_NONE, computeSize); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); int64_t s1Offset = pseInfo.s1oIdx * pseInfo.s1BaseSize + pseInfo.vecCoreOffset + pseInfo.loopIdx * pseInfo.vec1S1BaseSize; @@ -382,16 +382,16 @@ __aicore__ inline void PseSlopeCast(LocalTensor &dstTensor, LocalTensor float posShift = float(s2Offset + pseInfo.kvStartIdx - s1Offset - pseInfo.qStartIdx); Adds(dstTensor, dstTensor, posShift, computeSize); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); Abs(dstTensor, dstTensor, computeSize); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); float slopes = ((__gm__ T *)pseSlope)[offset] * -1; if (pseInfo.pseType == (uint32_t)PseTypeEnum::PSE_INNER_MUL_ADD_SQRT_TYPE) { Sqrt(dstTensor, dstTensor, computeSize); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); } Muls(dstTensor, dstTensor, slopes, computeSize); - pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); } } diff --git a/setup.py b/setup.py index d7a8f68cc34..461f76ec5cf 100644 --- a/setup.py +++ b/setup.py @@ -97,8 +97,10 @@ def get_chip_type() -> str: # A3 case assert npu_name return (chip_name + "_" + npu_name).lower() + elif "950" in chip_name: + assert npu_name + return (chip_name + "_" + npu_name).lower() else: - # TODO(zzzzwwjj): Currently, A5's chip name has not determined yet. raise ValueError(f"Unable to recognize chip name: {chip_name}, please manually set env SOC_VERSION") except subprocess.CalledProcessError as e: raise RuntimeError(f"Get chip info failed: {e}") @@ -153,11 +155,14 @@ def gen_build_info(): "ascend310p3vir02": "_310P", "ascend310p3vir04": "_310P", "ascend310p3vir08": "_310P", - "ascend910_9579": "A5", } - - assert soc_version in soc_to_device, f"Undefined soc_version: {soc_version}. Please file an issue to vllm-ascend." - device_type = soc_to_device[soc_version] + if "ascend950" in soc_version: + device_type = "A5" + else: + assert soc_version in soc_to_device, ( + f"Undefined soc_version: {soc_version}. Please file an issue to vllm-ascend." + ) + device_type = soc_to_device[soc_version] package_dir = os.path.join(ROOT_DIR, "vllm_ascend", "_build_info.py") with open(package_dir, "w+") as f: diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index b7d1bae14ff..b3568c6ca5d 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -267,7 +267,10 @@ def enable_custom_op(): # There are some customed operators which aren't implemented # with batch invariant in vllm-ascend, we need to disable them. - if vllm_is_batch_invariant(): + # FIXME(linfeng): Currently custom op compilation and execution are partially available + # in ASCEND950 chip, we temporarily disable all custom ops. Please refer to + # https://github.com/vllm-project/vllm-ascend/issues/7157 for latest update about custom op. + if vllm_is_batch_invariant() or get_ascend_device_type() == AscendDeviceType.A5: _CUSTOM_OP_ENABLED = False return _CUSTOM_OP_ENABLED