vllm-project · MengqingCao · Mar 12, 2026 · Mar 10, 2026 · Mar 10, 2026 · Mar 10, 2026
@@ -61,7 +61,7 @@ set(VLLM_ASCEND_CUSTOM_OP
     ${CMAKE_CURRENT_SOURCE_DIR}/csrc/batch_matmul_transpose/op_kernel/batch_matmul_transpose_kernel.cpp
 )
 
-set(VLLM_ASCEND_CUSTOM_OP_EXCLUDE
+set(VLLM_ASCEND_CUSTOM_OP_EXCLUDE_310P
     ${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernels/bgmv_expand.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernels/bgmv_shrink.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernels/sgmv_expand.cpp
@@ -70,10 +70,21 @@ set(VLLM_ASCEND_CUSTOM_OP_EXCLUDE
     ${CMAKE_CURRENT_SOURCE_DIR}/csrc/batch_matmul_transpose/op_kernel/batch_matmul_transpose_kernel.cpp
 )
 
+set(VLLM_ASCEND_CUSTOM_OP_EXCLUDE_ASCEND950  # kernel sources removed from VLLM_ASCEND_CUSTOM_OP when SOC_VERSION matches "ascend950"
+    ${CMAKE_CURRENT_SOURCE_DIR}/csrc/mla_preprocess/op_kernel/mla_preprocess_kernel.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/csrc/batch_matmul_transpose/op_kernel/batch_matmul_transpose_kernel.cpp
+)
+
 if(SOC_VERSION MATCHES "ascend310p.*")
     message(STATUS "310P hardware detected: disabling MLAPO operators")
     message(STATUS "310P hardware detected: excluding batch_matmul_transpose operators")
-    list(REMOVE_ITEM VLLM_ASCEND_CUSTOM_OP ${VLLM_ASCEND_CUSTOM_OP_EXCLUDE})
+    list(REMOVE_ITEM VLLM_ASCEND_CUSTOM_OP ${VLLM_ASCEND_CUSTOM_OP_EXCLUDE_310P})  # strip every source listed in the 310P exclusion list from the kernel sources
+endif()
+
+if(SOC_VERSION MATCHES "ascend950")  # unanchored regex: true whenever SOC_VERSION contains "ascend950"
+    message(STATUS "A5 hardware detected: disabling MLAPO operators")
+    message(STATUS "A5 hardware detected: excluding batch_matmul_transpose operators")
+    list(REMOVE_ITEM VLLM_ASCEND_CUSTOM_OP ${VLLM_ASCEND_CUSTOM_OP_EXCLUDE_ASCEND950})  # drop the mla_preprocess and batch_matmul_transpose kernel sources
 endif()
 
 ascendc_library(vllm_ascend_kernels SHARED

@@ -99,9 +99,9 @@ __aicore__ inline void MoeV2FullLoadDynamicQuant<T>::SortCompute() {
   LocalTensor<int32_t> expertIdxLocal = inLocal[0];
   LocalTensor<float> expertIdxLocalFp32 = expertIdxLocal.ReinterpretCast<float>();
   Cast(expertIdxLocalFp32, expertIdxLocal, RoundMode::CAST_ROUND, this->totalLength);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
   Muls(expertIdxLocalFp32, expertIdxLocalFp32, (float)-1, this->totalLength);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
   int64_t duplicateNum = this->totalLength % ONE_REPEAT_SORT_NUM;
   if (duplicateNum > 0) {
     int duplicateIndex = this->totalLength - duplicateNum;
@@ -110,53 +110,53 @@ __aicore__ inline void MoeV2FullLoadDynamicQuant<T>::SortCompute() {
     mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
     uint64_t mask[2] = {mask0, 0};
     Duplicate(expertIdxLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
   }
   LocalTensor<float> concatLocal;
   LocalTensor<float> tempTensor = tempBuffer.Get<float>(GetSortLen<float>(this->sortNum_));
   Concat(concatLocal, expertIdxLocalFp32, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
   LocalTensor<uint32_t> rowIdxLocal = inLocal[this->sortNum_].template ReinterpretCast<uint32_t>();
   LocalTensor<float> sortedLocal = sortedBuffer.Get<float>(GetSortLen<float>(this->sortNum_));
   Sort<float, true>(sortedLocal, concatLocal, rowIdxLocal, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
   LocalTensor<float> expandedExpertIdxLocal = expandedExpertIdxCopyOutQueue_.AllocTensor<float>();
   expandDstToSrcRowLocal = expandDstToSrcRowQueue_.AllocTensor<uint32_t>();
   LocalTensor<float> expandDstToSrcRowLocalFp32 = expandDstToSrcRowLocal.ReinterpretCast<float>();
   Extract(expandedExpertIdxLocal, expandDstToSrcRowLocal, sortedLocal, this->sortNum_ / ONE_REPEAT_SORT_NUM);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
   Cast(expandDstToSrcRowLocalFp32, expandDstToSrcRowLocal.ReinterpretCast<int32_t>(), RoundMode::CAST_ROUND,
        this->totalLength);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
   Muls(expandedExpertIdxLocal, expandedExpertIdxLocal, (float)-1, this->totalLength);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
   LocalTensor<int32_t> expandedExpertIdxLocalInt32;
   expandedExpertIdxLocalInt32 = expandedExpertIdxLocal.ReinterpretCast<int32_t>();
   Cast(expandedExpertIdxLocalInt32, expandedExpertIdxLocal, RoundMode::CAST_ROUND, this->totalLength);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
   expandedExpertIdxCopyOutQueue_.EnQue<int32_t>(expandedExpertIdxLocalInt32);
 
   LocalTensor<uint32_t> expandedRowIdx = expandedRowIdxCopyOutQueue_.AllocTensor<uint32_t>();
   LocalTensor<uint32_t> expandedRowIdxU32 = expandedRowIdx.ReinterpretCast<uint32_t>();
   Muls(expandDstToSrcRowLocalFp32, expandDstToSrcRowLocalFp32, (float)-1, this->totalLength);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
   ArithProgression<int32_t>(inLocal[this->sortNum_], 0, 1, this->totalLength);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
   if (duplicateNum > 0) {
     int duplicateIndex = this->totalLength - duplicateNum;
     uint64_t mask0 = UINT64_MAX;
     mask0 = mask0 << duplicateNum;
     mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
     uint64_t mask[2] = {mask0, 0};
     Duplicate(expandDstToSrcRowLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
   }
   Concat(concatLocal, expandDstToSrcRowLocalFp32, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
   Sort<float, true>(sortedLocal, concatLocal, rowIdxLocal, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
   Extract(tempTensor, expandedRowIdxU32, sortedLocal, this->sortNum_ / ONE_REPEAT_SORT_NUM);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
   expandedRowIdxCopyOutQueue_.EnQue<uint32_t>(expandedRowIdx);
   sortDataCopyInQueue.FreeTensor(inLocal);
 }
@@ -227,31 +227,31 @@ __aicore__ inline void MoeV2FullLoadDynamicQuant<T>::Compute(LocalTensor<float>&
 
   if constexpr (!IsSameType<T, float>::value) {
     Cast(inLocal, inLocal.ReinterpretCast<T>()[colsAlign], RoundMode::CAST_NONE, this->cols_);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
   }
 
   if (smoothType != 0) {
     Mul(inLocal, inLocal, smoothLocal, this->cols_);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
   }
 
   Abs(tempLocal, inLocal, this->cols_);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
 
   ReduceMax(dynamicQuantLocal, tempLocal, tempLocal, this->cols_);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
 
   float maxValue = dynamicQuantLocal.GetValue(0) / 127.0f;
 
   Duplicate<float>(dynamicQuantLocal, maxValue, 8);
   Duplicate<float>(tempLocal, maxValue, this->cols_);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
 
   Div(tempLocal, inLocal, tempLocal, this->cols_);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
 
   Cast(tempLocal.ReinterpretCast<half>(), tempLocal, RoundMode::CAST_TRUNC, this->cols_);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
 
   Cast(outLocal, tempLocal.ReinterpretCast<half>(), RoundMode::CAST_ROUND, this->cols_);
 

@@ -56,34 +56,34 @@ __aicore__ inline void MoeV2FullLoadQuant<T>::Compute(int64_t xLocalLength) {
   uint32_t elements = Align(this->cols, sizeof(int8_t)) * xLocalLength;
   if constexpr (IsSameType<T, bfloat16_t>::value) {
     Cast(floatLocal, inLocal, RoundMode::CAST_NONE, elements);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
     Cast(halfLocal, floatLocal, RoundMode::CAST_NONE, elements);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
     Muls(halfLocal, halfLocal, static_cast<half>(this->scale), elements);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
     Adds(halfLocal, halfLocal, static_cast<half>(this->offset), elements);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
     LocalTensor<int32_t> intLocal = floatLocal.ReinterpretCast<int32_t>();
     Cast(intLocal, halfLocal, RoundMode::CAST_RINT, elements);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
     SetDeqScale((half)1.000000e+00f);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
     Cast(halfLocal, intLocal, RoundMode::CAST_RINT, elements);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
     Cast(outLocal, halfLocal, RoundMode::CAST_RINT, elements);
   } else if constexpr (IsSameType<T, float>::value) {
     Cast(halfLocal, inLocal, RoundMode::CAST_NONE, elements);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
     Muls(halfLocal, halfLocal, static_cast<half>(this->scale), elements);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
     Adds(halfLocal, halfLocal, static_cast<half>(this->offset), elements);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
     Cast(outLocal, halfLocal, RoundMode::CAST_RINT, elements);
   } else {
     Muls(inLocal, inLocal, static_cast<T>(this->scale), elements);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
     Adds(inLocal, inLocal, static_cast<T>(this->offset), elements);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
     Cast(outLocal, inLocal, RoundMode::CAST_RINT, elements);
   }
   inputXCopyOutQueue.EnQue(outLocal);

@@ -94,9 +94,9 @@ __aicore__ inline void MoeV2FullLoadQuantBase::SortCompute() {
   LocalTensor<int32_t> expertIdxLocal = inLocal[0];
   LocalTensor<float> expertIdxLocalFp32 = expertIdxLocal.ReinterpretCast<float>();
   Cast(expertIdxLocalFp32, expertIdxLocal, RoundMode::CAST_ROUND, this->totalLength);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
   Muls(expertIdxLocalFp32, expertIdxLocalFp32, (float)-1, this->totalLength);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
   int64_t duplicateNum = this->totalLength % ONE_REPEAT_SORT_NUM;
   if (duplicateNum > 0) {
     int duplicateIndex = this->totalLength - duplicateNum;
@@ -105,53 +105,53 @@ __aicore__ inline void MoeV2FullLoadQuantBase::SortCompute() {
     mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
     uint64_t mask[2] = {mask0, 0};
     Duplicate(expertIdxLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
   }
   LocalTensor<float> concatLocal;
   LocalTensor<float> tempTensor = tempBuffer.Get<float>(GetSortLen<float>(this->sortNum));
   Concat(concatLocal, expertIdxLocalFp32, tempTensor, this->sortNum / ONE_REPEAT_SORT_NUM);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
   LocalTensor<uint32_t> rowIdxLocal = inLocal[this->sortNum].template ReinterpretCast<uint32_t>();
   LocalTensor<float> sortedLocal = sortedBuffer.Get<float>(GetSortLen<float>(this->sortNum));
   Sort<float, true>(sortedLocal, concatLocal, rowIdxLocal, tempTensor, this->sortNum / ONE_REPEAT_SORT_NUM);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
   LocalTensor<float> expandedExpertIdxLocal = expandedExpertIdxCopyOutQueue.AllocTensor<float>();
   LocalTensor<uint32_t> expandDstToSrcRowLocal = expandDstToSrcRowQueue.AllocTensor<uint32_t>();
   LocalTensor<float> expandDstToSrcRowLocalFp32 = expandDstToSrcRowLocal.ReinterpretCast<float>();
   Extract(expandedExpertIdxLocal, expandDstToSrcRowLocal, sortedLocal, this->sortNum / ONE_REPEAT_SORT_NUM);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
   Cast(expandDstToSrcRowLocalFp32, expandDstToSrcRowLocal.ReinterpretCast<int32_t>(), RoundMode::CAST_ROUND,
        this->totalLength);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
   Muls(expandedExpertIdxLocal, expandedExpertIdxLocal, (float)-1, this->totalLength);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
   LocalTensor<int32_t> expandedExpertIdxLocalInt32;
   expandedExpertIdxLocalInt32 = expandedExpertIdxLocal.ReinterpretCast<int32_t>();
   Cast(expandedExpertIdxLocalInt32, expandedExpertIdxLocal, RoundMode::CAST_ROUND, this->totalLength);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
   expandedExpertIdxCopyOutQueue.EnQue<int32_t>(expandedExpertIdxLocalInt32);
 
   LocalTensor<uint32_t> expandedRowIdx = expandedRowIdxCopyOutQueue.AllocTensor<uint32_t>();
   LocalTensor<uint32_t> expandedRowIdxU32 = expandedRowIdx.ReinterpretCast<uint32_t>();
   Muls(expandDstToSrcRowLocalFp32, expandDstToSrcRowLocalFp32, (float)-1, this->totalLength);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
   ArithProgression<int32_t>(inLocal[this->sortNum], 0, 1, this->totalLength);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
   if (duplicateNum > 0) {
     int duplicateIndex = this->totalLength - duplicateNum;
     uint64_t mask0 = UINT64_MAX;
     mask0 = mask0 << duplicateNum;
     mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
     uint64_t mask[2] = {mask0, 0};
     Duplicate(expandDstToSrcRowLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
   }
   Concat(concatLocal, expandDstToSrcRowLocalFp32, tempTensor, this->sortNum / ONE_REPEAT_SORT_NUM);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
   Sort<float, true>(sortedLocal, concatLocal, rowIdxLocal, tempTensor, this->sortNum / ONE_REPEAT_SORT_NUM);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
   Extract(tempTensor, expandedRowIdxU32, sortedLocal, this->sortNum / ONE_REPEAT_SORT_NUM);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
   expandedRowIdxCopyOutQueue.EnQue<uint32_t>(expandedRowIdx);
   sortDataCopyInQueue.FreeTensor(inLocal);
 

@@ -122,31 +122,31 @@ __aicore__ inline void MoeV2GatherDynamicQuant<T>::Compute(LocalTensor<float>& s
 
   if constexpr (!IsSameType<T, float>::value) {
     Cast(inLocal, inLocal.ReinterpretCast<T>()[perLoopColsAlign], RoundMode::CAST_NONE, this->cols);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
   }
 
   if (smoothType != 0) {
     Mul(inLocal, inLocal, smoothLocal, this->cols);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
   }
 
   Abs(tempLocal, inLocal, this->cols);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
 
   ReduceMax(dynamicQuantLocal, tempLocal, tempLocal, this->cols);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
 
   float maxValue = dynamicQuantLocal.GetValue(0) / 127.0f;
 
   Duplicate<float>(dynamicQuantLocal, maxValue, 8);
   Duplicate<float>(tempLocal, maxValue, this->cols);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
 
   Div(tempLocal, inLocal, tempLocal, this->cols);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
 
   Cast(tempLocal.ReinterpretCast<half>(), tempLocal, RoundMode::CAST_TRUNC, this->cols);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
 
   Cast(outLocal, tempLocal.ReinterpretCast<half>(), RoundMode::CAST_ROUND, this->cols);
 
@@ -285,16 +285,16 @@ __aicore__ inline float MoeV2GatherDynamicQuant<T>::ComputeMax(LocalTensor<float
 
   if constexpr (!IsSameType<T, float>::value) {
     Cast(inLocal, inLocal.ReinterpretCast<T>()[perLoopColsAlign], RoundMode::CAST_NONE, colsTileLength);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
   }
 
   if (smoothType != 0) {
     Mul(inLocal, inLocal, smoothLocal, colsTileLength);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
   }
 
   Abs(tempLocal, inLocal, colsTileLength);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
 
   ReduceMax(dynamicQuantLocal[8], tempLocal, tempLocal, colsTileLength);
 
@@ -319,13 +319,13 @@ __aicore__ inline void MoeV2GatherDynamicQuant<T>::ComputeScale(LocalTensor<floa
   inLocal = inputXInQueue.DeQue<float>();
 
   Duplicate<float>(tempLocal, scaleTemp, colsTileLength);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
 
   Div(tempLocal, inLocal, tempLocal, colsTileLength);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
 
   Cast(tempLocal.ReinterpretCast<half>(), tempLocal, RoundMode::CAST_TRUNC, colsTileLength);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
 
   Cast(outLocal, tempLocal.ReinterpretCast<half>(), RoundMode::CAST_ROUND, colsTileLength);
 

@@ -95,34 +95,34 @@ __aicore__ inline void MoeV2GatherQuant<T>::Compute() {
   uint32_t elements = Align(this->colsTileLength, sizeof(T));
   if constexpr (IsSameType<T, bfloat16_t>::value) {
     Cast(floatLocal, inLocal, RoundMode::CAST_NONE, elements);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
     Cast(halfLocal, floatLocal, RoundMode::CAST_NONE, elements);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
     Muls(halfLocal, halfLocal, static_cast<half>(this->scale), elements);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
     Adds(halfLocal, halfLocal, static_cast<half>(this->offset), elements);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
     LocalTensor<int32_t> intLocal = floatLocal.ReinterpretCast<int32_t>();
     Cast(intLocal, halfLocal, RoundMode::CAST_RINT, elements);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
     SetDeqScale((half)1.000000e+00f);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
     Cast(halfLocal, intLocal, RoundMode::CAST_RINT, elements);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
     Cast(outLocal, halfLocal, RoundMode::CAST_RINT, elements);
   } else if constexpr (IsSameType<T, float>::value) {
     Cast(halfLocal, inLocal, RoundMode::CAST_NONE, elements);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
     Muls(halfLocal, halfLocal, static_cast<half>(this->scale), elements);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
     Adds(halfLocal, halfLocal, static_cast<half>(this->offset), elements);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
     Cast(outLocal, halfLocal, RoundMode::CAST_RINT, elements);
   } else {
     Muls(inLocal, inLocal, static_cast<T>(this->scale), elements);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
     Adds(inLocal, inLocal, static_cast<T>(this->offset), elements);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
     Cast(outLocal, inLocal, RoundMode::CAST_RINT, elements);
   }
   inputXCopyOutQueue.EnQue(outLocal);