Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
383 changes: 383 additions & 0 deletions csrc/kernels/sgmv_expand.cpp

Large diffs are not rendered by default.

267 changes: 267 additions & 0 deletions csrc/kernels/sgmv_shrink.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,267 @@
/*
* Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "kernel_operator.h"
#include "types.h"

template <typename scalar_t>
class SGMVShrink {
public:
using X_T = scalar_t;
using W_T = scalar_t;
using Y_T = float;

static constexpr uint64_t BUFFER_NUM = 1;
static constexpr uint64_t TILE_LENGTH = 11776; // optimal performance tile length

public:
__aicore__ inline SGMVShrink(AscendC::TPipe *pipe) : pipe_(pipe) {}
__aicore__ inline void Init(__gm__ void *x, __gm__ void *weight, __gm__ void *loraIndices, __gm__ void *seqLen,
__gm__ void *y, uint32_t batchSize, uint32_t numTokensPerCore, uint32_t inputHiddenDim,
uint32_t maxLoRARank, float scale)
{
batchSize_ = batchSize;
numTokensPerCore_ = numTokensPerCore;
inputHiddenDim_ = inputHiddenDim;
maxLoRARank_ = maxLoRARank;
scale_ = scale;
singleLoRAWeightLen_ = inputHiddenDim_ * maxLoRARank_;
incremental_ = inputHiddenDim_ > TILE_LENGTH;

xGm_.SetGlobalBuffer((__gm__ X_T *)x);
yOutGm_.SetGlobalBuffer((__gm__ Y_T *)y);
wGm_.SetGlobalBuffer((__gm__ W_T *)weight);
loraIndicesGm_.SetGlobalBuffer((__gm__ int64_t *)loraIndices);
seqLenGm_.SetGlobalBuffer((__gm__ int64_t *)seqLen);

pipe_->InitBuffer(inQueueX_, BUFFER_NUM, TILE_LENGTH * sizeof(X_T));
pipe_->InitBuffer(inQueueW_, BUFFER_NUM, TILE_LENGTH * sizeof(W_T));
pipe_->InitBuffer(tmpBufferX_, TILE_LENGTH * sizeof(float));
pipe_->InitBuffer(tmpBufferW_, TILE_LENGTH * sizeof(float));

pipe_->InitBuffer(outQueueY_, 1, maxLoRARank_ * sizeof(Y_T));
pipe_->InitBuffer(outBufferY_, maxLoRARank_ * sizeof(float));
}

__aicore__ inline void Process()
{
int64_t blockIdx = AscendC::GetBlockIdx();
int64_t startIdx = blockIdx * numTokensPerCore_;
int64_t endIdx = startIdx + numTokensPerCore_;
if (endIdx > batchSize_) {
endIdx = batchSize_;
}
for (int64_t idx = startIdx; idx < endIdx; idx++) {
// set up LoRA index
CopyInIndex(idx);
if (reqLoRAIndex_ < 0) {
continue;
}
reqLoRAWeightOffset_ = reqLoRAIndex_ * singleLoRAWeightLen_;

if (incremental_) {
ProcessImpl<true>(idx);
} else {
ProcessImpl<false>(idx);
}

ScaleOutput();
CopyOut(idx);
}
}

private:
template <bool INCREMENTAL_MODE>
__aicore__ inline void ProcessImpl(const int64_t idx)
{
AscendC::LocalTensor<float> yOutLocal = outBufferY_.Get<float>();
if constexpr (!INCREMENTAL_MODE) {
CopyInX(idx, 0, inputHiddenDim_);
AscendC::LocalTensor<float> xTmpTensor = tmpBufferX_.Get<float>();
AscendC::LocalTensor<X_T> xLocal = inQueueX_.DeQue<X_T>();
Cast(xTmpTensor, xLocal, AscendC::RoundMode::CAST_NONE, inputHiddenDim_);
pipe_barrier(PIPE_V);
inQueueX_.FreeTensor(xLocal);
}

for (int i = 0; i < maxLoRARank_; i++) {
float acc(0);
for (int32_t j = 0; j < inputHiddenDim_ / TILE_LENGTH; j++) {
if constexpr (INCREMENTAL_MODE) {
CopyInX(idx, j);
}
CopyInW(i, j);
Compute<INCREMENTAL_MODE>(acc);
}
CopyAndComputeLastIteration<INCREMENTAL_MODE>(idx, i, acc);
yOutLocal.SetValue(i, acc);
}
}

__aicore__ inline void CopyInIndex(const int64_t idx)
{
// look up the LoRA index
int64_t weightIdx = idx;
uint64_t i = 0;
for (; i < seqLenGm_.GetSize(); i++) {
int64_t repeatValue = seqLenGm_.GetValue(i);
if (weightIdx >= repeatValue) {
weightIdx -= repeatValue;
continue;
}
break;
}
reqLoRAIndex_ = (i < seqLenGm_.GetSize()) ? loraIndicesGm_.GetValue(i) : -1;
}

__aicore__ inline void CopyInX(const int64_t idx, int32_t colIdx, int32_t numElements = TILE_LENGTH)
{
AscendC::LocalTensor<X_T> xLocal = inQueueX_.AllocTensor<X_T>();
DataCopy(xLocal, xGm_[inputHiddenDim_ * idx + colIdx * TILE_LENGTH], numElements);
inQueueX_.EnQue(xLocal);
}

__aicore__ inline void CopyInW(int32_t rowIdx, int32_t colIdx, int32_t numElements = TILE_LENGTH)
{
AscendC::LocalTensor<W_T> wLocal = inQueueW_.AllocTensor<W_T>();
DataCopy(wLocal, wGm_[reqLoRAWeightOffset_ + rowIdx * inputHiddenDim_ + colIdx * TILE_LENGTH], numElements);
inQueueW_.EnQue(wLocal);
}

template <bool INCREMENTAL_MODE>
__aicore__ inline void Compute(float &acc, int32_t numElements = TILE_LENGTH)
{
AscendC::LocalTensor<W_T> wLocal = inQueueW_.DeQue<W_T>();
AscendC::LocalTensor<float> xTmpTensor = tmpBufferX_.Get<float>();
AscendC::LocalTensor<float> wTmpTensor = tmpBufferW_.Get<float>();

if constexpr (INCREMENTAL_MODE) {
AscendC::LocalTensor<X_T> xLocal = inQueueX_.DeQue<X_T>();
Cast(xTmpTensor, xLocal, AscendC::RoundMode::CAST_NONE, numElements);
Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, numElements);
pipe_barrier(PIPE_V);
inQueueX_.FreeTensor(xLocal);
inQueueW_.FreeTensor(wLocal);
} else {
Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, numElements);
pipe_barrier(PIPE_V);
inQueueW_.FreeTensor(wLocal);
}
// dot product of the one tile of X and W
Mul(wTmpTensor, xTmpTensor, wTmpTensor, numElements);
pipe_barrier(PIPE_V);
// reduce sum generate one number, which is the summation of all the dot product
ReduceSum<float>(wTmpTensor, wTmpTensor, wTmpTensor, numElements);
pipe_barrier(PIPE_V);

acc += wTmpTensor.GetValue(0);
}

template <bool INCREMENTAL_MODE>
__aicore__ inline void CopyAndComputeLastIteration(const int64_t idx, int32_t rowIdx, float &acc)
{
int32_t colIdx = inputHiddenDim_ / TILE_LENGTH;
int32_t remaining = inputHiddenDim_ % TILE_LENGTH;
if (remaining == 0) {
return;
}
if constexpr (INCREMENTAL_MODE) {
CopyInX(idx, colIdx, remaining);
}
CopyInW(rowIdx, colIdx, remaining);
Compute<INCREMENTAL_MODE>(acc, remaining);
}

__aicore__ inline void ScaleOutput()
{
AscendC::LocalTensor<float> yLocal = outBufferY_.Get<float>();
AscendC::LocalTensor<Y_T> yOutLocal = outQueueY_.AllocTensor<Y_T>();

Muls(yOutLocal, yLocal, scale_, maxLoRARank_);
pipe_barrier(PIPE_V);

outQueueY_.EnQue<Y_T>(yOutLocal);
}

__aicore__ inline void CopyOut(const int64_t idx)
{
AscendC::LocalTensor<Y_T> yOutLocal = outQueueY_.DeQue<Y_T>();
DataCopy(yOutGm_[maxLoRARank_ * idx], yOutLocal, maxLoRARank_);
outQueueY_.FreeTensor(yOutLocal);
}

private:
AscendC::TPipe *pipe_;
AscendC::TQue<AscendC::QuePosition::VECIN, BUFFER_NUM> inQueueX_, inQueueW_;
AscendC::TQue<AscendC::QuePosition::VECOUT, 1> outQueueY_;
AscendC::TBuf<AscendC::QuePosition::VECCALC> tmpBufferX_, tmpBufferW_, outBufferY_;
AscendC::GlobalTensor<X_T> xGm_;
AscendC::GlobalTensor<W_T> wGm_;
AscendC::GlobalTensor<int64_t> loraIndicesGm_;
AscendC::GlobalTensor<int64_t> seqLenGm_;
AscendC::GlobalTensor<Y_T> yOutGm_;
uint32_t batchSize_;
uint32_t numTokensPerCore_;
uint32_t inputHiddenDim_;
uint32_t maxLoRARank_;
float scale_;
uint32_t singleLoRAWeightLen_;
int64_t reqLoRAIndex_;
uint64_t reqLoRAWeightOffset_;
bool incremental_;
};

#define SGMV_SHRINK_TYPE_DECLARE(TYPE) \
extern "C" __global__ __aicore__ void sgmv_shrink_##TYPE(__gm__ void* x, __gm__ void* weight, \
__gm__ void* loraIndices, __gm__ void* seqLen, \
__gm__ void* y, uint32_t batchSize, \
uint32_t numTokensPerCore, uint32_t inputHiddenDim, \
uint32_t maxLoRARank, float scale) \
{ \
AscendC::TPipe pipe; \
SGMVShrink<TYPE> op(&pipe); \
op.Init(x, weight, loraIndices, seqLen,y, batchSize, numTokensPerCore, inputHiddenDim, maxLoRARank, scale); \
op.Process(); \
}

// declare all dtype kernel
SGMV_SHRINK_TYPE_DECLARE(half)
#if (__CCE_AICORE__ >= 220)
SGMV_SHRINK_TYPE_DECLARE(bfloat16_t)
#endif

namespace vllm_ascend {
extern void sgmv_shrink_impl(AscendType type, void* stream, void* x, void* weight, void* loraIndices, void* seqLen,
void* y, uint32_t batchSize, uint32_t numTokensPerCore, uint32_t inputHiddenDim,
uint32_t maxLoRARank, float scale)
{
uint32_t blockDim = (batchSize + numTokensPerCore - 1) / numTokensPerCore;
if (type == AscendType::FP16) {
sgmv_shrink_half<<<blockDim, nullptr, stream>>>(x, weight, loraIndices, seqLen, y, batchSize,
numTokensPerCore, inputHiddenDim, maxLoRARank,
scale);
} else if (type == AscendType::BF16) {
#if (__CCE_AICORE__ >= 220)
sgmv_shrink_bfloat16_t<<<blockDim, nullptr, stream>>>(x, weight, loraIndices, seqLen, y, batchSize,
numTokensPerCore, inputHiddenDim, maxLoRARank,
scale);
#endif
} else {
return;
}
}

} // namespace vllm_ascend
30 changes: 30 additions & 0 deletions csrc/ops.h
Original file line number Diff line number Diff line change
Expand Up @@ -88,4 +88,34 @@ namespace vllm_ascend {
uint32_t output_hidden_dim,
uint32_t slice_offset,
uint32_t output_full_dim);

extern void sgmv_shrink_impl(
AscendType type,
void *stream,
void *x,
void *weight,
void *loraIndices,
void *seqLen,
void *y,
uint32_t batch_size,
uint32_t num_tokens_per_core,
uint32_t input_hidden_dim,
uint32_t lora_rank,
float scale);

extern void sgmv_expand_impl(
AscendType type,
void *stream,
void *x,
void *weight,
void *loraIndices,
void *seqLen,
void *y,
void *y_out,
uint32_t batch_size,
uint32_t num_tokens_per_core,
uint32_t lora_rank,
uint32_t output_hidden_dim,
uint32_t slice_offset,
uint32_t output_full_dim);
}
Loading
Loading