Skip to content

Commit

Permalink
[LLM][NPU] reformat npu scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
SylarTiaNII committed Jun 21, 2024
1 parent 0331b4c commit 43c883d
Show file tree
Hide file tree
Showing 7 changed files with 121 additions and 11 deletions.
4 changes: 2 additions & 2 deletions llm/export_npu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,11 @@

set -x

src_path=${1:-"./llama/npu/output/sft_bf16_llama_N1C8/"}
src_path=${1:-".npu/llama/output/sft_bf16_llama_N1C8/"}
dst_path=${2:-"./inference"}

source /usr/local/Ascend/ascend-toolkit/set_env.sh
source /usr/local/Ascend/atb/set_env.sh

export PYTHONPATH=../:$PYTHONPATH
python export_model.py --model_name_or_path ${src_path} --inference_model --output_path ${dst_path} --dtype float16 --device npu --block_attn
python predict/export_model.py --model_name_or_path ${src_path} --inference_model --output_path ${dst_path} --dtype float16 --device npu --block_attn
6 changes: 3 additions & 3 deletions llm/npu/llama/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ cd -
sft为精调策略,我们提供了广告生成数据集demo便于您调试使用
```
#精调:为了方便测试,我们也提供了广告生成数据集可以直接使用:
cd llm/llama/npu
cd llm/npu/llama
wget https://bj.bcebos.com/paddlenlp/datasets/examples/AdvertiseGen.tar.gz
tar -zxvf AdvertiseGen.tar.gz
```
Expand Down Expand Up @@ -145,8 +145,8 @@ bash llama_npu_sft_N1C8.sh
```
为了保障极致压缩的推理成本,我们使用了静态图实现。因此需要从训练产出的动态图模型中导出静态图模型,执行如下命令进行导出:
```
cd ../..
bash export_npu.sh ./llama/npu/output/sft_bf16_llama_N1C8/ ./inference
cd ../../
bash export_npu.sh ./npu/llama/output/sft_bf16_llama_N1C8/ ./inference
```
最终,我们通过静态图的模型执行推理:
```
Expand Down
110 changes: 110 additions & 0 deletions llm/npu/llama/export_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse

import numpy as np
import paddle
from tqdm import tqdm


def parse_arguments():
    """Build and parse the command-line arguments for this script."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "--model_path",
        default="inference/model",
        help="The directory of exported model.",
    )
    return arg_parser.parse_args()


def trans_weight(var):
    """Transpose a 2-D weight variable in place.

    Updates both the recorded shape metadata and the stored data so the
    weight matches a ``trans_y=True`` matmul. Assumes the weight is 2-D
    (only the first two dims are swapped).
    """
    dims = var.desc.shape()
    # Record the transposed layout in the variable's shape metadata.
    var.desc.set_shape([dims[1], dims[0]])

    # Replace the stored values with their transpose, materialized on CPU.
    values = np.array(var.get_value())
    var.get_value().set(values.T, paddle.CPUPlace())


def convert_dequant_scale(var):
    """Convert a dequant scale variable, in place, to the NPU layout.

    Each float32 scale value is paired with a 0.0 filler, and every
    (scale, 0.0) float32 pair is then reinterpreted bit-for-bit as a
    single int64 — the layout the NPU dequant kernels consume.
    """
    scales = np.array(var.get_value()).astype(np.float32).reshape(-1)

    # Interleave: [s0, 0, s1, 0, ...] as a contiguous float32 buffer.
    interleaved = np.zeros((scales.size, 2), dtype=np.float32)
    interleaved[:, 0] = scales

    # Reinterpret the same bytes as int64 (two float32 per int64).
    packed = interleaved.reshape(-1).view(np.int64)
    var.get_value().set(packed, paddle.CPUPlace())


def process_params(model_path):
    """Rewrite an exported static-graph inference model in place for Ascend NPU.

    Two passes over the program's global block:
      1. For every ``matmul_v2`` op whose weight is one of the known linear
         layer weights, set ``trans_y`` and physically transpose the stored
         weight (via ``trans_weight``).
      2. Convert every dequant scale variable to the interleaved int64
         layout the NPU kernels expect (via ``convert_dequant_scale``).

    The modified program is saved back to ``model_path``.

    Args:
        model_path: Path prefix of the exported inference model, as produced
            by ``paddle.static.save_inference_model``.
    """
    # Weights matched by name suffix; the lm_head weight is matched exactly.
    transpose_suffixes = ("qkv_weight", "out_proj_weight", "ffn1_weight", "ffn2_weight")
    dequant_suffixes = ("qkv_out_scale", "linear_out_scale", "ffn1_out_scale", "ffn2_out_scale")

    paddle.enable_static()
    exe = paddle.static.Executor(paddle.CPUPlace())

    prog = paddle.static.Program()
    startup_prog = paddle.static.Program()
    scope = paddle.static.Scope()
    with paddle.base.scope_guard(scope):
        with paddle.base.program_guard(prog, startup_prog):
            [program, feed_target_names, fetch_targets] = paddle.static.io.load_inference_model(model_path, exe)

            # Keep feed vars in program order, restricted to the feed names.
            feed_targets = [var for var in program.list_vars() if var.name in feed_target_names]

            block = program.global_block()

            for op in tqdm(block.ops, desc="processing the linear layer for NPU"):
                if op.type != "matmul_v2":
                    continue
                # Convention: the weight is the last input of matmul_v2.
                w_name = op.input_arg_names[-1]
                needs_transpose = w_name.endswith(transpose_suffixes) or w_name == "llama_lm_head_0.w_0"
                if needs_transpose and not op.attr("trans_y"):
                    op._set_attr("trans_y", True)
                    trans_weight(block.var(w_name))

            for var_name in tqdm(block.vars, desc="processing the dequant layer for NPU"):
                if var_name.endswith(dequant_suffixes):
                    convert_dequant_scale(block.var(var_name))

            paddle.static.save_inference_model(
                model_path, feed_targets, fetch_targets, exe, program=program, skip_prune_program=True
            )


def main():
    """Entry point: parse CLI arguments and convert the exported model for NPU."""
    cli_args = parse_arguments()
    process_params(cli_args.model_path)


if __name__ == "__main__":
    main()
4 changes: 2 additions & 2 deletions llm/npu/llama/llama_npu_lora_N1C8.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,12 @@ export MC2_Recompute=1

source /usr/local/Ascend/ascend-toolkit/set_env.sh
export PYTHONPATH=../../../:$PYTHONPATH
ps aux | grep finetune_generation.py | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep run_finetune.py | grep -v grep | awk '{print $2}' | xargs kill -9

python -u -m paddle.distributed.launch \
--devices "0,1,2,3,4,5,6,7" \
--log_dir "./lora_bf16_llama_N1C8" \
../../finetune_generation.py \
../../run_finetune.py \
--device "npu" \
--model_name_or_path "meta-llama/Llama-2-13b-chat" \
--dataset_name_or_path "data/" \
Expand Down
4 changes: 2 additions & 2 deletions llm/npu/llama/llama_npu_sft_N1C8.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,12 @@ export MC2_Recompute=1

source /usr/local/Ascend/ascend-toolkit/set_env.sh
export PYTHONPATH=../../../:$PYTHONPATH
ps aux | grep finetune_generation.py | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep run_finetune.py | grep -v grep | awk '{print $2}' | xargs kill -9

python -u -m paddle.distributed.launch \
--devices "0,1,2,3,4,5,6,7" \
--log_dir "./sft_bf16_llama_N1C8" \
../../finetune_generation.py \
../../run_finetune.py \
--device "npu" \
--model_name_or_path "meta-llama/Llama-2-13b" \
--dataset_name_or_path "data/" \
Expand Down
2 changes: 1 addition & 1 deletion llm/predict/export_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ def main():
validate_pdmodel(export_args.output_path, predictor_args.model_prefix, predictor_args.device)

if predictor_args.device == "npu":
from llama.npu.export_utils import process_params
from npu.llama.export_utils import process_params

process_params(os.path.join(export_args.output_path, predictor_args.model_prefix))

Expand Down
2 changes: 1 addition & 1 deletion llm/predict_npu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,4 @@ source /usr/local/Ascend/ascend-toolkit/set_env.sh
source /usr/local/Ascend/atb/set_env.sh

export PYTHONPATH=../:$PYTHONPATH
python predictor.py --model_name_or_path ${model_path} --inference_model --dtype "float16" --mode "static" --block_attn --device npu
python predict/predictor.py --model_name_or_path ${model_path} --inference_model --dtype "float16" --mode "static" --block_attn --device npu

0 comments on commit 43c883d

Please sign in to comment.