# NOTE: this file is a `git format-patch` mail ("From <sha> Mon Sep 17 00:00:00 2001",
# "Subject: [PATCH] pipeline parallel benchmark"), not an executable shell script.
# Its line breaks are collapsed (the whole patch sits on three physical lines), so it
# presumably has to be re-wrapped into one diff line per physical line before git can
# apply it -- TODO confirm against the original patch.
#
# What the patch carries (3 files changed, 63 insertions, 1 deletion per its diffstat):
#   1. New file tests/test_tipc/dygraph/hybrid_parallelism/ce_gpt/N1C8/
#      CE_gpt-345m_seqlen1024_pretrain_bs32_bf16_PP8-mbs16-acc2.sh:
#      sets dp_degree=1, mp_degree=1, pp_degree=8, bs_item=32, fp_item=bf16, micro_bs=16,
#      run_mode=PP8-mbs16-acc2, device_num=N1C8, max_iter=50000; runs prepare.sh, then
#      run_benchmark.sh with ten positional arguments (model_item, fp_item, dp_degree,
#      mp_degree, pp_degree, micro_bs, bs_item, run_mode, device_num, max_iter) and
#      stderr redirected to stdout.
#   2. New file .../N1C8/gpt-345m_seqlen1024_pretrain_bs32_bf16_PP8-mbs16-acc2.sh:
#      same assignments and commands as (1) except max_iter=100.
#   3. .../ce_gpt/benchmark_common/run_benchmark.sh: file mode 100644 -> 100755, and
#      PP8-mbs16-acc2 is added to the case pattern inside _train() whose branch builds
#      the `python -m paddle.distributed.launch --log_dir=./mylog
#      --devices=0,1,2,3,4,5,6,7 ... run_pretrain.py` command and sets workerlog_id=0.
#
# NOTE(review): the file name of script (1) equals
#   ${model_item}_bs${bs_item}_${fp_item}_${run_mode}.sh, but script (2) also sets
#   model_item=CE_gpt-345m_seqlen1024_pretrain although its file name has no "CE_"
#   prefix. Looks like a copy-paste leftover; confirm whether (2) should use
#   model_item=gpt-345m_seqlen1024_pretrain (run_benchmark.sh's use of model_item is
#   not visible in this patch, so the impact cannot be judged from here).
# NOTE(review): both scripts assign model=gpt, but it is neither exported nor passed as
#   an argument, so the child `bash` processes started below cannot read it; it appears
#   unused in the visible script text.
# NOTE(review): both scripts call ./test_tipc/... while the files are created under
#   tests/test_tipc/...; presumably they are meant to be started with tests/ as the
#   working directory -- confirm against the caller.
# NOTE(review): the new files carry "Copyright (c) 2022" while the patch is dated 2024.
From 51473791c14fea4fb378fe911d53ff78c2cf0cba Mon Sep 17 00:00:00 2001 From: zhangting2020 Date: Tue, 2 Jan 2024 18:19:05 +0800 Subject: [PATCH] pipeline parallel benchmark --- ...n1024_pretrain_bs32_bf16_PP8-mbs16-acc2.sh | 31 +++++++++++++++++++ ...n1024_pretrain_bs32_bf16_PP8-mbs16-acc2.sh | 31 +++++++++++++++++++ .../ce_gpt/benchmark_common/run_benchmark.sh | 2 +- 3 files changed, 63 insertions(+), 1 deletion(-) create mode 100644 tests/test_tipc/dygraph/hybrid_parallelism/ce_gpt/N1C8/CE_gpt-345m_seqlen1024_pretrain_bs32_bf16_PP8-mbs16-acc2.sh create mode 100644 tests/test_tipc/dygraph/hybrid_parallelism/ce_gpt/N1C8/gpt-345m_seqlen1024_pretrain_bs32_bf16_PP8-mbs16-acc2.sh mode change 100644 => 100755 tests/test_tipc/dygraph/hybrid_parallelism/ce_gpt/benchmark_common/run_benchmark.sh diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/ce_gpt/N1C8/CE_gpt-345m_seqlen1024_pretrain_bs32_bf16_PP8-mbs16-acc2.sh b/tests/test_tipc/dygraph/hybrid_parallelism/ce_gpt/N1C8/CE_gpt-345m_seqlen1024_pretrain_bs32_bf16_PP8-mbs16-acc2.sh new file mode 100644 index 000000000000..b906fc6f4eba --- /dev/null +++ b/tests/test_tipc/dygraph/hybrid_parallelism/ce_gpt/N1C8/CE_gpt-345m_seqlen1024_pretrain_bs32_bf16_PP8-mbs16-acc2.sh @@ -0,0 +1,31 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +model_item=CE_gpt-345m_seqlen1024_pretrain +dp_degree=1 +mp_degree=1 +pp_degree=8 +bs_item=32 +fp_item=bf16 +run_mode=PP8-mbs16-acc2 +device_num=N1C8 +max_iter=50000 + +model=gpt +micro_bs=16 + +bash ./test_tipc/dygraph/hybrid_parallelism/ce_gpt/benchmark_common/prepare.sh +# run +bash ./test_tipc/dygraph/hybrid_parallelism/ce_gpt/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ +${max_iter} 2>&1; diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/ce_gpt/N1C8/gpt-345m_seqlen1024_pretrain_bs32_bf16_PP8-mbs16-acc2.sh b/tests/test_tipc/dygraph/hybrid_parallelism/ce_gpt/N1C8/gpt-345m_seqlen1024_pretrain_bs32_bf16_PP8-mbs16-acc2.sh new file mode 100644 index 000000000000..8e14df6b753d --- /dev/null +++ b/tests/test_tipc/dygraph/hybrid_parallelism/ce_gpt/N1C8/gpt-345m_seqlen1024_pretrain_bs32_bf16_PP8-mbs16-acc2.sh @@ -0,0 +1,31 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +model_item=CE_gpt-345m_seqlen1024_pretrain +dp_degree=1 +mp_degree=1 +pp_degree=8 +bs_item=32 +fp_item=bf16 +run_mode=PP8-mbs16-acc2 +device_num=N1C8 +max_iter=100 + +model=gpt +micro_bs=16 + +bash ./test_tipc/dygraph/hybrid_parallelism/ce_gpt/benchmark_common/prepare.sh +# run +bash ./test_tipc/dygraph/hybrid_parallelism/ce_gpt/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ +${max_iter} 2>&1; diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/ce_gpt/benchmark_common/run_benchmark.sh b/tests/test_tipc/dygraph/hybrid_parallelism/ce_gpt/benchmark_common/run_benchmark.sh old mode 100644 new mode 100755 index 00509c21dffc..7570fcaab546 --- a/tests/test_tipc/dygraph/hybrid_parallelism/ce_gpt/benchmark_common/run_benchmark.sh +++ b/tests/test_tipc/dygraph/hybrid_parallelism/ce_gpt/benchmark_common/run_benchmark.sh @@ -151,7 +151,7 @@ function _train(){ run_pretrain.py ${train_cmd}" workerlog_id=0 ;; - DP8-mbs2-acc2|SD8-stage1-mbs2-acc2|SD8-stage2-mbs2-acc2|SD8-stage3-mbs2-acc2|MP2-SD4-stage1-mbs4-acc2|MP2-SP2-PP2-DP2-mbs8-acc2|MP8-mbs16-acc2) echo "run run_mode: ${run_mode}" + DP8-mbs2-acc2|SD8-stage1-mbs2-acc2|SD8-stage2-mbs2-acc2|SD8-stage3-mbs2-acc2|PP8-mbs16-acc2|MP2-SD4-stage1-mbs4-acc2|MP2-SP2-PP2-DP2-mbs8-acc2|MP8-mbs16-acc2) echo "run run_mode: ${run_mode}" train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --devices=0,1,2,3,4,5,6,7 ${PADDLE_RANK_OPTION}\ run_pretrain.py ${train_cmd}" workerlog_id=0