Skip to content

Commit

Permalink
update_run_pretrain_auto.sh
Browse files Browse the repository at this point in the history
  • Loading branch information
xuxinyi389 committed Feb 24, 2025
1 parent 2980efb commit 4653dd4
Showing 1 changed file with 2 additions and 22 deletions.
24 changes: 2 additions & 22 deletions llm/auto_parallel/deepseek-v3/run_pretrain_auto.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,32 +16,12 @@
# Enable command tracing for debugging, and clear any GPU restriction so the
# distributed launcher can see every device on the node.
set -x
unset CUDA_VISIBLE_DEVICES

# Derive this process's distributed rank/size from OpenMPI environment
# variables, falling back to single-node defaults when not run under mpirun.
mpi_rank=${OMPI_COMM_WORLD_RANK:-0}
node_rank=$((mpi_rank+offset))  # NOTE(review): 'offset' is not set anywhere visible — presumably exported by the job platform; confirm
mpi_node=${OMPI_COMM_WORLD_SIZE:-1}
echo "MPI status:${mpi_rank}/${mpi_node}"
nnode_train=${nnode_set:-${mpi_node}}  # node count, overridable via 'nnode_set'
master_train=${master:-localhost}      # master address, overridable via 'master'
#
echo "Distributed Training ${node_rank}/${nnode_train} master=${master_train}"
set -x

# Unset the platform's preset environment variables: the framework uses a
# backward-compatible upgrade path, and detecting these settings would make it
# fall back to the legacy launch method.
unset PADDLE_ELASTIC_JOB_ID
unset PADDLE_TRAINER_ENDPOINTS
unset DISTRIBUTED_TRAINER_ENDPOINTS
unset FLAGS_START_PORT
unset PADDLE_ELASTIC_TIMEOUT
# Paddle-native rank/size (set by the Paddle launcher environment).
nnodes=$PADDLE_TRAINERS_NUM
rank=$PADDLE_TRAINER_ID

# Remove any previous output and log directories for this task so each run
# starts from a clean state.
task_name="deepseekv3"
rm -rf output/$task_name/
rm -rf "output/$task_name""_log"

# Verbose logging for the static-operator-translation (SOT) component.
export SOT_LOG_LEVEL=4
# Put the repo root on PYTHONPATH so the local PaddleNLP sources are importable.
export PYTHONPATH=../../../:$PYTHONPATH
# NOTE(review): hardcoded per-user Paddle build path — will not exist on other
# machines; confirm whether this developer-specific line should ship.
export PYTHONPATH=/root/paddlejob/workspace/env_run/xuxinyi/Paddle/build/python:$PYTHONPATH
#ulimit -c unlimited
# export GLOG_v=3

Expand Down Expand Up @@ -95,8 +75,8 @@ python -u -m paddle.distributed.launch \
--data_impl "mmap" \
--enable_auto_parallel 1 \
--max_grad_norm 1.0 \
--num_hidden_layers 5 \
--num_hidden_layers 6 \
--use_intermediate_api true \
--to_static $to_static \
--first_k_dense_replace 6 \
--first_k_dense_replace 7 \
--hidden_size 1792 \

0 comments on commit 4653dd4

Please sign in to comment.