@@ -48,13 +48,19 @@ train_config=conf/train_conformer.yaml
48
48
cmvn=true
49
49
dir=exp/conformer
50
50
checkpoint=
51
+ num_workers=8
52
+ prefetch=500
51
53
52
54
# Using average_checkpoint will give a better result
53
55
average_checkpoint=true
54
56
decode_checkpoint=$dir /final.pt
55
57
average_num=30
56
58
decode_modes=" ctc_greedy_search ctc_prefix_beam_search attention attention_rescoring"
57
59
60
+ deepspeed=false
61
+ deepspeed_config=conf/ds_stage2.json
62
+ deepspeed_save_states=" model_only"
63
+
58
64
. tools/parse_options.sh || exit 1;
59
65
60
66
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
@@ -116,11 +122,12 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
116
122
# You have to rm `INIT_FILE` manually when you resume or restart a
117
123
# multi-machine training run.
118
124
INIT_FILE=$dir /ddp_init
125
+ rm -f ${INIT_FILE} # remove previous INIT_FILE
119
126
init_method=file://$( readlink -f $INIT_FILE )
120
127
echo " $0 : init method is $init_method "
121
128
num_gpus=$( echo $CUDA_VISIBLE_DEVICES | awk -F " ," ' {print NF}' )
122
129
# Use "nccl" if it works, otherwise use "gloo"
123
- dist_backend=" gloo "
130
+ dist_backend=" nccl "
124
131
world_size=` expr $num_gpus \* $num_nodes `
125
132
echo " total gpus is: $world_size "
126
133
cmvn_opts=
@@ -130,30 +137,60 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
130
137
# train.py rewrites $train_config to $dir/train.yaml with model input
131
138
# and output dimension, and $dir/train.yaml will be used for inference
132
139
# and export.
133
- for (( i = 0 ; i < $num_gpus ; ++ i)) ; do
134
- {
135
- gpu_id=$( echo $CUDA_VISIBLE_DEVICES | cut -d' ,' -f$[$i +1])
136
- # Rank of each gpu/process used for knowing whether it is
137
- # the master of a worker.
138
- rank=` expr $node_rank \* $num_gpus + $i `
139
- python wenet/bin/train.py --gpu $gpu_id \
140
- --config $train_config \
141
- --data_type $data_type \
142
- --symbol_table $dict \
143
- --train_data data/$train_set /data.list \
144
- --cv_data data/dev/data.list \
145
- ${checkpoint: +--checkpoint $checkpoint } \
146
- --model_dir $dir \
147
- --ddp.init_method $init_method \
148
- --ddp.world_size $world_size \
149
- --ddp.rank $rank \
150
- --ddp.dist_backend $dist_backend \
151
- --num_workers 1 \
152
- $cmvn_opts \
153
- --pin_memory
154
- } &
155
- done
156
- wait
140
+ if [ ${deepspeed} == true ]; then
141
+ echo " using deepspeed"
142
+ # NOTE(xcsong): deepspeed fails with gloo, see
143
+ # https://github.com/microsoft/DeepSpeed/issues/2818
144
+ dist_backend=" nccl"
145
+ [ ! -f data/$train_set /data.list.filter ] && \
146
+ python tools/filter_uneven_data.py data/$train_set /data.list \
147
+ $data_type $num_gpus $num_utts_per_shard data/$train_set /data.list.filter
148
+ deepspeed --include localhost:$CUDA_VISIBLE_DEVICES \
149
+ wenet/bin/train.py \
150
+ --deepspeed \
151
+ --deepspeed_config ${deepspeed_config} \
152
+ --deepspeed.save_states ${deepspeed_save_states} \
153
+ --ddp.dist_backend $dist_backend \
154
+ --ddp.init_method $init_method \
155
+ --data_type $data_type \
156
+ --config $train_config \
157
+ --symbol_table data/dict/lang_char.txt \
158
+ --train_data data/$train_set /data.list.filter \
159
+ --cv_data data/dev/data.list \
160
+ ${checkpoint: +--checkpoint $checkpoint } \
161
+ --model_dir $dir \
162
+ --num_workers ${num_workers} \
163
+ --prefetch ${prefetch} \
164
+ $cmvn_opts \
165
+ --pin_memory
166
+ else
167
+ echo " using torch ddp"
168
+ for (( i = 0 ; i < $num_gpus ; ++ i)) ; do
169
+ {
170
+ gpu_id=$( echo $CUDA_VISIBLE_DEVICES | cut -d' ,' -f$[$i +1])
171
+ # Rank of each gpu/process used for knowing whether it is
172
+ # the master of a worker.
173
+ rank=` expr $node_rank \* $num_gpus + $i `
174
+ python wenet/bin/train.py --gpu $gpu_id \
175
+ --config $train_config \
176
+ --data_type $data_type \
177
+ --symbol_table $dict \
178
+ --train_data data/$train_set /data.list \
179
+ --cv_data data/dev/data.list \
180
+ ${checkpoint: +--checkpoint $checkpoint } \
181
+ --model_dir $dir \
182
+ --ddp.init_method $init_method \
183
+ --ddp.world_size $world_size \
184
+ --ddp.rank $rank \
185
+ --ddp.dist_backend $dist_backend \
186
+ --num_workers ${num_workers} \
187
+ --prefetch ${prefetch} \
188
+ $cmvn_opts \
189
+ --pin_memory
190
+ } &
191
+ done
192
+ wait
193
+ fi
157
194
fi
158
195
159
196
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
@@ -171,8 +208,8 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
171
208
# non-streaming model. The default value is -1, which is full chunk
172
209
# for non-streaming inference.
173
210
decoding_chunk_size=
174
- ctc_weight=0.5
175
- reverse_weight=0.0
211
+ ctc_weight=0.3
212
+ reverse_weight=0.5
176
213
for mode in ${decode_modes} ; do
177
214
{
178
215
test_dir=$dir /test_${mode}
0 commit comments