This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

[example]add bucketing/batchnorm and improved performance for speech_recognition example #6971

Merged (2 commits) on Jul 11, 2017
15 changes: 15 additions & 0 deletions example/speech_recognition/README.md
@@ -123,3 +123,18 @@ The new file should implement two functions, prepare_data() and arch(), for buil

Run the following line after preparing the files.
<pre><code>python main.py --configfile custom.cfg --archfile arch_custom</code></pre>
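For orientation, a custom architecture file might look like the sketch below. This is a minimal, hypothetical example: the function names prepare_data() and arch() come from the instructions above, but every constant, shape, and layer in the body is a placeholder assumption rather than the interface actually expected by main.py (see arch_deepspeech.py for the real signatures).

```python
# arch_custom.py -- hypothetical sketch of a custom architecture file.
# The real interface is defined by arch_deepspeech.py / main.py; the names,
# shapes, and constants below are illustrative assumptions only.
import mxnet as mx

BATCH_SIZE = 12   # assumed; normally taken from the [common] section of the cfg
NUM_HIDDEN = 256  # hypothetical RNN hidden size for this sketch

def prepare_data(args):
    # Describe the recurrent initial-state inputs the data iterator must provide
    # (name/shape convention assumed from typical MXNet RNN examples).
    init_c = [('l0_init_c', (BATCH_SIZE, NUM_HIDDEN))]
    init_h = [('l0_init_h', (BATCH_SIZE, NUM_HIDDEN))]
    return init_c + init_h

def arch(args):
    # Build and return the network symbol; a real model would end in a CTC loss.
    data = mx.sym.Variable('data')
    net = mx.sym.FullyConnected(data=data, num_hidden=1024, name='fc1')
    net = mx.sym.Activation(data=net, act_type='relu', name='relu1')
    net = mx.sym.FullyConnected(data=net, num_hidden=29, name='fc2')  # ~ graphemes + blank
    return net
```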

***
## **Furthermore**
You can prepare the full LibriSpeech dataset by following the instructions at https://github.com/baidu-research/ba-dls-deepspeech.
**Replace Baidu's flac_to_wav.sh script with the flac_to_wav.sh in this repository to avoid a bug.**
```bash
git clone https://github.com/baidu-research/ba-dls-deepspeech
cd ba-dls-deepspeech
./download.sh
cp -f /path/to/example/flac_to_wav.sh ./
./flac_to_wav.sh
python create_desc_json.py /path/to/ba-dls-deepspeech/LibriSpeech/train-clean-100 train_corpus.json
python create_desc_json.py /path/to/ba-dls-deepspeech/LibriSpeech/dev-clean validation_corpus.json
python create_desc_json.py /path/to/ba-dls-deepspeech/LibriSpeech/test-clean test_corpus.json
```
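create_desc_json.py in the Baidu repository writes one JSON object per line; the field names below ("key" for the wav path, "duration" in seconds, "text" for the transcript) are assumed from that repository and may change, so treat this as a quick, optional sanity check rather than part of the pipeline.

```python
# check_corpus.py -- optional sanity check for a generated corpus file (sketch).
# Assumes the ba-dls-deepspeech line format:
#   {"key": wav_path, "duration": seconds, "text": transcript}
import json

def summarize(corpus_path, max_duration=16.0):  # 16.0 matches max_duration in deepspeech.cfg
    total = kept = 0
    with open(corpus_path) as f:
        for line in f:
            utt = json.loads(line)
            total += 1
            if float(utt['duration']) <= max_duration:
                kept += 1
    print('%s: %d utterances, %d within %.1f s' % (corpus_path, total, kept, max_duration))

if __name__ == '__main__':
    summarize('train_corpus.json')
```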
284 changes: 162 additions & 122 deletions example/speech_recognition/arch_deepspeech.py

Large diffs are not rendered by default.

48 changes: 35 additions & 13 deletions example/speech_recognition/deepspeech.cfg
@@ -3,23 +3,27 @@
mode = train
#ex: gpu0,gpu1,gpu2,gpu3
context = gpu0,gpu1,gpu2
#context = gpu0
# checkpoint prefix; checkpoints will be saved under the checkpoints folder with this prefix
prefix = deep
prefix = deep_bucket
# when mode is load or predict, the model is loaded from the checkpoints folder using the model_file name
model_file = deepspeechn_epoch1n_batch-0009
model_file = deep_bucketn_epoch0n_batch-0018
batch_size = 12
#batch_size=4
# log will be saved by the log_filename
log_filename = deep.log
log_filename = deep_bucket.log
# save a checkpoint every n epochs / every n batches
save_checkpoint_every_n_epoch = 1
save_checkpoint_every_n_batch = 1000
save_checkpoint_every_n_batch = 3000
is_bi_graphemes = True
tensorboard_log_dir = tblog/deep
tensorboard_log_dir = tblog/deep_bucket
# if random_seed is -1, the seed is taken from the timestamp
mx_random_seed = -1
random_seed = -1
kvstore_option = device

[data]
max_duration = 16.0
train_json = ./train_corpus_all.json
test_json = ./test_corpus.json
val_json = ./test_corpus.json
@@ -50,31 +54,49 @@ rnn_type = bigru
#vanilla_lstm or fc_lstm (no effect when network_type is gru, bigru)
lstm_type = fc_lstm
is_batchnorm = True
is_bucketing = True
buckets = [200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600]

[train]
num_epoch = 70
learning_rate = 0.0003
# constant learning rate annealing by factor
learning_rate_annealing = 1.1
# supports only sgd and adam
optimizer = sgd
# for sgd
momentum = 0.9
# set to 0 to disable gradient clipping
clip_gradient = 0
initializer = Xavier
init_scale = 2
factor_type = in
weight_decay = 0.
# show progress every nth batch
show_every = 100
save_optimizer_states = True
normalize_target_k = 13000
normalize_target_k = 100000
# overwrite meta files(feats_mean,feats_std,unicode_en_baidu_bi_graphemes.csv)
overwrite_meta_files = True
overwrite_bi_graphemes_dictionary = False
# save features extracted from sound files as csv files; this can take a lot of disk space
save_feature_as_csvfile = False
enable_logging_train_metric = True
enable_logging_validation_metric = True

[load]
load_optimizer_states = True
is_start_from_batch = True

[optimizer]
optimizer = sgd
# define parameters for optimizer
# optimizer_params_dictionary must use double quotes ("), not single quotes ('), as the string wrapper
# sgd/nag
optimizer_params_dictionary={"momentum":0.9}
# dcasgd
# optimizer_params_dictionary={"momentum":0.9, "lamda":1.0}
# adam
# optimizer_params_dictionary={"beta1":0.9,"beta2":0.999}
# adagrad
# optimizer_params_dictionary={"eps":1e-08}
# rmsprop
# optimizer_params_dictionary={"gamma1":0.9, "gamma2":0.9,"epsilon":1e-08}
# adadelta
# optimizer_params_dictionary={"rho":0.95, "epsilon":1e-08}
# set to 0 to disable gradient clipping
clip_gradient = 100
weight_decay = 0.
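For context on the new is_bucketing / buckets options in deepspeech.cfg above: bucketing groups utterances by length so each batch is padded only to its bucket's length rather than to the longest utterance in the dataset. The sketch below illustrates only the bucket-assignment idea; it is not the example's actual iterator code.

```python
# bucket_assign.py -- illustrative sketch of bucket assignment (not the example's code).
import bisect

# Frame-length buckets, copied from the buckets option in deepspeech.cfg.
BUCKETS = [200, 300, 400, 500, 600, 700, 800, 900, 1000,
           1100, 1200, 1300, 1400, 1500, 1600]

def assign_bucket(num_frames, buckets=BUCKETS):
    """Return the smallest bucket length >= num_frames, or None if the utterance is too long."""
    i = bisect.bisect_left(buckets, num_frames)
    return buckets[i] if i < len(buckets) else None

print(assign_bucket(420))   # -> 500: a 420-frame utterance is padded to 500 frames, not 1600
print(assign_bucket(1700))  # -> None: longer than the largest bucket, so it would be skipped
```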
50 changes: 34 additions & 16 deletions example/speech_recognition/default.cfg
@@ -6,20 +6,22 @@ context = gpu0
# checkpoint prefix; checkpoints will be saved under the checkpoints folder with this prefix
prefix = test_fc
# when mode is load or predict, the model is loaded from the checkpoints folder using the model_file name
model_file = test_fc-0001
model_file = test_fc-0040
batch_size = 2
# log will be saved by the log_filename
log_filename = test.log
# save a checkpoint every n epochs / every n batches
save_checkpoint_every_n_epoch = 1
save_checkpoint_every_n_epoch = 20
save_checkpoint_every_n_batch = 1000
is_bi_graphemes = False
tensorboard_log_dir = tblog/libri_sample
# if random_seed is -1, the seed is taken from the timestamp
mx_random_seed = -1
random_seed = -1
mx_random_seed = 1234
random_seed = 1234
kvstore_option = device

[data]
max_duration = 16.0
train_json = ./Libri_sample.json
test_json = ./Libri_sample.json
val_json = ./Libri_sample.json
@@ -37,8 +39,8 @@ conv_layer1_stride = [2, 2]
conv_layer2_filter_dim = [11, 21]
conv_layer2_stride = [1, 2]

num_rnn_layer = 3
num_hidden_rnn_list = [1760, 1760, 1760]
num_rnn_layer = 1
num_hidden_rnn_list = [1760]
num_hidden_proj = 0

num_rear_fc_layers = 0
@@ -50,33 +52,49 @@ rnn_type = bigru
#vanilla_lstm or fc_lstm (no effect when network_type is gru, bigru)
lstm_type = fc_lstm
is_batchnorm = True
is_bucketing = False
buckets = []

[train]
num_epoch = 70

num_epoch = 50
learning_rate = 0.005
# constant learning rate annealing by factor
learning_rate_annealing = 1.1
# supports only sgd and adam
optimizer = adam
# for sgd
momentum = 0.9
# set to 0 to disable gradient clipping
clip_gradient = 0

initializer = Xavier
init_scale = 2
factor_type = in
weight_decay = 0.00001
# show progress every nth batch
show_every = 1
save_optimizer_states = True
normalize_target_k = 2
# overwrite meta files(feats_mean,feats_std,unicode_en_baidu_bi_graphemes.csv)
overwrite_meta_files = True
overwrite_bi_graphemes_dictionary = False
# save features extracted from sound files as csv files; this can take a lot of disk space
save_feature_as_csvfile = False
enable_logging_train_metric = True
enable_logging_validation_metric = True

[load]
load_optimizer_states = True
is_start_from_batch = False

[optimizer]
optimizer = adam
# define parameters for optimizer
# optimizer_params_dictionary must use double quotes ("), not single quotes ('), as the string wrapper
# sgd/nag
# optimizer_params_dictionary={"momentum":0.9}
# dcasgd
# optimizer_params_dictionary={"momentum":0.9, "lamda":1.0}
# adam
optimizer_params_dictionary={"beta1":0.9,"beta2":0.999}
# adagrad
# optimizer_params_dictionary={"eps":1e-08}
# rmsprop
# optimizer_params_dictionary={"gamma1":0.9, "gamma2":0.9,"epsilon":1e-08}
# adadelta
# optimizer_params_dictionary={"rho":0.95, "epsilon":1e-08}
# set to 0 to disable gradient clipping
clip_gradient = 0
weight_decay = 0.
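Regarding the optimizer_params_dictionary comments in both config files (double quotes, not single quotes): that requirement is consistent with the value being parsed as JSON. The snippet below only demonstrates that assumption; the example's actual parsing code may differ.

```python
# Demonstrates why double quotes matter if optimizer_params_dictionary is parsed as JSON.
import json

raw = '{"momentum":0.9}'      # as written for sgd/nag in deepspeech.cfg
print(json.loads(raw))        # {'momentum': 0.9} -- valid JSON, double quotes

try:
    json.loads("{'momentum':0.9}")   # single quotes are not valid JSON
except ValueError as err:
    print('single-quoted string fails to parse:', err)
```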