add bucketing/batchnorm and improved performance for speech_recognition example (#6971)

Soonhwan-Kwon authored and piiswrong committed Jul 11, 2017
1 parent b1cb2cd commit 22f9a0d
Showing 15 changed files with 685 additions and 403 deletions.
15 changes: 15 additions & 0 deletions example/speech_recognition/README.md
@@ -123,3 +123,18 @@ The new file should implement two functions, prepare_data() and arch(), for buil

Run the following line after preparing the files.
<pre><code>python main.py --configfile custom.cfg --archfile arch_custom</code></pre>
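For orientation, a bare-bones arch_custom.py might look like the sketch below. The function names come from the paragraph above; the exact signatures and the fields read from the config are assumptions, so treat arch_deepspeech.py as the authoritative template.
```python
# arch_custom.py -- skeleton only; the signatures below are assumed,
# check arch_deepspeech.py for the real interface.
import mxnet as mx

def prepare_data(args):
    # Assumed role: compute anything the network needs before building the
    # symbol (e.g. input feature dimensions) from the parsed config in `args`.
    pass

def arch(args):
    # Assumed role: return the mx.sym graph described by the [arch] section.
    data = mx.sym.Variable('data')
    net = mx.sym.FullyConnected(data=data, num_hidden=128, name='fc1')
    net = mx.sym.Activation(data=net, act_type='relu', name='relu1')
    return net
```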

***
## **Furthermore**
You can prepare the full LibriSpeech dataset by following the instructions at https://github.com/baidu-research/ba-dls-deepspeech
**Replace Baidu's flac_to_wav.sh script with the flac_to_wav.sh in this repository to avoid a bug.**
```bash
git clone https://github.com/baidu-research/ba-dls-deepspeech
cd ba-dls-deepspeech
./download.sh
# use the fixed flac_to_wav.sh shipped with this example instead of Baidu's
cp -f /path/to/example/flac_to_wav.sh ./
./flac_to_wav.sh
# generate the descriptor JSON files referenced by the .cfg files
python create_desc_json.py /path/to/ba-dls-deepspeech/LibriSpeech/train-clean-100 train_corpus.json
python create_desc_json.py /path/to/ba-dls-deepspeech/LibriSpeech/dev-clean validation_corpus.json
python create_desc_json.py /path/to/ba-dls-deepspeech/LibriSpeech/test-clean test_corpus.json
```
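Each create_desc_json.py call writes a descriptor file that the .cfg files point at (train_json, test_json, val_json). A quick sanity check such as the sketch below can catch conversion problems early; it assumes the ba-dls-deepspeech format of one JSON object per line with "key" (wav path), "duration" (seconds) and "text" fields, so verify against your own output.
```python
# Hypothetical sanity check for a generated descriptor file (format assumed).
import json

def summarize(corpus_json):
    durations = []
    with open(corpus_json) as f:
        for line in f:
            entry = json.loads(line)
            durations.append(entry["duration"])
    print("%s: %d utterances, %.1f hours of audio"
          % (corpus_json, len(durations), sum(durations) / 3600.0))

summarize("train_corpus.json")
```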
284 changes: 162 additions & 122 deletions example/speech_recognition/arch_deepspeech.py

Large diffs are not rendered by default.
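Since the arch_deepspeech.py diff is not rendered here, the sketch below only illustrates the general idea behind the batch normalization mentioned in the commit title: inserting mx.sym.BatchNorm between a layer and its activation. It is an assumption for illustration, not the commit's actual code.
```python
import mxnet as mx

def fc_bn_relu(data, num_hidden, name):
    # Fully connected layer -> batch norm -> ReLU; a simplified, assumed
    # stand-in for how batchnorm can be wired into the network.
    fc = mx.sym.FullyConnected(data=data, num_hidden=num_hidden, name=name + '_fc')
    bn = mx.sym.BatchNorm(data=fc, fix_gamma=False, name=name + '_bn')
    return mx.sym.Activation(data=bn, act_type='relu', name=name + '_relu')
```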

48 changes: 35 additions & 13 deletions example/speech_recognition/deepspeech.cfg
@@ -3,23 +3,27 @@
mode = train
#ex: gpu0,gpu1,gpu2,gpu3
context = gpu0,gpu1,gpu2
#context = gpu0
# checkpoint prefix; checkpoints will be saved under the checkpoints folder with this prefix
prefix = deep
prefix = deep_bucket
# when mode is load or predict, the model is loaded from the file named by model_file under checkpoints
model_file = deepspeechn_epoch1n_batch-0009
model_file = deep_bucketn_epoch0n_batch-0018
batch_size = 12
#batch_size=4
# log will be saved to the file named by log_filename
log_filename = deep.log
log_filename = deep_bucket.log
# set n to save a checkpoint every n epochs
save_checkpoint_every_n_epoch = 1
save_checkpoint_every_n_batch = 1000
save_checkpoint_every_n_batch = 3000
is_bi_graphemes = True
tensorboard_log_dir = tblog/deep
tensorboard_log_dir = tblog/deep_bucket
# if random_seed is -1, the seed is taken from the current timestamp
mx_random_seed = -1
random_seed = -1
kvstore_option = device

[data]
max_duration = 16.0
train_json = ./train_corpus_all.json
test_json = ./test_corpus.json
val_json = ./test_corpus.json
@@ -50,31 +54,49 @@ rnn_type = bigru
#vanilla_lstm or fc_lstm (no effect when network_type is gru, bigru)
lstm_type = fc_lstm
is_batchnorm = True
is_bucketing = True
buckets = [200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600]

[train]
num_epoch = 70
learning_rate = 0.0003
# constant learning rate annealing by factor
learning_rate_annealing = 1.1
# supports only sgd and adam
optimizer = sgd
# for sgd
momentum = 0.9
# set to 0 to disable gradient clipping
clip_gradient = 0
initializer = Xavier
init_scale = 2
factor_type = in
weight_decay = 0.
# show progress every nth batch
show_every = 100
save_optimizer_states = True
normalize_target_k = 13000
normalize_target_k = 100000
# overwrite meta files (feats_mean, feats_std, unicode_en_baidu_bi_graphemes.csv)
overwrite_meta_files = True
overwrite_bi_graphemes_dictionary = False
# save features extracted from sound files as csv files; this can take a lot of disk space
save_feature_as_csvfile = False
enable_logging_train_metric = True
enable_logging_validation_metric = True

[load]
load_optimizer_states = True
is_start_from_batch = True

[optimizer]
optimizer = sgd
# define parameters for optimizer
# optimizer_params_dictionary must use double quotes ("), not single quotes ('), as the string wrapper
# sgd/nag
optimizer_params_dictionary={"momentum":0.9}
# dcasgd
# optimizer_params_dictionary={"momentum":0.9, "lamda":1.0}
# adam
# optimizer_params_dictionary={"beta1":0.9,"beta2":0.999}
# adagrad
# optimizer_params_dictionary={"eps":1e-08}
# rmsprop
# optimizer_params_dictionary={"gamma1":0.9, "gamma2":0.9,"epsilon":1e-08}
# adadelta
# optimizer_params_dictionary={"rho":0.95, "epsilon":1e-08}
# set to 0 to disable gradient clipping
clip_gradient = 100
weight_decay = 0.
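The is_bucketing/buckets settings above group utterances by length so that padding (and therefore wasted computation) stays small. The sketch below is not the example's actual code; it only shows the usual assignment rule: each utterance goes into the smallest bucket that can hold it, and anything longer than the largest bucket is dropped.
```python
# Minimal illustration of bucket assignment (assumed rule, not the example's code).
import bisect

buckets = [200, 300, 400, 500, 600, 700, 800, 900, 1000,
           1100, 1200, 1300, 1400, 1500, 1600]

def assign_bucket(num_frames, buckets):
    """Return the smallest bucket >= num_frames, or None if nothing fits."""
    idx = bisect.bisect_left(buckets, num_frames)
    return buckets[idx] if idx < len(buckets) else None

print(assign_bucket(350, buckets))   # -> 400
print(assign_bucket(1700, buckets))  # -> None (utterance too long for any bucket)
```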
50 changes: 34 additions & 16 deletions example/speech_recognition/default.cfg
@@ -6,20 +6,22 @@ context = gpu0
# checkpoint prefix; checkpoints will be saved under the checkpoints folder with this prefix
prefix = test_fc
# when mode is load or predict, the model is loaded from the file named by model_file under checkpoints
model_file = test_fc-0001
model_file = test_fc-0040
batch_size = 2
# log will be saved to the file named by log_filename
log_filename = test.log
# set n to save a checkpoint every n epochs
save_checkpoint_every_n_epoch = 1
save_checkpoint_every_n_epoch = 20
save_checkpoint_every_n_batch = 1000
is_bi_graphemes = False
tensorboard_log_dir = tblog/libri_sample
# if random_seed is -1, the seed is taken from the current timestamp
mx_random_seed = -1
random_seed = -1
mx_random_seed = 1234
random_seed = 1234
kvstore_option = device

[data]
max_duration = 16.0
train_json = ./Libri_sample.json
test_json = ./Libri_sample.json
val_json = ./Libri_sample.json
@@ -37,8 +39,8 @@ conv_layer1_stride = [2, 2]
conv_layer2_filter_dim = [11, 21]
conv_layer2_stride = [1, 2]

num_rnn_layer = 3
num_hidden_rnn_list = [1760, 1760, 1760]
num_rnn_layer = 1
num_hidden_rnn_list = [1760]
num_hidden_proj = 0

num_rear_fc_layers = 0
@@ -50,33 +52,49 @@ rnn_type = bigru
#vanilla_lstm or fc_lstm (no effect when network_type is gru, bigru)
lstm_type = fc_lstm
is_batchnorm = True
is_bucketing = False
buckets = []

[train]
num_epoch = 70

num_epoch = 50
learning_rate = 0.005
# constant learning rate annealing by factor
learning_rate_annealing = 1.1
# supports only sgd and adam
optimizer = adam
# for sgd
momentum = 0.9
# set to 0 to disable gradient clipping
clip_gradient = 0

initializer = Xavier
init_scale = 2
factor_type = in
weight_decay = 0.00001
# show progress every nth batch
show_every = 1
save_optimizer_states = True
normalize_target_k = 2
# overwrite meta files (feats_mean, feats_std, unicode_en_baidu_bi_graphemes.csv)
overwrite_meta_files = True
overwrite_bi_graphemes_dictionary = False
# save features extracted from sound files as csv files; this can take a lot of disk space
save_feature_as_csvfile = False
enable_logging_train_metric = True
enable_logging_validation_metric = True

[load]
load_optimizer_states = True
is_start_from_batch = False

[optimizer]
optimizer = adam
# define parameters for optimizer
# optimizer_params_dictionary must use double quotes ("), not single quotes ('), as the string wrapper
# sgd/nag
# optimizer_params_dictionary={"momentum":0.9}
# dcasgd
# optimizer_params_dictionary={"momentum":0.9, "lamda":1.0}
# adam
optimizer_params_dictionary={"beta1":0.9,"beta2":0.999}
# adagrad
# optimizer_params_dictionary={"eps":1e-08}
# rmsprop
# optimizer_params_dictionary={"gamma1":0.9, "gamma2":0.9,"epsilon":1e-08}
# adadelta
# optimizer_params_dictionary={"rho":0.95, "epsilon":1e-08}
# set to 0 to disable gradient clipping
clip_gradient = 0
weight_decay = 0.
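The requirement to wrap strings in double quotes in optimizer_params_dictionary suggests the value is parsed as JSON. The sketch below shows why single quotes would fail under that assumption; the actual parsing code in the example may differ.
```python
# Why double quotes matter if the dictionary is parsed as JSON (assumption).
import json

print(json.loads('{"momentum":0.9}'))      # ok: {'momentum': 0.9}

try:
    json.loads("{'momentum':0.9}")          # single quotes are not valid JSON
except ValueError as err:
    print("rejected:", err)
```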
