add bucketing/batchnorm and improved performance for speech_recognition example (#6971)

Soonhwan-Kwon authored and piiswrong committed Jul 11, 2017
1 parent b1cb2cd commit 22f9a0d
Showing 15 changed files with 685 additions and 403 deletions.
15 changes: 15 additions & 0 deletions example/speech_recognition/README.md
@@ -123,3 +123,18 @@ The new file should implement two functions, prepare_data() and arch(), for buil

Run the following line after preparing the files.
<pre><code>python main.py --configfile custom.cfg --archfile arch_custom</code></pre>
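For orientation, a bare-bones arch_custom.py might look like the sketch below. The function names come from the paragraph above; the exact signatures and the fields read from the config are assumptions, so treat arch_deepspeech.py as the authoritative template.
```python
# arch_custom.py -- skeleton only; the signatures below are assumed,
# check arch_deepspeech.py for the real interface.
import mxnet as mx

def prepare_data(args):
    # Assumed role: compute anything the network needs before building the
    # symbol (e.g. input feature dimensions) from the parsed config in `args`.
    pass

def arch(args):
    # Assumed role: return the mx.sym graph described by the [arch] section.
    data = mx.sym.Variable('data')
    net = mx.sym.FullyConnected(data=data, num_hidden=128, name='fc1')
    net = mx.sym.Activation(data=net, act_type='relu', name='relu1')
    return net
```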

***
## **Furthermore**
You can prepare the full LibriSpeech dataset by following the instructions at https://github.com/baidu-research/ba-dls-deepspeech
**Replace Baidu's flac_to_wav.sh script with the flac_to_wav.sh in this repository to avoid a bug.**
```bash
git clone https://github.com/baidu-research/ba-dls-deepspeech
cd ba-dls-deepspeech
./download.sh
# use the fixed flac_to_wav.sh shipped with this example instead of Baidu's
cp -f /path/to/example/flac_to_wav.sh ./
./flac_to_wav.sh
# generate the descriptor JSON files referenced by the .cfg files
python create_desc_json.py /path/to/ba-dls-deepspeech/LibriSpeech/train-clean-100 train_corpus.json
python create_desc_json.py /path/to/ba-dls-deepspeech/LibriSpeech/dev-clean validation_corpus.json
python create_desc_json.py /path/to/ba-dls-deepspeech/LibriSpeech/test-clean test_corpus.json
```
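Each create_desc_json.py call writes a descriptor file that the .cfg files point at (train_json, test_json, val_json). A quick sanity check such as the sketch below can catch conversion problems early; it assumes the ba-dls-deepspeech format of one JSON object per line with "key" (wav path), "duration" (seconds) and "text" fields, so verify against your own output.
```python
# Hypothetical sanity check for a generated descriptor file (format assumed).
import json

def summarize(corpus_json):
    durations = []
    with open(corpus_json) as f:
        for line in f:
            entry = json.loads(line)
            durations.append(entry["duration"])
    print("%s: %d utterances, %.1f hours of audio"
          % (corpus_json, len(durations), sum(durations) / 3600.0))

summarize("train_corpus.json")
```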
284 changes: 162 additions & 122 deletions example/speech_recognition/arch_deepspeech.py

Large diffs are not rendered by default.
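Since the arch_deepspeech.py diff is not rendered here, the sketch below only illustrates the general idea behind the batch normalization mentioned in the commit title: inserting mx.sym.BatchNorm between a layer and its activation. It is an assumption for illustration, not the commit's actual code.
```python
import mxnet as mx

def fc_bn_relu(data, num_hidden, name):
    # Fully connected layer -> batch norm -> ReLU; a simplified, assumed
    # stand-in for how batchnorm can be wired into the network.
    fc = mx.sym.FullyConnected(data=data, num_hidden=num_hidden, name=name + '_fc')
    bn = mx.sym.BatchNorm(data=fc, fix_gamma=False, name=name + '_bn')
    return mx.sym.Activation(data=bn, act_type='relu', name=name + '_relu')
```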

48 changes: 35 additions & 13 deletions example/speech_recognition/deepspeech.cfg
@@ -3,23 +3,27 @@
mode = train
#ex: gpu0,gpu1,gpu2,gpu3
context = gpu0,gpu1,gpu2
#context = gpu0
# checkpoint prefix; checkpoints will be saved under the checkpoints folder with this prefix
prefix = deep
prefix = deep_bucket
# when mode is load or predict, the model is loaded from the file named by model_file under checkpoints
model_file = deepspeechn_epoch1n_batch-0009
model_file = deep_bucketn_epoch0n_batch-0018
batch_size = 12
#batch_size=4
# log will be saved to the file named by log_filename
log_filename = deep.log
log_filename = deep_bucket.log
# set n to save a checkpoint every n epochs
save_checkpoint_every_n_epoch = 1
save_checkpoint_every_n_batch = 1000
save_checkpoint_every_n_batch = 3000
is_bi_graphemes = True
tensorboard_log_dir = tblog/deep
tensorboard_log_dir = tblog/deep_bucket
# if random_seed is -1, the seed is taken from the current timestamp
mx_random_seed = -1
random_seed = -1
kvstore_option = device

[data]
max_duration = 16.0
train_json = ./train_corpus_all.json
test_json = ./test_corpus.json
val_json = ./test_corpus.json
@@ -50,31 +54,49 @@ rnn_type = bigru
#vanilla_lstm or fc_lstm (no effect when network_type is gru, bigru)
lstm_type = fc_lstm
is_batchnorm = True
is_bucketing = True
buckets = [200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600]

[train]
num_epoch = 70
learning_rate = 0.0003
# constant learning rate annealing by factor
learning_rate_annealing = 1.1
# supports only sgd and adam
optimizer = sgd
# for sgd
momentum = 0.9
# set to 0 to disable gradient clipping
clip_gradient = 0
initializer = Xavier
init_scale = 2
factor_type = in
weight_decay = 0.
# show progress every nth batch
show_every = 100
save_optimizer_states = True
normalize_target_k = 13000
normalize_target_k = 100000
# overwrite meta files (feats_mean, feats_std, unicode_en_baidu_bi_graphemes.csv)
overwrite_meta_files = True
overwrite_bi_graphemes_dictionary = False
# save features extracted from sound files as csv files; this can take a lot of disk space
save_feature_as_csvfile = False
enable_logging_train_metric = True
enable_logging_validation_metric = True

[load]
load_optimizer_states = True
is_start_from_batch = True

[optimizer]
optimizer = sgd
# define parameters for optimizer
# optimizer_params_dictionary must use double quotes ("), not single quotes ('), as the string wrapper
# sgd/nag
optimizer_params_dictionary={"momentum":0.9}
# dcasgd
# optimizer_params_dictionary={"momentum":0.9, "lamda":1.0}
# adam
# optimizer_params_dictionary={"beta1":0.9,"beta2":0.999}
# adagrad
# optimizer_params_dictionary={"eps":1e-08}
# rmsprop
# optimizer_params_dictionary={"gamma1":0.9, "gamma2":0.9,"epsilon":1e-08}
# adadelta
# optimizer_params_dictionary={"rho":0.95, "epsilon":1e-08}
# set to 0 to disable gradient clipping
clip_gradient = 100
weight_decay = 0.
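The is_bucketing/buckets settings above group utterances by length so that padding (and therefore wasted computation) stays small. The sketch below is not the example's actual code; it only shows the usual assignment rule: each utterance goes into the smallest bucket that can hold it, and anything longer than the largest bucket is dropped.
```python
# Minimal illustration of bucket assignment (assumed rule, not the example's code).
import bisect

buckets = [200, 300, 400, 500, 600, 700, 800, 900, 1000,
           1100, 1200, 1300, 1400, 1500, 1600]

def assign_bucket(num_frames, buckets):
    """Return the smallest bucket >= num_frames, or None if nothing fits."""
    idx = bisect.bisect_left(buckets, num_frames)
    return buckets[idx] if idx < len(buckets) else None

print(assign_bucket(350, buckets))   # -> 400
print(assign_bucket(1700, buckets))  # -> None (utterance too long for any bucket)
```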
50 changes: 34 additions & 16 deletions example/speech_recognition/default.cfg
@@ -6,20 +6,22 @@ context = gpu0
# checkpoint prefix; checkpoints will be saved under the checkpoints folder with this prefix
prefix = test_fc
# when mode is load or predict, the model is loaded from the file named by model_file under checkpoints
model_file = test_fc-0001
model_file = test_fc-0040
batch_size = 2
# log will be saved to the file named by log_filename
log_filename = test.log
# set n to save a checkpoint every n epochs
save_checkpoint_every_n_epoch = 1
save_checkpoint_every_n_epoch = 20
save_checkpoint_every_n_batch = 1000
is_bi_graphemes = False
tensorboard_log_dir = tblog/libri_sample
# if random_seed is -1, the seed is taken from the current timestamp
mx_random_seed = -1
random_seed = -1
mx_random_seed = 1234
random_seed = 1234
kvstore_option = device

[data]
max_duration = 16.0
train_json = ./Libri_sample.json
test_json = ./Libri_sample.json
val_json = ./Libri_sample.json
@@ -37,8 +39,8 @@ conv_layer1_stride = [2, 2]
conv_layer2_filter_dim = [11, 21]
conv_layer2_stride = [1, 2]

num_rnn_layer = 3
num_hidden_rnn_list = [1760, 1760, 1760]
num_rnn_layer = 1
num_hidden_rnn_list = [1760]
num_hidden_proj = 0

num_rear_fc_layers = 0
@@ -50,33 +52,49 @@ rnn_type = bigru
#vanilla_lstm or fc_lstm (no effect when network_type is gru, bigru)
lstm_type = fc_lstm
is_batchnorm = True
is_bucketing = False
buckets = []

[train]
num_epoch = 70

num_epoch = 50
learning_rate = 0.005
# constant learning rate annealing by factor
learning_rate_annealing = 1.1
# supports only sgd and adam
optimizer = adam
# for sgd
momentum = 0.9
# set to 0 to disable gradient clipping
clip_gradient = 0

initializer = Xavier
init_scale = 2
factor_type = in
weight_decay = 0.00001
# show progress every nth batch
show_every = 1
save_optimizer_states = True
normalize_target_k = 2
# overwrite meta files (feats_mean, feats_std, unicode_en_baidu_bi_graphemes.csv)
overwrite_meta_files = True
overwrite_bi_graphemes_dictionary = False
# save features extracted from sound files as csv files; this can take a lot of disk space
save_feature_as_csvfile = False
enable_logging_train_metric = True
enable_logging_validation_metric = True

[load]
load_optimizer_states = True
is_start_from_batch = False

[optimizer]
optimizer = adam
# define parameters for optimizer
# optimizer_params_dictionary must use double quotes ("), not single quotes ('), as the string wrapper
# sgd/nag
# optimizer_params_dictionary={"momentum":0.9}
# dcasgd
# optimizer_params_dictionary={"momentum":0.9, "lamda":1.0}
# adam
optimizer_params_dictionary={"beta1":0.9,"beta2":0.999}
# adagrad
# optimizer_params_dictionary={"eps":1e-08}
# rmsprop
# optimizer_params_dictionary={"gamma1":0.9, "gamma2":0.9,"epsilon":1e-08}
# adadelta
# optimizer_params_dictionary={"rho":0.95, "epsilon":1e-08}
# set to 0 to disable gradient clipping
clip_gradient = 0
weight_decay = 0.
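The requirement to wrap strings in double quotes in optimizer_params_dictionary suggests the value is parsed as JSON. The sketch below shows why single quotes would fail under that assumption; the actual parsing code in the example may differ.
```python
# Why double quotes matter if the dictionary is parsed as JSON (assumption).
import json

print(json.loads('{"momentum":0.9}'))      # ok: {'momentum': 0.9}

try:
    json.loads("{'momentum':0.9}")          # single quotes are not valid JSON
except ValueError as err:
    print("rejected:", err)
```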
