diff --git a/example/speech_recognition/README.md b/example/speech_recognition/README.md index 69961b1bdc5c..00d166602403 100644 --- a/example/speech_recognition/README.md +++ b/example/speech_recognition/README.md @@ -123,3 +123,18 @@ The new file should implement two functions, prepare_data() and arch(), for buil Run the following line after preparing the files.
python main.py --configfile custom.cfg --archfile arch_custom
+
+***
+## **Furthermore**
+You can prepare the full LibriSpeech dataset by following the instructions at https://github.com/baidu-research/ba-dls-deepspeech
+**Replace Baidu's flac_to_wav.sh script with the flac_to_wav.sh in this repository to avoid a bug**
+```bash
+git clone https://github.com/baidu-research/ba-dls-deepspeech
+cd ba-dls-deepspeech
+./download.sh
+cp -f /path/to/example/flac_to_wav.sh ./
+./flac_to_wav.sh
+python create_desc_json.py /path/to/ba-dls-deepspeech/LibriSpeech/train-clean-100 train_corpus.json
+python create_desc_json.py /path/to/ba-dls-deepspeech/LibriSpeech/dev-clean validation_corpus.json
+python create_desc_json.py /path/to/ba-dls-deepspeech/LibriSpeech/test-clean test_corpus.json
+```
diff --git a/example/speech_recognition/arch_deepspeech.py b/example/speech_recognition/arch_deepspeech.py
index 92f1002a2f01..4288b246f3e5 100644
--- a/example/speech_recognition/arch_deepspeech.py
+++ b/example/speech_recognition/arch_deepspeech.py
@@ -1,6 +1,12 @@
+# pylint: disable=C0111, too-many-statements, too-many-locals
+# pylint: disable=too-many-arguments,too-many-instance-attributes,too-many-locals,redefined-outer-name,fixme
+# pylint: disable=superfluous-parens, no-member, invalid-name
+"""
+architecture file for deep speech 2 model
+"""
import json
import math
-
+import argparse
import mxnet as mx
from stt_layer_batchnorm import batchnorm
@@ -13,6 +19,9 @@
def prepare_data(args):
+ """
+    set actual shape of data
+ """
rnn_type = args.config.get("arch", "rnn_type")
num_rnn_layer = args.config.getint("arch", "num_rnn_layer")
num_hidden_rnn_list = json.loads(args.config.get("arch", "num_hidden_rnn_list"))
@@ -20,26 +29,29 @@ def prepare_data(args):
batch_size = args.config.getint("common", "batch_size")
if rnn_type == 'lstm':
- init_c = [('l%d_init_c' % l, (batch_size, num_hidden_rnn_list[l])) for l in range(num_rnn_layer)]
- init_h = [('l%d_init_h' % l, (batch_size, num_hidden_rnn_list[l])) for l in range(num_rnn_layer)]
+ init_c = [('l%d_init_c' % l, (batch_size, num_hidden_rnn_list[l]))
+ for l in range(num_rnn_layer)]
+ init_h = [('l%d_init_h' % l, (batch_size, num_hidden_rnn_list[l]))
+ for l in range(num_rnn_layer)]
elif rnn_type == 'bilstm':
- forward_init_c = [('forward_l%d_init_c' % l, (batch_size, num_hidden_rnn_list[l])) for l in
- range(num_rnn_layer)]
- backward_init_c = [('backward_l%d_init_c' % l, (batch_size, num_hidden_rnn_list[l])) for l in
- range(num_rnn_layer)]
+ forward_init_c = [('forward_l%d_init_c' % l, (batch_size, num_hidden_rnn_list[l]))
+ for l in range(num_rnn_layer)]
+ backward_init_c = [('backward_l%d_init_c' % l, (batch_size, num_hidden_rnn_list[l]))
+ for l in range(num_rnn_layer)]
init_c = forward_init_c + backward_init_c
- forward_init_h = [('forward_l%d_init_h' % l, (batch_size, num_hidden_rnn_list[l])) for l in
- range(num_rnn_layer)]
- backward_init_h = [('backward_l%d_init_h' % l, (batch_size, num_hidden_rnn_list[l])) for l in
- range(num_rnn_layer)]
+ forward_init_h = [('forward_l%d_init_h' % l, (batch_size, num_hidden_rnn_list[l]))
+ for l in range(num_rnn_layer)]
+ backward_init_h = [('backward_l%d_init_h' % l, (batch_size, num_hidden_rnn_list[l]))
+ for l in range(num_rnn_layer)]
init_h = forward_init_h + backward_init_h
elif rnn_type == 'gru':
- init_h = [('l%d_init_h' % l, (batch_size, num_hidden_rnn_list[l])) for l in range(num_rnn_layer)]
+ init_h = [('l%d_init_h' % l, (batch_size, num_hidden_rnn_list[l]))
+ for l in range(num_rnn_layer)]
elif rnn_type == 'bigru':
- forward_init_h = [('forward_l%d_init_h' % l, (batch_size, num_hidden_rnn_list[l])) for l in
- range(num_rnn_layer)]
- backward_init_h = [('backward_l%d_init_h' % l, (batch_size, num_hidden_rnn_list[l])) for l in
- range(num_rnn_layer)]
+ forward_init_h = [('forward_l%d_init_h' % l, (batch_size, num_hidden_rnn_list[l]))
+ for l in range(num_rnn_layer)]
+ backward_init_h = [('backward_l%d_init_h' % l, (batch_size, num_hidden_rnn_list[l]))
+ for l in range(num_rnn_layer)]
init_h = forward_init_h + backward_init_h
else:
raise Exception('network type should be one of the lstm,bilstm,gru,bigru')
@@ -51,115 +63,143 @@ def prepare_data(args):
return init_states
-def arch(args):
- mode = args.config.get("common", "mode")
- if mode == "train":
- channel_num = args.config.getint("arch", "channel_num")
- conv_layer1_filter_dim = tuple(json.loads(args.config.get("arch", "conv_layer1_filter_dim")))
- conv_layer1_stride = tuple(json.loads(args.config.get("arch", "conv_layer1_stride")))
- conv_layer2_filter_dim = tuple(json.loads(args.config.get("arch", "conv_layer2_filter_dim")))
- conv_layer2_stride = tuple(json.loads(args.config.get("arch", "conv_layer2_stride")))
-
- rnn_type = args.config.get("arch", "rnn_type")
- num_rnn_layer = args.config.getint("arch", "num_rnn_layer")
- num_hidden_rnn_list = json.loads(args.config.get("arch", "num_hidden_rnn_list"))
-
- is_batchnorm = args.config.getboolean("arch", "is_batchnorm")
-
- seq_len = args.config.getint('arch', 'max_t_count')
- num_label = args.config.getint('arch', 'max_label_length')
-
- num_rear_fc_layers = args.config.getint("arch", "num_rear_fc_layers")
- num_hidden_rear_fc_list = json.loads(args.config.get("arch", "num_hidden_rear_fc_list"))
- act_type_rear_fc_list = json.loads(args.config.get("arch", "act_type_rear_fc_list"))
- # model symbol generation
- # input preparation
- data = mx.sym.Variable('data')
- label = mx.sym.Variable('label')
-
- net = mx.sym.Reshape(data=data, shape=(-4, -1, 1, 0, 0))
- net = conv(net=net,
- channels=channel_num,
- filter_dimension=conv_layer1_filter_dim,
- stride=conv_layer1_stride,
- no_bias=is_batchnorm
- )
- if is_batchnorm:
- # batch norm normalizes axis 1
- net = batchnorm(net)
-
- net = conv(net=net,
- channels=channel_num,
- filter_dimension=conv_layer2_filter_dim,
- stride=conv_layer2_stride,
- no_bias=is_batchnorm
- )
- if is_batchnorm:
- # batch norm normalizes axis 1
- net = batchnorm(net)
- net = mx.sym.transpose(data=net, axes=(0, 2, 1, 3))
- net = mx.sym.Reshape(data=net, shape=(0, 0, -3))
- seq_len_after_conv_layer1 = int(
- math.floor((seq_len - conv_layer1_filter_dim[0]) / conv_layer1_stride[0])) + 1
- seq_len_after_conv_layer2 = int(
- math.floor((seq_len_after_conv_layer1 - conv_layer2_filter_dim[0]) / conv_layer2_stride[0])) + 1
- net = slice_symbol_to_seq_symobls(net=net, seq_len=seq_len_after_conv_layer2, axis=1)
- if rnn_type == "bilstm":
- net = bi_lstm_unroll(net=net,
+def arch(args, seq_len=None):
+ """
+ define deep speech 2 network
+ """
+ if isinstance(args, argparse.Namespace):
+ mode = args.config.get("common", "mode")
+ is_bucketing = args.config.getboolean("arch", "is_bucketing")
+ if mode == "train" or is_bucketing:
+ channel_num = args.config.getint("arch", "channel_num")
+ conv_layer1_filter_dim = \
+ tuple(json.loads(args.config.get("arch", "conv_layer1_filter_dim")))
+ conv_layer1_stride = tuple(json.loads(args.config.get("arch", "conv_layer1_stride")))
+ conv_layer2_filter_dim = \
+ tuple(json.loads(args.config.get("arch", "conv_layer2_filter_dim")))
+ conv_layer2_stride = tuple(json.loads(args.config.get("arch", "conv_layer2_stride")))
+
+ rnn_type = args.config.get("arch", "rnn_type")
+ num_rnn_layer = args.config.getint("arch", "num_rnn_layer")
+ num_hidden_rnn_list = json.loads(args.config.get("arch", "num_hidden_rnn_list"))
+
+ is_batchnorm = args.config.getboolean("arch", "is_batchnorm")
+
+ if seq_len is None:
+ seq_len = args.config.getint('arch', 'max_t_count')
+
+ num_label = args.config.getint('arch', 'max_label_length')
+
+ num_rear_fc_layers = args.config.getint("arch", "num_rear_fc_layers")
+ num_hidden_rear_fc_list = json.loads(args.config.get("arch", "num_hidden_rear_fc_list"))
+ act_type_rear_fc_list = json.loads(args.config.get("arch", "act_type_rear_fc_list"))
+ # model symbol generation
+ # input preparation
+ data = mx.sym.Variable('data')
+ label = mx.sym.Variable('label')
+
+ net = mx.sym.Reshape(data=data, shape=(-4, -1, 1, 0, 0))
+ net = conv(net=net,
+ channels=channel_num,
+ filter_dimension=conv_layer1_filter_dim,
+ stride=conv_layer1_stride,
+ no_bias=is_batchnorm,
+ name='conv1')
+ if is_batchnorm:
+ # batch norm normalizes axis 1
+ net = batchnorm(net, name="conv1_batchnorm")
+
+ net = conv(net=net,
+ channels=channel_num,
+ filter_dimension=conv_layer2_filter_dim,
+ stride=conv_layer2_stride,
+ no_bias=is_batchnorm,
+ name='conv2')
+ if is_batchnorm:
+ # batch norm normalizes axis 1
+ net = batchnorm(net, name="conv2_batchnorm")
+
+ net = mx.sym.transpose(data=net, axes=(0, 2, 1, 3))
+ net = mx.sym.Reshape(data=net, shape=(0, 0, -3))
+ seq_len_after_conv_layer1 = int(
+ math.floor((seq_len - conv_layer1_filter_dim[0]) / conv_layer1_stride[0])) + 1
+ seq_len_after_conv_layer2 = int(
+ math.floor((seq_len_after_conv_layer1 - conv_layer2_filter_dim[0])
+ / conv_layer2_stride[0])) + 1
+ net = slice_symbol_to_seq_symobls(net=net, seq_len=seq_len_after_conv_layer2, axis=1)
+ if rnn_type == "bilstm":
+ net = bi_lstm_unroll(net=net,
+ seq_len=seq_len_after_conv_layer2,
+ num_hidden_lstm_list=num_hidden_rnn_list,
+ num_lstm_layer=num_rnn_layer,
+ dropout=0.,
+ is_batchnorm=is_batchnorm,
+ is_bucketing=is_bucketing)
+ elif rnn_type == "gru":
+ net = gru_unroll(net=net,
seq_len=seq_len_after_conv_layer2,
- num_hidden_lstm_list=num_hidden_rnn_list,
- num_lstm_layer=num_rnn_layer,
+ num_hidden_gru_list=num_hidden_rnn_list,
+ num_gru_layer=num_rnn_layer,
dropout=0.,
- is_batchnorm=is_batchnorm)
- elif rnn_type == "gru":
- net = gru_unroll(net=net,
- seq_len=seq_len_after_conv_layer2,
- num_hidden_gru_list=num_hidden_rnn_list,
- num_gru_layer=num_rnn_layer,
- dropout=0.,
- is_batchnorm=is_batchnorm)
- elif rnn_type == "bigru":
- net = bi_gru_unroll(net=net,
+ is_batchnorm=is_batchnorm,
+ is_bucketing=is_bucketing)
+ elif rnn_type == "bigru":
+ net = bi_gru_unroll(net=net,
+ seq_len=seq_len_after_conv_layer2,
+ num_hidden_gru_list=num_hidden_rnn_list,
+ num_gru_layer=num_rnn_layer,
+ dropout=0.,
+ is_batchnorm=is_batchnorm,
+ is_bucketing=is_bucketing)
+ else:
+ raise Exception('rnn_type should be one of the followings, bilstm,gru,bigru')
+
+ # rear fc layers
+ net = sequence_fc(net=net, seq_len=seq_len_after_conv_layer2,
+ num_layer=num_rear_fc_layers, prefix="rear",
+ num_hidden_list=num_hidden_rear_fc_list,
+ act_type_list=act_type_rear_fc_list,
+ is_batchnorm=is_batchnorm)
+ # warpctc layer
+ net = warpctc_layer(net=net,
seq_len=seq_len_after_conv_layer2,
- num_hidden_gru_list=num_hidden_rnn_list,
- num_gru_layer=num_rnn_layer,
- dropout=0.,
- is_batchnorm=is_batchnorm)
+ label=label,
+ num_label=num_label,
+ character_classes_count=
+ (args.config.getint('arch', 'n_classes') + 1))
+ args.config.set('arch', 'max_t_count', str(seq_len_after_conv_layer2))
+ return net
+ elif mode == 'load' or mode == 'predict':
+ conv_layer1_filter_dim = \
+ tuple(json.loads(args.config.get("arch", "conv_layer1_filter_dim")))
+ conv_layer1_stride = tuple(json.loads(args.config.get("arch", "conv_layer1_stride")))
+ conv_layer2_filter_dim = \
+ tuple(json.loads(args.config.get("arch", "conv_layer2_filter_dim")))
+ conv_layer2_stride = tuple(json.loads(args.config.get("arch", "conv_layer2_stride")))
+ if seq_len is None:
+ seq_len = args.config.getint('arch', 'max_t_count')
+ seq_len_after_conv_layer1 = int(
+ math.floor((seq_len - conv_layer1_filter_dim[0]) / conv_layer1_stride[0])) + 1
+ seq_len_after_conv_layer2 = int(
+ math.floor((seq_len_after_conv_layer1 - conv_layer2_filter_dim[0])
+ / conv_layer2_stride[0])) + 1
+
+ args.config.set('arch', 'max_t_count', str(seq_len_after_conv_layer2))
else:
- raise Exception('rnn_type should be one of the followings, bilstm,gru,bigru')
-
- # rear fc layers
- net = sequence_fc(net=net, seq_len=seq_len_after_conv_layer2, num_layer=num_rear_fc_layers, prefix="rear",
- num_hidden_list=num_hidden_rear_fc_list, act_type_list=act_type_rear_fc_list,
- is_batchnorm=is_batchnorm)
- if is_batchnorm:
- hidden_all = []
- # batch norm normalizes axis 1
- for seq_index in range(seq_len_after_conv_layer2):
- hidden = net[seq_index]
- hidden = batchnorm(hidden)
- hidden_all.append(hidden)
- net = hidden_all
-
- # warpctc layer
- net = warpctc_layer(net=net,
- seq_len=seq_len_after_conv_layer2,
- label=label,
- num_label=num_label,
- character_classes_count=(args.config.getint('arch', 'n_classes') + 1)
- )
- args.config.set('arch', 'max_t_count', str(seq_len_after_conv_layer2))
- return net
- else:
- conv_layer1_filter_dim = tuple(json.loads(args.config.get("arch", "conv_layer1_filter_dim")))
- conv_layer1_stride = tuple(json.loads(args.config.get("arch", "conv_layer1_stride")))
- conv_layer2_filter_dim = tuple(json.loads(args.config.get("arch", "conv_layer2_filter_dim")))
- conv_layer2_stride = tuple(json.loads(args.config.get("arch", "conv_layer2_stride")))
- seq_len = args.config.getint('arch', 'max_t_count')
- seq_len_after_conv_layer1 = int(
- math.floor((seq_len - conv_layer1_filter_dim[0]) / conv_layer1_stride[0])) + 1
- seq_len_after_conv_layer2 = int(
- math.floor((seq_len_after_conv_layer1 - conv_layer2_filter_dim[0]) / conv_layer2_stride[0])) + 1
- args.config.set('arch', 'max_t_count', str(seq_len_after_conv_layer2))
+ raise Exception('mode must be the one of the followings - train,predict,load')
+
+
+class BucketingArch(object):
+ def __init__(self, args):
+ self.args = args
+ def sym_gen(self, seq_len):
+ args = self.args
+ net = arch(args, seq_len)
+ init_states = prepare_data(args)
+ init_state_names = [x[0] for x in init_states]
+ init_state_names.insert(0, 'data')
+ return net, init_state_names, ('label',)
+ def get_sym_gen(self):
+ return self.sym_gen
diff --git a/example/speech_recognition/deepspeech.cfg b/example/speech_recognition/deepspeech.cfg
index 13cf578c679a..4f0f49699771 100644
--- a/example/speech_recognition/deepspeech.cfg
+++ b/example/speech_recognition/deepspeech.cfg
@@ -3,23 +3,27 @@
mode = train
#ex: gpu0,gpu1,gpu2,gpu3
context = gpu0,gpu1,gpu2
+#context = gpu0
# checkpoint prefix, check point will be saved under checkpoints folder with prefix
-prefix = deep
+prefix = deep_bucket
# when mode is load or predict, model will be loaded from the file name with model_file under checkpoints
-model_file = deepspeechn_epoch1n_batch-0009
+model_file = deep_bucketn_epoch0n_batch-0018
batch_size = 12
+#batch_size=4
# log will be saved by the log_filename
-log_filename = deep.log
+log_filename = deep_bucket.log
# checkpoint set n to save checkpoints after n epoch
save_checkpoint_every_n_epoch = 1
-save_checkpoint_every_n_batch = 1000
+save_checkpoint_every_n_batch = 3000
is_bi_graphemes = True
-tensorboard_log_dir = tblog/deep
+tensorboard_log_dir = tblog/deep_bucket
# if random_seed is -1 then it gets random seed from timestamp
mx_random_seed = -1
random_seed = -1
+kvstore_option = device
[data]
+max_duration = 16.0
train_json = ./train_corpus_all.json
test_json = ./test_corpus.json
val_json = ./test_corpus.json
@@ -50,31 +54,49 @@ rnn_type = bigru
#vanilla_lstm or fc_lstm (no effect when network_type is gru, bigru)
lstm_type = fc_lstm
is_batchnorm = True
+is_bucketing = True
+buckets = [200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600]
[train]
num_epoch = 70
learning_rate = 0.0003
# constant learning rate annealing by factor
learning_rate_annealing = 1.1
-# supports only sgd and adam
-optimizer = sgd
-# for sgd
-momentum = 0.9
-# set to 0 to disable gradient clipping
-clip_gradient = 0
initializer = Xavier
init_scale = 2
factor_type = in
-weight_decay = 0.
# show progress every how nth batches
show_every = 100
save_optimizer_states = True
-normalize_target_k = 13000
+normalize_target_k = 100000
# overwrite meta files(feats_mean,feats_std,unicode_en_baidu_bi_graphemes.csv)
overwrite_meta_files = True
+overwrite_bi_graphemes_dictionary = False
+# save features extracted from sound files as csv files; this can take a lot of disk space
+save_feature_as_csvfile = False
enable_logging_train_metric = True
enable_logging_validation_metric = True
[load]
load_optimizer_states = True
is_start_from_batch = True
+
+[optimizer]
+optimizer = sgd
+# define parameters for optimizer
+# optimizer_params_dictionary should use " not ' as string wrapper
+# sgd/nag
+optimizer_params_dictionary={"momentum":0.9}
+# dcasgd
+# optimizer_params_dictionary={"momentum":0.9, "lamda":1.0}
+# adam
+# optimizer_params_dictionary={"beta1":0.9,"beta2":0.999}
+# adagrad
+# optimizer_params_dictionary={"eps":1e-08}
+# rmsprop
+# optimizer_params_dictionary={"gamma1":0.9, "gamma2":0.9,"epsilon":1e-08}
+# adadelta
+# optimizer_params_dictionary={"rho":0.95, "epsilon":1e-08}
+# set to 0 to disable gradient clipping
+clip_gradient = 100
+weight_decay = 0.
diff --git a/example/speech_recognition/default.cfg b/example/speech_recognition/default.cfg
index 853a04aebbdd..127c492b6166 100644
--- a/example/speech_recognition/default.cfg
+++ b/example/speech_recognition/default.cfg
@@ -6,20 +6,22 @@ context = gpu0
# checkpoint prefix, check point will be saved under checkpoints folder with prefix
prefix = test_fc
# when mode is load or predict, model will be loaded from the file name with model_file under checkpoints
-model_file = test_fc-0001
+model_file = test_fc-0040
batch_size = 2
# log will be saved by the log_filename
log_filename = test.log
# checkpoint set n to save checkpoints after n epoch
-save_checkpoint_every_n_epoch = 1
+save_checkpoint_every_n_epoch = 20
save_checkpoint_every_n_batch = 1000
is_bi_graphemes = False
tensorboard_log_dir = tblog/libri_sample
# if random_seed is -1 then it gets random seed from timestamp
-mx_random_seed = -1
-random_seed = -1
+mx_random_seed = 1234
+random_seed = 1234
+kvstore_option = device
[data]
+max_duration = 16.0
train_json = ./Libri_sample.json
test_json = ./Libri_sample.json
val_json = ./Libri_sample.json
@@ -37,8 +39,8 @@ conv_layer1_stride = [2, 2]
conv_layer2_filter_dim = [11, 21]
conv_layer2_stride = [1, 2]
-num_rnn_layer = 3
-num_hidden_rnn_list = [1760, 1760, 1760]
+num_rnn_layer = 1
+num_hidden_rnn_list = [1760]
num_hidden_proj = 0
num_rear_fc_layers = 0
@@ -50,33 +52,49 @@ rnn_type = bigru
#vanilla_lstm or fc_lstm (no effect when network_type is gru, bigru)
lstm_type = fc_lstm
is_batchnorm = True
+is_bucketing = False
+buckets = []
[train]
-num_epoch = 70
-
+num_epoch = 50
learning_rate = 0.005
# constant learning rate annealing by factor
learning_rate_annealing = 1.1
-# supports only sgd and adam
-optimizer = adam
-# for sgd
-momentum = 0.9
-# set to 0 to disable gradient clipping
-clip_gradient = 0
-
initializer = Xavier
init_scale = 2
factor_type = in
-weight_decay = 0.00001
# show progress every nth batches
show_every = 1
save_optimizer_states = True
normalize_target_k = 2
# overwrite meta files(feats_mean,feats_std,unicode_en_baidu_bi_graphemes.csv)
overwrite_meta_files = True
+overwrite_bi_graphemes_dictionary = False
+# save features extracted from sound files as csv files; this can take a lot of disk space
+save_feature_as_csvfile = False
enable_logging_train_metric = True
enable_logging_validation_metric = True
[load]
load_optimizer_states = True
is_start_from_batch = False
+
+[optimizer]
+optimizer = adam
+# define parameters for optimizer
+# optimizer_params_dictionary should use " not ' as string wrapper
+# sgd/nag
+# optimizer_params_dictionary={"momentum":0.9}
+# dcasgd
+# optimizer_params_dictionary={"momentum":0.9, "lamda":1.0}
+# adam
+optimizer_params_dictionary={"beta1":0.9,"beta2":0.999}
+# adagrad
+# optimizer_params_dictionary={"eps":1e-08}
+# rmsprop
+# optimizer_params_dictionary={"gamma1":0.9, "gamma2":0.9,"epsilon":1e-08}
+# adadelta
+# optimizer_params_dictionary={"rho":0.95, "epsilon":1e-08}
+# set to 0 to disable gradient clipping
+clip_gradient = 0
+weight_decay = 0.
diff --git a/example/speech_recognition/main.py b/example/speech_recognition/main.py
index 398a8a537e01..a425e0a8ab40 100644
--- a/example/speech_recognition/main.py
+++ b/example/speech_recognition/main.py
@@ -1,34 +1,32 @@
+import json
+import os
import sys
-
-sys.path.insert(0, "../../python")
+from collections import namedtuple
+from datetime import datetime
from config_util import parse_args, parse_contexts, generate_file_path
from train import do_training
import mxnet as mx
from stt_io_iter import STTIter
from label_util import LabelUtil
from log_util import LogUtil
-
import numpy as np
from stt_datagenerator import DataGenerator
from stt_metric import STTMetric
-from datetime import datetime
from stt_bi_graphemes_util import generate_bi_graphemes_dictionary
-########################################
-########## FOR JUPYTER NOTEBOOK
-import os
+from stt_bucketing_module import STTBucketingModule
+from stt_io_bucketingiter import BucketSTTIter
+sys.path.insert(0, "../../python")
# os.environ['MXNET_ENGINE_TYPE'] = "NaiveEngine"
os.environ['MXNET_ENGINE_TYPE'] = "ThreadedEnginePerDevice"
os.environ['MXNET_ENABLE_GPU_P2P'] = "0"
-
class WHCS:
width = 0
height = 0
channel = 0
stride = 0
-
class ConfigLogger(object):
def __init__(self, log):
self.__log = log
@@ -42,9 +40,25 @@ def write(self, data):
line = data.strip()
self.__log.info(line)
+def load_labelutil(labelUtil, is_bi_graphemes, language="en"):
+ if language == "en":
+ if is_bi_graphemes:
+ try:
+ labelUtil.load_unicode_set("resources/unicodemap_en_baidu_bi_graphemes.csv")
+ except:
+ raise Exception("There is no resources/unicodemap_en_baidu_bi_graphemes.csv." +
+ " Please set overwrite_bi_graphemes_dictionary True at train section")
+ else:
+ labelUtil.load_unicode_set("resources/unicodemap_en_baidu.csv")
+ else:
+ raise Exception("Error: Language Type: %s" % language)
+
+
def load_data(args):
mode = args.config.get('common', 'mode')
+ if mode not in ['train', 'predict', 'load']:
+ raise Exception('mode must be the one of the followings - train,predict,load')
batch_size = args.config.getint('common', 'batch_size')
whcs = WHCS()
@@ -56,101 +70,78 @@ def load_data(args):
model_name = args.config.get('common', 'prefix')
is_bi_graphemes = args.config.getboolean('common', 'is_bi_graphemes')
overwrite_meta_files = args.config.getboolean('train', 'overwrite_meta_files')
+ overwrite_bi_graphemes_dictionary = args.config.getboolean('train', 'overwrite_bi_graphemes_dictionary')
+ max_duration = args.config.getfloat('data', 'max_duration')
language = args.config.get('data', 'language')
- is_bi_graphemes = args.config.getboolean('common', 'is_bi_graphemes')
+ log = LogUtil().getlogger()
labelUtil = LabelUtil.getInstance()
- if language == "en":
- if is_bi_graphemes:
- try:
- labelUtil.load_unicode_set("resources/unicodemap_en_baidu_bi_graphemes.csv")
- except:
- raise Exception("There is no resources/unicodemap_en_baidu_bi_graphemes.csv. Please set overwrite_meta_files at train section True")
- else:
- labelUtil.load_unicode_set("resources/unicodemap_en_baidu.csv")
- else:
- raise Exception("Error: Language Type: %s" % language)
- args.config.set('arch', 'n_classes', str(labelUtil.get_count()))
-
- if mode == 'predict':
- test_json = args.config.get('data', 'test_json')
- datagen = DataGenerator(save_dir=save_dir, model_name=model_name)
- datagen.load_train_data(test_json)
- datagen.get_meta_from_file(np.loadtxt(generate_file_path(save_dir, model_name, 'feats_mean')),
- np.loadtxt(generate_file_path(save_dir, model_name, 'feats_std')))
- elif mode =="train" or mode == "load":
+ if mode == "train" or mode == "load":
data_json = args.config.get('data', 'train_json')
val_json = args.config.get('data', 'val_json')
datagen = DataGenerator(save_dir=save_dir, model_name=model_name)
- datagen.load_train_data(data_json)
- #test bigramphems
-
- if overwrite_meta_files and is_bi_graphemes:
- generate_bi_graphemes_dictionary(datagen.train_texts)
-
+ datagen.load_train_data(data_json, max_duration=max_duration)
+ if is_bi_graphemes:
+ if not os.path.isfile("resources/unicodemap_en_baidu_bi_graphemes.csv") or overwrite_bi_graphemes_dictionary:
+ load_labelutil(labelUtil=labelUtil, is_bi_graphemes=False, language=language)
+ generate_bi_graphemes_dictionary(datagen.train_texts)
+ load_labelutil(labelUtil=labelUtil, is_bi_graphemes=is_bi_graphemes, language=language)
args.config.set('arch', 'n_classes', str(labelUtil.get_count()))
if mode == "train":
if overwrite_meta_files:
+ log.info("Generate mean and std from samples")
normalize_target_k = args.config.getint('train', 'normalize_target_k')
datagen.sample_normalize(normalize_target_k, True)
else:
- datagen.get_meta_from_file(np.loadtxt(generate_file_path(save_dir, model_name, 'feats_mean')),
- np.loadtxt(generate_file_path(save_dir, model_name, 'feats_std')))
- datagen.load_validation_data(val_json)
+ log.info("Read mean and std from meta files")
+ datagen.get_meta_from_file(
+ np.loadtxt(generate_file_path(save_dir, model_name, 'feats_mean')),
+ np.loadtxt(generate_file_path(save_dir, model_name, 'feats_std')))
+ datagen.load_validation_data(val_json, max_duration=max_duration)
elif mode == "load":
# get feat_mean and feat_std to normalize dataset
- datagen.get_meta_from_file(np.loadtxt(generate_file_path(save_dir, model_name, 'feats_mean')),
- np.loadtxt(generate_file_path(save_dir, model_name, 'feats_std')))
- datagen.load_validation_data(val_json)
- else:
- raise Exception(
- 'Define mode in the cfg file first. train or predict or load can be the candidate for the mode.')
+ datagen.get_meta_from_file(
+ np.loadtxt(generate_file_path(save_dir, model_name, 'feats_mean')),
+ np.loadtxt(generate_file_path(save_dir, model_name, 'feats_std')))
+ datagen.load_validation_data(val_json, max_duration=max_duration)
+ elif mode == 'predict':
+ test_json = args.config.get('data', 'test_json')
+ datagen = DataGenerator(save_dir=save_dir, model_name=model_name)
+ datagen.load_train_data(test_json, max_duration=max_duration)
+ labelutil = load_labelutil(labelUtil, is_bi_graphemes, language="en")
+ args.config.set('arch', 'n_classes', str(labelUtil.get_count()))
+ datagen.get_meta_from_file(
+ np.loadtxt(generate_file_path(save_dir, model_name, 'feats_mean')),
+ np.loadtxt(generate_file_path(save_dir, model_name, 'feats_std')))
is_batchnorm = args.config.getboolean('arch', 'is_batchnorm')
- if batch_size == 1 and is_batchnorm:
+ if batch_size == 1 and is_batchnorm and (mode == 'train' or mode == 'load'):
raise Warning('batch size 1 is too small for is_batchnorm')
# sort file paths by its duration in ascending order to implement sortaGrad
-
if mode == "train" or mode == "load":
max_t_count = datagen.get_max_seq_length(partition="train")
- max_label_length = datagen.get_max_label_length(partition="train",is_bi_graphemes=is_bi_graphemes)
+ max_label_length = \
+ datagen.get_max_label_length(partition="train", is_bi_graphemes=is_bi_graphemes)
elif mode == "predict":
max_t_count = datagen.get_max_seq_length(partition="test")
- max_label_length = datagen.get_max_label_length(partition="test",is_bi_graphemes=is_bi_graphemes)
- else:
- raise Exception(
- 'Define mode in the cfg file first. train or predict or load can be the candidate for the mode.')
+ max_label_length = \
+ datagen.get_max_label_length(partition="test", is_bi_graphemes=is_bi_graphemes)
args.config.set('arch', 'max_t_count', str(max_t_count))
args.config.set('arch', 'max_label_length', str(max_label_length))
from importlib import import_module
prepare_data_template = import_module(args.config.get('arch', 'arch_file'))
init_states = prepare_data_template.prepare_data(args)
- if mode == "train":
- sort_by_duration = True
- else:
- sort_by_duration = False
-
- data_loaded = STTIter(partition="train",
- count=datagen.count,
- datagen=datagen,
- batch_size=batch_size,
- num_label=max_label_length,
- init_states=init_states,
- seq_length=max_t_count,
- width=whcs.width,
- height=whcs.height,
- sort_by_duration=sort_by_duration,
- is_bi_graphemes=is_bi_graphemes)
-
- if mode == 'predict':
- return data_loaded, args
- else:
- validation_loaded = STTIter(partition="validation",
- count=datagen.val_count,
+ sort_by_duration = (mode == "train")
+ is_bucketing = args.config.getboolean('arch', 'is_bucketing')
+ save_feature_as_csvfile = args.config.getboolean('train', 'save_feature_as_csvfile')
+ if is_bucketing:
+ buckets = json.loads(args.config.get('arch', 'buckets'))
+ data_loaded = BucketSTTIter(partition="train",
+ count=datagen.count,
datagen=datagen,
batch_size=batch_size,
num_label=max_label_length,
@@ -158,37 +149,91 @@ def load_data(args):
seq_length=max_t_count,
width=whcs.width,
height=whcs.height,
- sort_by_duration=False,
- is_bi_graphemes=is_bi_graphemes)
+ sort_by_duration=sort_by_duration,
+ is_bi_graphemes=is_bi_graphemes,
+ buckets=buckets,
+ save_feature_as_csvfile=save_feature_as_csvfile)
+ else:
+ data_loaded = STTIter(partition="train",
+ count=datagen.count,
+ datagen=datagen,
+ batch_size=batch_size,
+ num_label=max_label_length,
+ init_states=init_states,
+ seq_length=max_t_count,
+ width=whcs.width,
+ height=whcs.height,
+ sort_by_duration=sort_by_duration,
+ is_bi_graphemes=is_bi_graphemes,
+ save_feature_as_csvfile=save_feature_as_csvfile)
+
+ if mode == 'train' or mode == 'load':
+ if is_bucketing:
+ validation_loaded = BucketSTTIter(partition="validation",
+ count=datagen.val_count,
+ datagen=datagen,
+ batch_size=batch_size,
+ num_label=max_label_length,
+ init_states=init_states,
+ seq_length=max_t_count,
+ width=whcs.width,
+ height=whcs.height,
+ sort_by_duration=False,
+ is_bi_graphemes=is_bi_graphemes,
+ buckets=buckets,
+ save_feature_as_csvfile=save_feature_as_csvfile)
+ else:
+ validation_loaded = STTIter(partition="validation",
+ count=datagen.val_count,
+ datagen=datagen,
+ batch_size=batch_size,
+ num_label=max_label_length,
+ init_states=init_states,
+ seq_length=max_t_count,
+ width=whcs.width,
+ height=whcs.height,
+ sort_by_duration=False,
+ is_bi_graphemes=is_bi_graphemes,
+ save_feature_as_csvfile=save_feature_as_csvfile)
return data_loaded, validation_loaded, args
+ elif mode == 'predict':
+ return data_loaded, args
def load_model(args, contexts, data_train):
# load model from model_name prefix and epoch of model_num_epoch with gpu contexts of contexts
mode = args.config.get('common', 'mode')
load_optimizer_states = args.config.getboolean('load', 'load_optimizer_states')
- is_start_from_batch = args.config.getboolean('load','is_start_from_batch')
+ is_start_from_batch = args.config.getboolean('load', 'is_start_from_batch')
from importlib import import_module
symbol_template = import_module(args.config.get('arch', 'arch_file'))
- model_loaded = symbol_template.arch(args)
+ is_bucketing = args.config.getboolean('arch', 'is_bucketing')
if mode == 'train':
+ if is_bucketing:
+ bucketing_arch = symbol_template.BucketingArch(args)
+ model_loaded = bucketing_arch.get_sym_gen()
+ else:
+ model_loaded = symbol_template.arch(args)
model_num_epoch = None
- else:
+ elif mode == 'load' or mode == 'predict':
model_file = args.config.get('common', 'model_file')
model_name = os.path.splitext(model_file)[0]
-
model_num_epoch = int(model_name[-4:])
+ if is_bucketing:
+ bucketing_arch = symbol_template.BucketingArch(args)
+ model_loaded = bucketing_arch.get_sym_gen()
+ else:
+ model_path = 'checkpoints/' + str(model_name[:-5])
- model_path = 'checkpoints/' + str(model_name[:-5])
-
- data_names = [x[0] for x in data_train.provide_data]
- label_names = [x[0] for x in data_train.provide_label]
+ data_names = [x[0] for x in data_train.provide_data]
+ label_names = [x[0] for x in data_train.provide_label]
- model_loaded = mx.module.Module.load(prefix=model_path, epoch=model_num_epoch, context=contexts,
- data_names=data_names, label_names=label_names,
- load_optimizer_states=load_optimizer_states)
+ model_loaded = mx.module.Module.load(
+ prefix=model_path, epoch=model_num_epoch, context=contexts,
+ data_names=data_names, label_names=label_names,
+ load_optimizer_states=load_optimizer_states)
if is_start_from_batch:
import re
model_num_epoch = int(re.findall('\d+', model_file)[0])
@@ -198,7 +243,8 @@ def load_model(args, contexts, data_train):
if __name__ == '__main__':
if len(sys.argv) <= 1:
- raise Exception('cfg file path must be provided. ex)python main.py --configfile examplecfg.cfg')
+ raise Exception('cfg file path must be provided. ' +
+ 'ex)python main.py --configfile examplecfg.cfg')
args = parse_args(sys.argv[1])
# set parameters from cfg file
# give random seed
@@ -206,9 +252,9 @@ def load_model(args, contexts, data_train):
mx_random_seed = args.config.getint('common', 'mx_random_seed')
# random seed for shuffling data list
if random_seed != -1:
- random.seed(random_seed)
+ np.random.seed(random_seed)
# set mx.random.seed to give seed for parameter initialization
- if mx_random_seed !=-1:
+ if mx_random_seed != -1:
mx.random.seed(mx_random_seed)
else:
mx.random.seed(hash(datetime.now()))
@@ -220,22 +266,23 @@ def load_model(args, contexts, data_train):
mode = args.config.get('common', 'mode')
if mode not in ['train', 'predict', 'load']:
raise Exception(
- 'Define mode in the cfg file first. train or predict or load can be the candidate for the mode.')
+ 'Define mode in the cfg file first. ' +
+ 'train or predict or load can be the candidate for the mode.')
# get meta file where character to number conversions are defined
contexts = parse_contexts(args)
num_gpu = len(contexts)
batch_size = args.config.getint('common', 'batch_size')
-
# check the number of gpus is positive divisor of the batch size for data parallel
if batch_size % num_gpu != 0:
raise Exception('num_gpu should be positive divisor of batch_size')
-
- if mode == "predict":
- data_train, args = load_data(args)
- elif mode == "train" or mode == "load":
+ if mode == "train" or mode == "load":
data_train, data_val, args = load_data(args)
+ elif mode == "predict":
+ data_train, args = load_data(args)
+ is_batchnorm = args.config.getboolean('arch', 'is_batchnorm')
+ is_bucketing = args.config.getboolean('arch', 'is_bucketing')
# log current config
config_logger = ConfigLogger(log)
@@ -243,28 +290,63 @@ def load_model(args, contexts, data_train):
# load model
model_loaded, model_num_epoch = load_model(args, contexts, data_train)
-
# if mode is 'train', it trains the model
if mode == 'train':
- data_names = [x[0] for x in data_train.provide_data]
- label_names = [x[0] for x in data_train.provide_label]
- module = mx.mod.Module(model_loaded, context=contexts, data_names=data_names, label_names=label_names)
+ if is_bucketing:
+ module = STTBucketingModule(
+ sym_gen=model_loaded,
+ default_bucket_key=data_train.default_bucket_key,
+ context=contexts
+ )
+ else:
+ data_names = [x[0] for x in data_train.provide_data]
+ label_names = [x[0] for x in data_train.provide_label]
+ module = mx.mod.Module(model_loaded, context=contexts,
+ data_names=data_names, label_names=label_names)
do_training(args=args, module=module, data_train=data_train, data_val=data_val)
# if mode is 'load', it loads model from the checkpoint and continues the training.
elif mode == 'load':
- do_training(args=args, module=model_loaded, data_train=data_train, data_val=data_val, begin_epoch=model_num_epoch+1)
+ do_training(args=args, module=model_loaded, data_train=data_train, data_val=data_val,
+ begin_epoch=model_num_epoch + 1)
# if mode is 'predict', it predict label from the input by the input model
elif mode == 'predict':
# predict through data
- model_loaded.bind(for_training=False, data_shapes=data_train.provide_data,
- label_shapes=data_train.provide_label)
+ if is_bucketing:
+ max_t_count = args.config.getint('arch', 'max_t_count')
+ load_optimizer_states = args.config.getboolean('load', 'load_optimizer_states')
+ model_file = args.config.get('common', 'model_file')
+ model_name = os.path.splitext(model_file)[0]
+ model_num_epoch = int(model_name[-4:])
+
+ model_path = 'checkpoints/' + str(model_name[:-5])
+ model = STTBucketingModule(
+ sym_gen=model_loaded,
+ default_bucket_key=data_train.default_bucket_key,
+ context=contexts
+ )
+
+ model.bind(data_shapes=data_train.provide_data,
+ label_shapes=data_train.provide_label,
+ for_training=True)
+ _, arg_params, aux_params = mx.model.load_checkpoint(model_path, model_num_epoch)
+ model.set_params(arg_params, aux_params)
+ model_loaded = model
+ else:
+ model_loaded.bind(for_training=False, data_shapes=data_train.provide_data,
+ label_shapes=data_train.provide_label)
max_t_count = args.config.getint('arch', 'max_t_count')
- eval_metric = STTMetric(batch_size=batch_size, num_gpu=num_gpu, seq_length=max_t_count)
- is_batchnorm = args.config.getboolean('arch', 'is_batchnorm')
- if is_batchnorm :
+ eval_metric = STTMetric(batch_size=batch_size, num_gpu=num_gpu)
+ if is_batchnorm:
for nbatch, data_batch in enumerate(data_train):
- # when is_train = False it leads to high cer when batch_norm
- model_loaded.forward(data_batch, is_train=True)
+ model_loaded.forward(data_batch, is_train=False)
model_loaded.update_metric(eval_metric, data_batch.label)
- else :
- model_loaded.score(eval_data=data_train, num_batch=None, eval_metric=eval_metric, reset=True)
+ else:
+ #model_loaded.score(eval_data=data_train, num_batch=None,
+ # eval_metric=eval_metric, reset=True)
+ for nbatch, data_batch in enumerate(data_train):
+ model_loaded.forward(data_batch, is_train=False)
+ model_loaded.update_metric(eval_metric, data_batch.label)
+ else:
+ raise Exception(
+ 'Define mode in the cfg file first. ' +
+ 'train or predict or load can be the candidate for the mode.')
diff --git a/example/speech_recognition/stt_datagenerator.py b/example/speech_recognition/stt_datagenerator.py
index 390de432e751..d2a7b4b5cbae 100644
--- a/example/speech_recognition/stt_datagenerator.py
+++ b/example/speech_recognition/stt_datagenerator.py
@@ -2,7 +2,6 @@
import json
import random
-
import numpy as np
from stt_utils import calc_feat_dim, spectrogram_from_file
@@ -10,6 +9,7 @@
from log_util import LogUtil
from label_util import LabelUtil
from stt_bi_graphemes_util import generate_bi_graphemes_label
+from multiprocessing import cpu_count, Process, Manager
class DataGenerator(object):
def __init__(self, save_dir, model_name, step=10, window=20, max_freq=8000, desc_file=None):
@@ -32,7 +32,7 @@ def __init__(self, save_dir, model_name, step=10, window=20, max_freq=8000, desc
# 1d 161 length of array filled with 1s
self.feats_std = np.ones((self.feat_dim,))
self.max_input_length = 0
- self.max_length_list_in_batch =[]
+ self.max_length_list_in_batch = []
# 1d 161 length of array filled with random value
#[0.0, 1.0)
self.rng = random.Random()
@@ -48,14 +48,15 @@ def get_meta_from_file(self, feats_mean, feats_std):
self.feats_mean = feats_mean
self.feats_std = feats_std
- def featurize(self, audio_clip, overwrite=False):
+ def featurize(self, audio_clip, overwrite=False, save_feature_as_csvfile=False):
""" For a given audio clip, calculate the log of its Fourier Transform
Params:
audio_clip(str): Path to the audio clip
"""
return spectrogram_from_file(
audio_clip, step=self.step, window=self.window,
- max_freq=self.max_freq, overwrite=overwrite)
+ max_freq=self.max_freq, overwrite=overwrite,
+ save_feature_as_csvfile=save_feature_as_csvfile)
def load_metadata_from_desc_file(self, desc_file, partition='train',
max_duration=16.0,):
@@ -107,11 +108,11 @@ def load_metadata_from_desc_file(self, desc_file, partition='train',
raise Exception("Invalid partition to load metadata. "
"Must be train/validation/test")
- def load_train_data(self, desc_file):
- self.load_metadata_from_desc_file(desc_file, 'train')
+ def load_train_data(self, desc_file, max_duration):
+ self.load_metadata_from_desc_file(desc_file, 'train', max_duration=max_duration)
- def load_validation_data(self, desc_file):
- self.load_metadata_from_desc_file(desc_file, 'validation')
+ def load_validation_data(self, desc_file, max_duration):
+ self.load_metadata_from_desc_file(desc_file, 'validation', max_duration=max_duration)
@staticmethod
def sort_by_duration(durations, audio_paths, texts):
@@ -146,10 +147,11 @@ def get_max_seq_length(self, partition):
"Must be train/validation/test")
max_duration_indexes = durations.index(max(durations))
max_seq_length = self.featurize(audio_paths[max_duration_indexes]).shape[0]
- self.max_seq_length=max_seq_length
+ self.max_seq_length = max_seq_length
return max_seq_length
- def prepare_minibatch(self, audio_paths, texts, overwrite=False, is_bi_graphemes=False):
+ def prepare_minibatch(self, audio_paths, texts, overwrite=False,
+ is_bi_graphemes=False, seq_length=-1, save_feature_as_csvfile=False):
""" Featurize a minibatch of audio, zero pad them and return a dictionary
Params:
audio_paths (list(str)): List of paths to audio files
@@ -162,12 +164,15 @@ def prepare_minibatch(self, audio_paths, texts, overwrite=False, is_bi_graphemes
# Features is a list of (timesteps, feature_dim) arrays
# Calculate the features for each audio clip, as the log of the
# Fourier Transform of the audio
- features = [self.featurize(a, overwrite=overwrite) for a in audio_paths]
+ features = [self.featurize(a, overwrite=overwrite, save_feature_as_csvfile=save_feature_as_csvfile) for a in audio_paths]
input_lengths = [f.shape[0] for f in features]
feature_dim = features[0].shape[1]
mb_size = len(features)
# Pad all the inputs so that they are all the same length
- x = np.zeros((mb_size, self.max_seq_length, feature_dim))
+ if seq_length == -1:
+ x = np.zeros((mb_size, self.max_seq_length, feature_dim))
+ else:
+ x = np.zeros((mb_size, seq_length, feature_dim))
y = np.zeros((mb_size, self.max_label_length))
labelUtil = LabelUtil.getInstance()
label_lengths = []
@@ -199,34 +204,59 @@ def iterate_validation(self, minibatch_size=16):
return self.iterate(self.val_audio_paths, self.val_texts,
minibatch_size)
+ def preprocess_sample_normalize(self, threadIndex, audio_paths, overwrite, return_dict):
+ if len(audio_paths) > 0:
+ audio_clip = audio_paths[0]
+ feat = self.featurize(audio_clip=audio_clip, overwrite=overwrite)
+ feat_squared = np.square(feat)
+ count = float(feat.shape[0])
+ dim = feat.shape[1]
+ if len(audio_paths) > 1:
+ for audio_path in audio_paths[1:]:
+ next_feat = self.featurize(audio_clip=audio_path, overwrite=overwrite)
+ next_feat_squared = np.square(next_feat)
+ feat_vertically_stacked = np.concatenate((feat, next_feat)).reshape(-1, dim)
+ feat = np.sum(feat_vertically_stacked, axis=0, keepdims=True)
+ feat_squared_vertically_stacked = np.concatenate(
+ (feat_squared, next_feat_squared)).reshape(-1, dim)
+ feat_squared = np.sum(feat_squared_vertically_stacked, axis=0, keepdims=True)
+ count += float(next_feat.shape[0])
+ return_dict[threadIndex] = {'feat': feat, 'feat_squared': feat_squared, 'count': count}
+
def sample_normalize(self, k_samples=1000, overwrite=False):
""" Estimate the mean and std of the features from the training set
Params:
k_samples (int): Use this number of samples for estimation
"""
+ log = LogUtil().getlogger()
+ log.info("Calculating mean and std from samples")
# if k_samples is negative then it goes through total dataset
if k_samples < 0:
- audio_paths_iter = iter(self.audio_paths)
+ audio_paths = self.audio_paths
+
# using sample
else:
k_samples = min(k_samples, len(self.train_audio_paths))
samples = self.rng.sample(self.train_audio_paths, k_samples)
- audio_paths_iter = iter(samples)
- audio_clip = audio_paths_iter.next()
- feat = self.featurize(audio_clip=audio_clip, overwrite=overwrite)
- feat_squared = np.square(feat)
- count = float(feat.shape[0])
- dim = feat.shape[1]
-
- for iter_index in range(len(samples) - 1):
- next_feat = self.featurize(audio_clip=audio_paths_iter.next(), overwrite=overwrite)
- next_feat_squared = np.square(next_feat)
- feat_vertically_stacked = np.concatenate((feat, next_feat)).reshape(-1, dim)
- feat = np.sum(feat_vertically_stacked, axis=0, keepdims=True)
- feat_squared_vertically_stacked = np.concatenate((feat_squared, next_feat_squared)).reshape(-1, dim)
- feat_squared = np.sum(feat_squared_vertically_stacked, axis=0, keepdims=True)
- count = count + float(next_feat.shape[0])
+ audio_paths = samples
+ manager = Manager()
+ return_dict = manager.dict()
+ jobs = []
+ for threadIndex in range(cpu_count()):
+ proc = Process(target=self.preprocess_sample_normalize, args=(threadIndex, audio_paths[threadIndex::cpu_count()], overwrite, return_dict))
+ jobs.append(proc)
+ proc.start()
+ for proc in jobs:
+ proc.join()
+
+ feat = np.sum(np.vstack([item['feat'] for item in return_dict.values()]), axis=0)
+ count = sum([item['count'] for item in return_dict.values()])
+ feat_squared = np.sum(np.vstack([item['feat_squared'] for item in return_dict.values()]), axis=0)
+
self.feats_mean = feat / float(count)
self.feats_std = np.sqrt(feat_squared / float(count) - np.square(self.feats_mean))
- np.savetxt(generate_file_path(self.save_dir, self.model_name, 'feats_mean'), self.feats_mean)
- np.savetxt(generate_file_path(self.save_dir, self.model_name, 'feats_std'), self.feats_std)
+ np.savetxt(
+ generate_file_path(self.save_dir, self.model_name, 'feats_mean'), self.feats_mean)
+ np.savetxt(
+ generate_file_path(self.save_dir, self.model_name, 'feats_std'), self.feats_std)
+ log.info("End calculating mean and std from samples")
diff --git a/example/speech_recognition/stt_io_iter.py b/example/speech_recognition/stt_io_iter.py
index 70c31ce92dde..5ae65191c840 100644
--- a/example/speech_recognition/stt_io_iter.py
+++ b/example/speech_recognition/stt_io_iter.py
@@ -31,7 +31,8 @@ def provide_label(self):
class STTIter(mx.io.DataIter):
def __init__(self, count, datagen, batch_size, num_label, init_states, seq_length, width, height,
sort_by_duration=True,
- is_bi_graphemes=False, partition="train",):
+ is_bi_graphemes=False, partition="train",
+ save_feature_as_csvfile=False):
super(STTIter, self).__init__()
self.batch_size = batch_size
self.num_label = num_label
@@ -75,6 +76,7 @@ def __init__(self, count, datagen, batch_size, num_label, init_states, seq_lengt
self.trainDataIter = iter(self.trainDataList)
self.is_first_epoch = True
+ self.save_feature_as_csvfile = save_feature_as_csvfile
def __iter__(self):
init_state_names = [x[0] for x in self.init_states]
@@ -92,9 +94,9 @@ def __iter__(self):
audio_paths.append(audio_path)
texts.append(text)
if self.is_first_epoch:
- data_set = self.datagen.prepare_minibatch(audio_paths, texts, overwrite=True, is_bi_graphemes=self.is_bi_graphemes)
+ data_set = self.datagen.prepare_minibatch(audio_paths, texts, overwrite=True, is_bi_graphemes=self.is_bi_graphemes, save_feature_as_csvfile=self.save_feature_as_csvfile)
else:
- data_set = self.datagen.prepare_minibatch(audio_paths, texts, overwrite=False, is_bi_graphemes=self.is_bi_graphemes)
+ data_set = self.datagen.prepare_minibatch(audio_paths, texts, overwrite=False, is_bi_graphemes=self.is_bi_graphemes, save_feature_as_csvfile=self.save_feature_as_csvfile)
data_all = [mx.nd.array(data_set['x'])] + self.init_state_arrays
label_all = [mx.nd.array(data_set['y'])]
@@ -103,7 +105,6 @@ def __iter__(self):
data_batch = SimpleBatch(data_names, data_all, label_names, label_all)
yield data_batch
- self.is_first_epoch = False
def reset(self):
pass
diff --git a/example/speech_recognition/stt_layer_batchnorm.py b/example/speech_recognition/stt_layer_batchnorm.py
index 86e75aa49557..5b73f4f9f890 100644
--- a/example/speech_recognition/stt_layer_batchnorm.py
+++ b/example/speech_recognition/stt_layer_batchnorm.py
@@ -6,7 +6,7 @@ def batchnorm(net,
beta=None,
eps=0.001,
momentum=0.9,
- fix_gamma=True,
+ fix_gamma=False,
use_global_stats=False,
output_mean_var=False,
name=None):
@@ -18,7 +18,8 @@ def batchnorm(net,
momentum=momentum,
fix_gamma=fix_gamma,
use_global_stats=use_global_stats,
- output_mean_var=output_mean_var
+ output_mean_var=output_mean_var,
+ name=name
)
else:
net = mx.sym.BatchNorm(data=net,
@@ -26,6 +27,7 @@ def batchnorm(net,
momentum=momentum,
fix_gamma=fix_gamma,
use_global_stats=use_global_stats,
- output_mean_var=output_mean_var
+ output_mean_var=output_mean_var,
+ name=name
)
return net
diff --git a/example/speech_recognition/stt_layer_conv.py b/example/speech_recognition/stt_layer_conv.py
index 5ec292557f04..ab0035e4803b 100644
--- a/example/speech_recognition/stt_layer_conv.py
+++ b/example/speech_recognition/stt_layer_conv.py
@@ -8,20 +8,22 @@ def conv(net,
weight=None,
bias=None,
act_type="relu",
- no_bias=False
+ no_bias=False,
+ name=None
):
# 2d convolution's input should have the shape of 4D (batch_size,1,seq_len,feat_dim)
if weight is None or bias is None:
# ex) filter_dimension = (41,11) , stride=(2,2)
- net = mx.sym.Convolution(data=net, num_filter=channels, kernel=filter_dimension, stride=stride, no_bias=no_bias)
+ net = mx.sym.Convolution(data=net, num_filter=channels, kernel=filter_dimension, stride=stride, no_bias=no_bias,
+ name=name)
elif weight is None or bias is not None:
net = mx.sym.Convolution(data=net, num_filter=channels, kernel=filter_dimension, stride=stride, bias=bias,
- no_bias=no_bias)
+ no_bias=no_bias, name=name)
elif weight is not None or bias is None:
net = mx.sym.Convolution(data=net, num_filter=channels, kernel=filter_dimension, stride=stride, weight=weight,
- no_bias=no_bias)
+ no_bias=no_bias, name=name)
else:
net = mx.sym.Convolution(data=net, num_filter=channels, kernel=filter_dimension, stride=stride, weight=weight,
- bias=bias, no_bias=no_bias)
+ bias=bias, no_bias=no_bias, name=name)
net = mx.sym.Activation(data=net, act_type=act_type)
return net
diff --git a/example/speech_recognition/stt_layer_fc.py b/example/speech_recognition/stt_layer_fc.py
index b3db2034a3ad..f435922426c5 100644
--- a/example/speech_recognition/stt_layer_fc.py
+++ b/example/speech_recognition/stt_layer_fc.py
@@ -8,29 +8,30 @@ def fc(net,
act_type,
weight=None,
bias=None,
- no_bias=False
+ no_bias=False,
+ name=None
):
# when weight and bias doesn't have specific name
if weight is None and bias is None:
- net = mx.sym.FullyConnected(data=net, num_hidden=num_hidden, no_bias=no_bias)
+ net = mx.sym.FullyConnected(data=net, num_hidden=num_hidden, no_bias=no_bias, name=name)
# when weight doesn't have specific name but bias has
elif weight is None and bias is not None:
if no_bias:
- net = mx.sym.FullyConnected(data=net, num_hidden=num_hidden, no_bias=no_bias)
+ net = mx.sym.FullyConnected(data=net, num_hidden=num_hidden, no_bias=no_bias, name=name)
else:
- net = mx.sym.FullyConnected(data=net, num_hidden=num_hidden, bias=bias, no_bias=no_bias)
+ net = mx.sym.FullyConnected(data=net, num_hidden=num_hidden, bias=bias, no_bias=no_bias, name=name)
# when bias doesn't have specific name but weight has
elif weight is not None and bias is None:
- net = mx.sym.FullyConnected(data=net, num_hidden=num_hidden, weight=weight, no_bias=no_bias)
+ net = mx.sym.FullyConnected(data=net, num_hidden=num_hidden, weight=weight, no_bias=no_bias, name=name)
# when weight and bias specific name
else:
if no_bias:
- net = mx.sym.FullyConnected(data=net, num_hidden=num_hidden, weight=weight, no_bias=no_bias)
+ net = mx.sym.FullyConnected(data=net, num_hidden=num_hidden, weight=weight, no_bias=no_bias, name=name)
else:
- net = mx.sym.FullyConnected(data=net, num_hidden=num_hidden, weight=weight, bias=bias, no_bias=no_bias)
+ net = mx.sym.FullyConnected(data=net, num_hidden=num_hidden, weight=weight, bias=bias, no_bias=no_bias, name=name)
# activation
if act_type is not None:
- net = mx.sym.Activation(data=net, act_type=act_type)
+ net = mx.sym.Activation(data=net, act_type=act_type, name="%s_activation" % name)
return net
@@ -41,7 +42,7 @@ def sequence_fc(net,
num_hidden_list=[],
act_type_list=[],
is_batchnorm=False,
- dropout_rate=0
+ dropout_rate=0,
):
if num_layer == len(num_hidden_list) == len(act_type_list):
if num_layer > 0:
@@ -81,13 +82,16 @@ def sequence_fc(net,
num_hidden=num_hidden_list[layer_index],
act_type=None,
weight=weight_list[layer_index],
- no_bias=is_batchnorm
+ no_bias=is_batchnorm,
+ name="%s_t%d_l%d_fc" % (prefix, seq_index, layer_index)
)
# last layer doesn't have batchnorm
hidden = batchnorm(net=hidden,
gamma=gamma_list[layer_index],
- beta=beta_list[layer_index])
- hidden = mx.sym.Activation(data=hidden, act_type=act_type_list[layer_index])
+ beta=beta_list[layer_index],
+ name="%s_t%d_l%d_batchnorm" % (prefix, seq_index, layer_index))
+ hidden = mx.sym.Activation(data=hidden, act_type=act_type_list[layer_index],
+ name="%s_t%d_l%d_activation" % (prefix, seq_index, layer_index))
else:
hidden = fc(net=hidden,
num_hidden=num_hidden_list[layer_index],
diff --git a/example/speech_recognition/stt_layer_gru.py b/example/speech_recognition/stt_layer_gru.py
index 8b044746dfcf..89af1c72216d 100644
--- a/example/speech_recognition/stt_layer_gru.py
+++ b/example/speech_recognition/stt_layer_gru.py
@@ -15,7 +15,7 @@
"param_blocks"])
-def gru(num_hidden, indata, prev_state, param, seqidx, layeridx, dropout=0., is_batchnorm=False, gamma=None, beta=None):
+def gru(num_hidden, indata, prev_state, param, seqidx, layeridx, dropout=0., is_batchnorm=False, gamma=None, beta=None, name=None):
"""
GRU Cell symbol
Reference:
@@ -31,7 +31,10 @@ def gru(num_hidden, indata, prev_state, param, seqidx, layeridx, dropout=0., is_
name="t%d_l%d_gates_i2h" % (seqidx, layeridx))
if is_batchnorm:
- i2h = batchnorm(net=i2h, gamma=gamma, beta=beta)
+ if name is not None:
+ i2h = batchnorm(net=i2h, gamma=gamma, beta=beta, name="%s_batchnorm" % name)
+ else:
+ i2h = batchnorm(net=i2h, gamma=gamma, beta=beta)
h2h = mx.sym.FullyConnected(data=prev_state.h,
weight=param.gates_h2h_weight,
bias=param.gates_h2h_bias,
@@ -53,15 +56,15 @@ def gru(num_hidden, indata, prev_state, param, seqidx, layeridx, dropout=0., is_
weight=param.trans_h2h_weight,
bias=param.trans_h2h_bias,
num_hidden=num_hidden,
- name="t%d_l%d_trans_i2h" % (seqidx, layeridx))
+ name="t%d_l%d_trans_h2h" % (seqidx, layeridx))
h_trans = htrans_i2h + htrans_h2h
h_trans_active = mx.sym.Activation(h_trans, act_type="tanh")
next_h = prev_state.h + update_gate * (h_trans_active - prev_state.h)
return GRUState(h=next_h)
-def gru_unroll(net, num_gru_layer, seq_len, num_hidden_gru_list, dropout=0., is_batchnorm=False, prefix="",
- direction="forward"):
+def gru_unroll(net, num_gru_layer, seq_len, num_hidden_gru_list, dropout=0., is_batchnorm=False, prefix="",
+ direction="forward", is_bucketing=False):
if num_gru_layer > 0:
param_cells = []
last_states = []
@@ -81,9 +84,14 @@ def gru_unroll(net, num_gru_layer, seq_len, num_hidden_gru_list, dropout=0., is_
if is_batchnorm:
batchnorm_gamma = []
batchnorm_beta = []
- for seqidx in range(seq_len):
- batchnorm_gamma.append(mx.sym.Variable(prefix + "t%d_i2h_gamma" % seqidx))
- batchnorm_beta.append(mx.sym.Variable(prefix + "t%d_i2h_beta" % seqidx))
+ if is_bucketing:
+ for l in range(num_gru_layer):
+ batchnorm_gamma.append(mx.sym.Variable(prefix + "l%d_i2h_gamma" % l))
+ batchnorm_beta.append(mx.sym.Variable(prefix + "l%d_i2h_beta" % l))
+ else:
+ for seqidx in range(seq_len):
+ batchnorm_gamma.append(mx.sym.Variable(prefix + "t%d_i2h_gamma" % seqidx))
+ batchnorm_beta.append(mx.sym.Variable(prefix + "t%d_i2h_beta" % seqidx))
hidden_all = []
for seqidx in range(seq_len):
@@ -103,19 +111,33 @@ def gru_unroll(net, num_gru_layer, seq_len, num_hidden_gru_list, dropout=0., is_
else:
dp_ratio = dropout
if is_batchnorm:
- next_state = gru(num_hidden_gru_list[i], indata=hidden,
- prev_state=last_states[i],
- param=param_cells[i],
- seqidx=k, layeridx=i, dropout=dp_ratio,
- is_batchnorm=is_batchnorm,
- gamma=batchnorm_gamma[k],
- beta=batchnorm_beta[k])
+ if is_bucketing:
+ next_state = gru(num_hidden_gru_list[i], indata=hidden,
+ prev_state=last_states[i],
+ param=param_cells[i],
+ seqidx=k, layeridx=i, dropout=dp_ratio,
+ is_batchnorm=is_batchnorm,
+ gamma=batchnorm_gamma[i],
+ beta=batchnorm_beta[i],
+ name=prefix + ("t%d_l%d" % (seqidx, i))
+ )
+ else:
+ next_state = gru(num_hidden_gru_list[i], indata=hidden,
+ prev_state=last_states[i],
+ param=param_cells[i],
+ seqidx=k, layeridx=i, dropout=dp_ratio,
+ is_batchnorm=is_batchnorm,
+ gamma=batchnorm_gamma[k],
+ beta=batchnorm_beta[k],
+ name=prefix + ("t%d_l%d" % (seqidx, i))
+ )
else:
next_state = gru(num_hidden_gru_list[i], indata=hidden,
prev_state=last_states[i],
param=param_cells[i],
seqidx=k, layeridx=i, dropout=dp_ratio,
- is_batchnorm=is_batchnorm)
+ is_batchnorm=is_batchnorm,
+ name=prefix)
hidden = next_state.h
last_states[i] = next_state
# decoder
@@ -133,7 +155,7 @@ def gru_unroll(net, num_gru_layer, seq_len, num_hidden_gru_list, dropout=0., is_
return net
-def bi_gru_unroll(net, num_gru_layer, seq_len, num_hidden_gru_list, dropout=0., is_batchnorm=False):
+def bi_gru_unroll(net, num_gru_layer, seq_len, num_hidden_gru_list, dropout=0., is_batchnorm=False, is_bucketing=False):
if num_gru_layer > 0:
net_forward = gru_unroll(net=net,
num_gru_layer=num_gru_layer,
@@ -142,7 +164,8 @@ def bi_gru_unroll(net, num_gru_layer, seq_len, num_hidden_gru_list, dropout=0.,
dropout=dropout,
is_batchnorm=is_batchnorm,
prefix="forward_",
- direction="forward")
+ direction="forward",
+ is_bucketing=is_bucketing)
net_backward = gru_unroll(net=net,
num_gru_layer=num_gru_layer,
seq_len=seq_len,
@@ -150,7 +173,8 @@ def bi_gru_unroll(net, num_gru_layer, seq_len, num_hidden_gru_list, dropout=0.,
dropout=dropout,
is_batchnorm=is_batchnorm,
prefix="backward_",
- direction="backward")
+ direction="backward",
+ is_bucketing=is_bucketing)
hidden_all = []
for i in range(seq_len):
hidden_all.append(mx.sym.Concat(*[net_forward[i], net_backward[i]], dim=1))
@@ -159,7 +183,7 @@ def bi_gru_unroll(net, num_gru_layer, seq_len, num_hidden_gru_list, dropout=0.,
def bi_gru_unroll_two_input_two_output(net1, net2, num_gru_layer, seq_len, num_hidden_gru_list, dropout=0.,
- is_batchnorm=False):
+ is_batchnorm=False, is_bucketing=False):
if num_gru_layer > 0:
net_forward = gru_unroll(net=net1,
num_gru_layer=num_gru_layer,
@@ -168,7 +192,8 @@ def bi_gru_unroll_two_input_two_output(net1, net2, num_gru_layer, seq_len, num_h
dropout=dropout,
is_batchnorm=is_batchnorm,
prefix="forward_",
- direction="forward")
+ direction="forward",
+ is_bucketing=is_bucketing)
net_backward = gru_unroll(net=net2,
num_gru_layer=num_gru_layer,
seq_len=seq_len,
@@ -176,7 +201,8 @@ def bi_gru_unroll_two_input_two_output(net1, net2, num_gru_layer, seq_len, num_h
dropout=dropout,
is_batchnorm=is_batchnorm,
prefix="backward_",
- direction="backward")
+ direction="backward",
+ is_bucketing=is_bucketing)
return net_forward, net_backward
else:
return net1, net2
diff --git a/example/speech_recognition/stt_layer_lstm.py b/example/speech_recognition/stt_layer_lstm.py
index 19e37369b1b0..93b4ca09b908 100644
--- a/example/speech_recognition/stt_layer_lstm.py
+++ b/example/speech_recognition/stt_layer_lstm.py
@@ -16,7 +16,7 @@
"param_blocks"])
-def vanilla_lstm(num_hidden, indata, prev_state, param, seqidx, layeridx, is_batchnorm=False, gamma=None, beta=None):
+def vanilla_lstm(num_hidden, indata, prev_state, param, seqidx, layeridx, is_batchnorm=False, gamma=None, beta=None, name=None):
"""LSTM Cell symbol"""
i2h = mx.sym.FullyConnected(data=indata,
weight=param.i2h_weight,
@@ -24,7 +24,10 @@ def vanilla_lstm(num_hidden, indata, prev_state, param, seqidx, layeridx, is_bat
num_hidden=num_hidden * 4,
name="t%d_l%d_i2h" % (seqidx, layeridx))
if is_batchnorm:
- i2h = batchnorm(net=i2h, gamma=gamma, beta=beta)
+ if name is not None:
+ i2h = batchnorm(net=i2h, gamma=gamma, beta=beta, name="%s_batchnorm" % name)
+ else:
+ i2h = batchnorm(net=i2h, gamma=gamma, beta=beta)
h2h = mx.sym.FullyConnected(data=prev_state.h,
weight=param.h2h_weight,
bias=param.h2h_bias,
@@ -43,7 +46,7 @@ def vanilla_lstm(num_hidden, indata, prev_state, param, seqidx, layeridx, is_bat
def lstm(num_hidden, indata, prev_state, param, seqidx, layeridx, dropout=0., num_hidden_proj=0, is_batchnorm=False,
- gamma=None, beta=None):
+ gamma=None, beta=None, name=None):
"""LSTM Cell symbol"""
# dropout input
if dropout > 0.:
@@ -55,7 +58,10 @@ def lstm(num_hidden, indata, prev_state, param, seqidx, layeridx, dropout=0., nu
num_hidden=num_hidden * 4,
name="t%d_l%d_i2h" % (seqidx, layeridx))
if is_batchnorm:
- i2h = batchnorm(net=i2h, gamma=gamma, beta=beta)
+ if name is not None:
+ i2h = batchnorm(net=i2h, gamma=gamma, beta=beta, name="%s_batchnorm" % name)
+ else:
+ i2h = batchnorm(net=i2h, gamma=gamma, beta=beta)
h2h = mx.sym.FullyConnected(data=prev_state.h,
weight=param.h2h_weight,
@@ -96,7 +102,7 @@ def lstm(num_hidden, indata, prev_state, param, seqidx, layeridx, dropout=0., nu
def lstm_unroll(net, num_lstm_layer, seq_len, num_hidden_lstm_list, dropout=0., num_hidden_proj=0,
- lstm_type='fc_lstm', is_batchnorm=False, prefix="", direction="forward"):
+ lstm_type='fc_lstm', is_batchnorm=False, prefix="", direction="forward", is_bucketing=False):
if num_lstm_layer > 0:
param_cells = []
last_states = []
@@ -121,9 +127,14 @@ def lstm_unroll(net, num_lstm_layer, seq_len, num_hidden_lstm_list, dropout=0.,
if is_batchnorm:
batchnorm_gamma = []
batchnorm_beta = []
- for seqidx in range(seq_len):
- batchnorm_gamma.append(mx.sym.Variable(prefix + "t%d_i2h_gamma" % seqidx))
- batchnorm_beta.append(mx.sym.Variable(prefix + "t%d_i2h_beta" % seqidx))
+ if is_bucketing:
+ for l in range(num_lstm_layer):
+ batchnorm_gamma.append(mx.sym.Variable(prefix + "l%d_i2h_gamma" % l))
+ batchnorm_beta.append(mx.sym.Variable(prefix + "l%d_i2h_beta" % l))
+ else:
+ for seqidx in range(seq_len):
+ batchnorm_gamma.append(mx.sym.Variable(prefix + "t%d_i2h_gamma" % seqidx))
+ batchnorm_beta.append(mx.sym.Variable(prefix + "t%d_i2h_beta" % seqidx))
hidden_all = []
for seqidx in range(seq_len):
@@ -145,18 +156,20 @@ def lstm_unroll(net, num_lstm_layer, seq_len, num_hidden_lstm_list, dropout=0.,
if lstm_type == 'fc_lstm':
if is_batchnorm:
- next_state = lstm(num_hidden_lstm_list[i],
- indata=hidden,
- prev_state=last_states[i],
- param=param_cells[i],
- seqidx=k,
- layeridx=i,
- dropout=dp,
- num_hidden_proj=num_hidden_proj,
- is_batchnorm=is_batchnorm,
- gamma=batchnorm_gamma[k],
- beta=batchnorm_beta[k]
- )
+ if is_bucketing:
+ next_state = lstm(num_hidden_lstm_list[i],
+ indata=hidden,
+ prev_state=last_states[i],
+ param=param_cells[i],
+ seqidx=k,
+ layeridx=i,
+ dropout=dp,
+ num_hidden_proj=num_hidden_proj,
+ is_batchnorm=is_batchnorm,
+ gamma=batchnorm_gamma[i],
+ beta=batchnorm_beta[i],
+ name=prefix + ("t%d_l%d" % (seqidx, i))
+ )
else:
next_state = lstm(num_hidden_lstm_list[i],
indata=hidden,
@@ -166,7 +179,8 @@ def lstm_unroll(net, num_lstm_layer, seq_len, num_hidden_lstm_list, dropout=0.,
layeridx=i,
dropout=dp,
num_hidden_proj=num_hidden_proj,
- is_batchnorm=is_batchnorm
+ is_batchnorm=is_batchnorm,
+ name=prefix + ("t%d_l%d" % (seqidx, i))
)
elif lstm_type == 'vanilla_lstm':
if is_batchnorm:
@@ -175,15 +189,17 @@ def lstm_unroll(net, num_lstm_layer, seq_len, num_hidden_lstm_list, dropout=0.,
param=param_cells[i],
seqidx=k, layeridx=i,
is_batchnorm=is_batchnorm,
- gamma=batchnorm_gamma[k],
- beta=batchnorm_beta[k]
+ gamma=batchnorm_gamma[i],
+ beta=batchnorm_beta[i],
+ name=prefix + ("t%d_l%d" % (seqidx, i))
)
else:
next_state = vanilla_lstm(num_hidden_lstm_list[i], indata=hidden,
prev_state=last_states[i],
param=param_cells[i],
seqidx=k, layeridx=i,
- is_batchnorm=is_batchnorm
+ is_batchnorm=is_batchnorm,
+ name=prefix + ("t%d_l%d" % (seqidx, i))
)
else:
raise Exception("lstm type %s error" % lstm_type)
@@ -206,7 +222,7 @@ def lstm_unroll(net, num_lstm_layer, seq_len, num_hidden_lstm_list, dropout=0.,
def bi_lstm_unroll(net, num_lstm_layer, seq_len, num_hidden_lstm_list, dropout=0., num_hidden_proj=0,
- lstm_type='fc_lstm', is_batchnorm=False):
+ lstm_type='fc_lstm', is_batchnorm=False, is_bucketing=False):
if num_lstm_layer > 0:
net_forward = lstm_unroll(net=net,
num_lstm_layer=num_lstm_layer,
@@ -217,7 +233,8 @@ def bi_lstm_unroll(net, num_lstm_layer, seq_len, num_hidden_lstm_list, dropout=0
lstm_type=lstm_type,
is_batchnorm=is_batchnorm,
prefix="forward_",
- direction="forward")
+ direction="forward",
+ is_bucketing=is_bucketing)
net_backward = lstm_unroll(net=net,
num_lstm_layer=num_lstm_layer,
@@ -228,7 +245,8 @@ def bi_lstm_unroll(net, num_lstm_layer, seq_len, num_hidden_lstm_list, dropout=0
lstm_type=lstm_type,
is_batchnorm=is_batchnorm,
prefix="backward_",
- direction="backward")
+ direction="backward",
+ is_bucketing=is_bucketing)
hidden_all = []
for i in range(seq_len):
hidden_all.append(mx.sym.Concat(*[net_forward[i], net_backward[i]], dim=1))
@@ -239,7 +257,9 @@ def bi_lstm_unroll(net, num_lstm_layer, seq_len, num_hidden_lstm_list, dropout=0
# bilistm_2to1
def bi_lstm_unroll_two_input_two_output(net1, net2, num_lstm_layer, seq_len, num_hidden_lstm_list, dropout=0.,
num_hidden_proj=0,
- lstm_type='fc_lstm', is_batchnorm=False):
+ lstm_type='fc_lstm',
+ is_batchnorm=False,
+ is_bucketing=False):
if num_lstm_layer > 0:
net_forward = lstm_unroll(net=net1,
num_lstm_layer=num_lstm_layer,
@@ -250,7 +270,8 @@ def bi_lstm_unroll_two_input_two_output(net1, net2, num_lstm_layer, seq_len, num
lstm_type=lstm_type,
is_batchnorm=is_batchnorm,
prefix="forward_",
- direction="forward")
+ direction="forward",
+ is_bucketing=is_bucketing)
net_backward = lstm_unroll(net=net2,
num_lstm_layer=num_lstm_layer,
@@ -261,7 +282,8 @@ def bi_lstm_unroll_two_input_two_output(net1, net2, num_lstm_layer, seq_len, num
lstm_type=lstm_type,
is_batchnorm=is_batchnorm,
prefix="backward_",
- direction="backward")
+ direction="backward",
+ is_bucketing=is_bucketing)
return net_forward, net_backward
else:
return net1, net2
diff --git a/example/speech_recognition/stt_metric.py b/example/speech_recognition/stt_metric.py
index 0fc2bd11d906..1c5f4408a60e 100644
--- a/example/speech_recognition/stt_metric.py
+++ b/example/speech_recognition/stt_metric.py
@@ -19,12 +19,11 @@ def check_label_shapes(labels, preds, shape=0):
class STTMetric(mx.metric.EvalMetric):
- def __init__(self, batch_size, num_gpu, seq_length, is_epoch_end=False, is_logging=True):
+ def __init__(self, batch_size, num_gpu, is_epoch_end=False, is_logging=True):
super(STTMetric, self).__init__('STTMetric')
self.batch_size = batch_size
self.num_gpu = num_gpu
- self.seq_length = seq_length
self.total_n_label = 0
self.total_l_dist = 0
self.is_epoch_end = is_epoch_end
@@ -37,15 +36,17 @@ def update(self, labels, preds):
log = LogUtil().getlogger()
labelUtil = LabelUtil.getInstance()
self.batch_loss = 0.
+
for label, pred in zip(labels, preds):
label = label.asnumpy()
pred = pred.asnumpy()
- for i in range(int(int(self.batch_size) / int(self.num_gpu))):
+ seq_length = len(pred) / int(int(self.batch_size) / int(self.num_gpu))
+ for i in range(int(int(self.batch_size) / int(self.num_gpu))):
l = remove_blank(label[i])
p = []
- for k in range(int(self.seq_length)):
+ for k in range(int(seq_length)):
p.append(np.argmax(pred[k * int(int(self.batch_size) / int(self.num_gpu)) + i]))
p = pred_best(p)
@@ -60,7 +61,7 @@ def update(self, labels, preds):
self.num_inst += 1
self.sum_metric += this_cer
if self.is_epoch_end:
- loss = ctc_loss(l, pred, i, int(self.seq_length), int(self.batch_size), int(self.num_gpu))
+ loss = ctc_loss(l, pred, i, int(seq_length), int(self.batch_size), int(self.num_gpu))
self.batch_loss += loss
if self.is_logging:
log.info("loss: %f " % loss)
diff --git a/example/speech_recognition/stt_utils.py b/example/speech_recognition/stt_utils.py
index 6a32f0e57c2d..3c7ffce0f980 100644
--- a/example/speech_recognition/stt_utils.py
+++ b/example/speech_recognition/stt_utils.py
@@ -92,7 +92,7 @@ def spectrogram(samples, fft_length=256, sample_rate=2, hop_length=128):
def spectrogram_from_file(filename, step=10, window=20, max_freq=None,
- eps=1e-14, overwrite=False):
+ eps=1e-14, overwrite=False, save_feature_as_csvfile=False):
""" Calculate the log of linear spectrogram from FFT energy
Params:
filename (str): Path to the audio file
@@ -104,7 +104,7 @@ def spectrogram_from_file(filename, step=10, window=20, max_freq=None,
"""
csvfilename = filename.replace(".wav", ".csv")
- if (os.path.isfile(csvfilename) is False) or overwrite:
+ if (os.path.isfile(csvfilename) is False) or overwrite:
with soundfile.SoundFile(filename) as sound_file:
audio = sound_file.read(dtype='float32')
sample_rate = sound_file.samplerate
@@ -126,7 +126,8 @@ def spectrogram_from_file(filename, step=10, window=20, max_freq=None,
ind = np.where(freqs <= max_freq)[0][-1] + 1
res = np.transpose(np.log(pxx[:ind, :] + eps))
- np.savetxt(csvfilename, res)
+ if save_feature_as_csvfile:
+ np.savetxt(csvfilename, res)
return res
else:
return np.loadtxt(csvfilename)
diff --git a/example/speech_recognition/train.py b/example/speech_recognition/train.py
index 37f00fc4dd90..f3a7555529e3 100644
--- a/example/speech_recognition/train.py
+++ b/example/speech_recognition/train.py
@@ -7,7 +7,9 @@
from stt_metric import STTMetric
#tensorboard setting
from tensorboard import SummaryWriter
-import numpy as np
+import json
+from stt_bucketing_module import STTBucketingModule
+
def get_initializer(args):
@@ -28,6 +30,7 @@ def __init__(self, learning_rate=0.001):
def __call__(self, num_update):
return self.learning_rate
+
def do_training(args, module, data_train, data_val, begin_epoch=0):
from distutils.dir_util import mkpath
from log_util import LogUtil
@@ -35,7 +38,7 @@ def do_training(args, module, data_train, data_val, begin_epoch=0):
log = LogUtil().getlogger()
mkpath(os.path.dirname(get_checkpoint_path(args)))
- seq_len = args.config.get('arch', 'max_t_count')
+ #seq_len = args.config.get('arch', 'max_t_count')
batch_size = args.config.getint('common', 'batch_size')
save_checkpoint_every_n_epoch = args.config.getint('common', 'save_checkpoint_every_n_epoch')
save_checkpoint_every_n_batch = args.config.getint('common', 'save_checkpoint_every_n_batch')
@@ -44,27 +47,48 @@ def do_training(args, module, data_train, data_val, begin_epoch=0):
contexts = parse_contexts(args)
num_gpu = len(contexts)
- eval_metric = STTMetric(batch_size=batch_size, num_gpu=num_gpu, seq_length=seq_len,is_logging=enable_logging_validation_metric,is_epoch_end=True)
+ eval_metric = STTMetric(batch_size=batch_size, num_gpu=num_gpu, is_logging=enable_logging_validation_metric,is_epoch_end=True)
# tensorboard setting
- loss_metric = STTMetric(batch_size=batch_size, num_gpu=num_gpu, seq_length=seq_len,is_logging=enable_logging_train_metric,is_epoch_end=False)
+ loss_metric = STTMetric(batch_size=batch_size, num_gpu=num_gpu, is_logging=enable_logging_train_metric,is_epoch_end=False)
- optimizer = args.config.get('train', 'optimizer')
- momentum = args.config.getfloat('train', 'momentum')
+ optimizer = args.config.get('optimizer', 'optimizer')
learning_rate = args.config.getfloat('train', 'learning_rate')
learning_rate_annealing = args.config.getfloat('train', 'learning_rate_annealing')
mode = args.config.get('common', 'mode')
num_epoch = args.config.getint('train', 'num_epoch')
- clip_gradient = args.config.getfloat('train', 'clip_gradient')
- weight_decay = args.config.getfloat('train', 'weight_decay')
+ clip_gradient = args.config.getfloat('optimizer', 'clip_gradient')
+ weight_decay = args.config.getfloat('optimizer', 'weight_decay')
save_optimizer_states = args.config.getboolean('train', 'save_optimizer_states')
show_every = args.config.getint('train', 'show_every')
+ optimizer_params_dictionary = json.loads(args.config.get('optimizer', 'optimizer_params_dictionary'))
+ kvstore_option = args.config.get('common', 'kvstore_option')
n_epoch=begin_epoch
+ is_bucketing = args.config.getboolean('arch', 'is_bucketing')
if clip_gradient == 0:
clip_gradient = None
+ if is_bucketing and mode == 'load':
+ model_file = args.config.get('common', 'model_file')
+ model_name = os.path.splitext(model_file)[0]
+ model_num_epoch = int(model_name[-4:])
+
+ model_path = 'checkpoints/' + str(model_name[:-5])
+ symbol, data_names, label_names = module(1600)
+ model = STTBucketingModule(
+ sym_gen=module,
+ default_bucket_key=data_train.default_bucket_key,
+ context=contexts)
+ data_train.reset()
- module.bind(data_shapes=data_train.provide_data,
+ model.bind(data_shapes=data_train.provide_data,
+ label_shapes=data_train.provide_label,
+ for_training=True)
+ _, arg_params, aux_params = mx.model.load_checkpoint(model_path, model_num_epoch)
+ model.set_params(arg_params, aux_params)
+ module = model
+ else:
+ module.bind(data_shapes=data_train.provide_data,
label_shapes=data_train.provide_label,
for_training=True)
@@ -75,41 +99,32 @@ def do_training(args, module, data_train, data_val, begin_epoch=0):
lr_scheduler = SimpleLRScheduler(learning_rate=learning_rate)
def reset_optimizer(force_init=False):
- if optimizer == "sgd":
- module.init_optimizer(kvstore='device',
- optimizer=optimizer,
- optimizer_params={'lr_scheduler': lr_scheduler,
- 'momentum': momentum,
- 'clip_gradient': clip_gradient,
- 'wd': weight_decay},
- force_init=force_init)
- elif optimizer == "adam":
- module.init_optimizer(kvstore='device',
- optimizer=optimizer,
- optimizer_params={'lr_scheduler': lr_scheduler,
- #'momentum': momentum,
- 'clip_gradient': clip_gradient,
- 'wd': weight_decay},
- force_init=force_init)
- else:
- raise Exception('Supported optimizers are sgd and adam. If you want to implement others define them in train.py')
+ optimizer_params = {'lr_scheduler': lr_scheduler,
+ 'clip_gradient': clip_gradient,
+ 'wd': weight_decay}
+ optimizer_params.update(optimizer_params_dictionary)
+ module.init_optimizer(kvstore=kvstore_option,
+ optimizer=optimizer,
+ optimizer_params=optimizer_params,
+ force_init=force_init)
if mode == "train":
reset_optimizer(force_init=True)
else:
reset_optimizer(force_init=False)
+ data_train.reset()
+ data_train.is_first_epoch = True
#tensorboard setting
tblog_dir = args.config.get('common', 'tensorboard_log_dir')
summary_writer = SummaryWriter(tblog_dir)
+
while True:
if n_epoch >= num_epoch:
break
-
loss_metric.reset()
log.info('---------train---------')
for nbatch, data_batch in enumerate(data_train):
-
module.forward_backward(data_batch)
module.update()
# tensorboard setting
@@ -136,6 +151,7 @@ def reset_optimizer(force_init=False):
assert curr_acc is not None, 'cannot find Acc_exclude_padding in eval metric'
data_train.reset()
+ data_train.is_first_epoch = False
# tensorboard setting
train_cer, train_n_label, train_l_dist, train_ctc_loss = loss_metric.get_name_value()