From 016d29fb05a0c822ed58bf79b748c483290b1254 Mon Sep 17 00:00:00 2001 From: Soonhwan-Kwon Date: Mon, 10 Jul 2017 13:14:28 +0900 Subject: [PATCH] add bucketing/batchnorm and improved performance for speech_recognition example --- example/speech_recognition/README.md | 15 + example/speech_recognition/arch_deepspeech.py | 284 ++++++++++------- example/speech_recognition/deepspeech.cfg | 48 ++- example/speech_recognition/default.cfg | 50 ++- example/speech_recognition/main.py | 300 +++++++++++------- .../speech_recognition/stt_datagenerator.py | 90 ++++-- example/speech_recognition/stt_io_iter.py | 9 +- .../speech_recognition/stt_layer_batchnorm.py | 8 +- example/speech_recognition/stt_layer_conv.py | 12 +- example/speech_recognition/stt_layer_fc.py | 28 +- example/speech_recognition/stt_layer_gru.py | 70 ++-- example/speech_recognition/stt_layer_lstm.py | 82 +++-- example/speech_recognition/stt_metric.py | 11 +- example/speech_recognition/stt_utils.py | 7 +- example/speech_recognition/train.py | 74 +++-- 15 files changed, 685 insertions(+), 403 deletions(-) diff --git a/example/speech_recognition/README.md b/example/speech_recognition/README.md index 69961b1bdc5c..00d166602403 100644 --- a/example/speech_recognition/README.md +++ b/example/speech_recognition/README.md @@ -123,3 +123,18 @@ The new file should implement two functions, prepare_data() and arch(), for buil Run the following line after preparing the files.
python main.py --configfile custom.cfg --archfile arch_custom
+ +*** +## **Furthermore** +You can prepare the full LibriSpeech dataset by following the instructions at https://github.com/baidu-research/ba-dls-deepspeech +**Replace Baidu's flac_to_wav.sh script with the flac_to_wav.sh in this repository to avoid a bug** +```bash +git clone https://github.com/baidu-research/ba-dls-deepspeech +cd ba-dls-deepspeech +./download.sh +cp -f /path/to/example/flac_to_wav.sh ./ +./flac_to_wav.sh +python create_desc_json.py /path/to/ba-dls-deepspeech/LibriSpeech/train-clean-100 train_corpus.json +python create_desc_json.py /path/to/ba-dls-deepspeech/LibriSpeech/dev-clean validation_corpus.json +python create_desc_json.py /path/to/ba-dls-deepspeech/LibriSpeech/test-clean test_corpus.json +``` diff --git a/example/speech_recognition/arch_deepspeech.py b/example/speech_recognition/arch_deepspeech.py index 92f1002a2f01..4288b246f3e5 100644 --- a/example/speech_recognition/arch_deepspeech.py +++ b/example/speech_recognition/arch_deepspeech.py @@ -1,6 +1,12 @@ +# pylint: disable=C0111, too-many-statements, too-many-locals +# pylint: too-many-arguments,too-many-instance-attributes,too-many-locals,redefined-outer-name,fixme +# pylint: disable=superfluous-parens, no-member, invalid-name +""" +architecture file for deep speech 2 model +""" import json import math - +import argparse import mxnet as mx from stt_layer_batchnorm import batchnorm @@ -13,6 +19,9 @@ def prepare_data(args): + """ + set atual shape of data + """ rnn_type = args.config.get("arch", "rnn_type") num_rnn_layer = args.config.getint("arch", "num_rnn_layer") num_hidden_rnn_list = json.loads(args.config.get("arch", "num_hidden_rnn_list")) @@ -20,26 +29,29 @@ def prepare_data(args): batch_size = args.config.getint("common", "batch_size") if rnn_type == 'lstm': - init_c = [('l%d_init_c' % l, (batch_size, num_hidden_rnn_list[l])) for l in range(num_rnn_layer)] - init_h = [('l%d_init_h' % l, (batch_size, num_hidden_rnn_list[l])) for l in range(num_rnn_layer)] + init_c = [('l%d_init_c' % l, (batch_size, 
num_hidden_rnn_list[l])) + for l in range(num_rnn_layer)] + init_h = [('l%d_init_h' % l, (batch_size, num_hidden_rnn_list[l])) + for l in range(num_rnn_layer)] elif rnn_type == 'bilstm': - forward_init_c = [('forward_l%d_init_c' % l, (batch_size, num_hidden_rnn_list[l])) for l in - range(num_rnn_layer)] - backward_init_c = [('backward_l%d_init_c' % l, (batch_size, num_hidden_rnn_list[l])) for l in - range(num_rnn_layer)] + forward_init_c = [('forward_l%d_init_c' % l, (batch_size, num_hidden_rnn_list[l])) + for l in range(num_rnn_layer)] + backward_init_c = [('backward_l%d_init_c' % l, (batch_size, num_hidden_rnn_list[l])) + for l in range(num_rnn_layer)] init_c = forward_init_c + backward_init_c - forward_init_h = [('forward_l%d_init_h' % l, (batch_size, num_hidden_rnn_list[l])) for l in - range(num_rnn_layer)] - backward_init_h = [('backward_l%d_init_h' % l, (batch_size, num_hidden_rnn_list[l])) for l in - range(num_rnn_layer)] + forward_init_h = [('forward_l%d_init_h' % l, (batch_size, num_hidden_rnn_list[l])) + for l in range(num_rnn_layer)] + backward_init_h = [('backward_l%d_init_h' % l, (batch_size, num_hidden_rnn_list[l])) + for l in range(num_rnn_layer)] init_h = forward_init_h + backward_init_h elif rnn_type == 'gru': - init_h = [('l%d_init_h' % l, (batch_size, num_hidden_rnn_list[l])) for l in range(num_rnn_layer)] + init_h = [('l%d_init_h' % l, (batch_size, num_hidden_rnn_list[l])) + for l in range(num_rnn_layer)] elif rnn_type == 'bigru': - forward_init_h = [('forward_l%d_init_h' % l, (batch_size, num_hidden_rnn_list[l])) for l in - range(num_rnn_layer)] - backward_init_h = [('backward_l%d_init_h' % l, (batch_size, num_hidden_rnn_list[l])) for l in - range(num_rnn_layer)] + forward_init_h = [('forward_l%d_init_h' % l, (batch_size, num_hidden_rnn_list[l])) + for l in range(num_rnn_layer)] + backward_init_h = [('backward_l%d_init_h' % l, (batch_size, num_hidden_rnn_list[l])) + for l in range(num_rnn_layer)] init_h = forward_init_h + backward_init_h else: 
raise Exception('network type should be one of the lstm,bilstm,gru,bigru') @@ -51,115 +63,143 @@ def prepare_data(args): return init_states -def arch(args): - mode = args.config.get("common", "mode") - if mode == "train": - channel_num = args.config.getint("arch", "channel_num") - conv_layer1_filter_dim = tuple(json.loads(args.config.get("arch", "conv_layer1_filter_dim"))) - conv_layer1_stride = tuple(json.loads(args.config.get("arch", "conv_layer1_stride"))) - conv_layer2_filter_dim = tuple(json.loads(args.config.get("arch", "conv_layer2_filter_dim"))) - conv_layer2_stride = tuple(json.loads(args.config.get("arch", "conv_layer2_stride"))) - - rnn_type = args.config.get("arch", "rnn_type") - num_rnn_layer = args.config.getint("arch", "num_rnn_layer") - num_hidden_rnn_list = json.loads(args.config.get("arch", "num_hidden_rnn_list")) - - is_batchnorm = args.config.getboolean("arch", "is_batchnorm") - - seq_len = args.config.getint('arch', 'max_t_count') - num_label = args.config.getint('arch', 'max_label_length') - - num_rear_fc_layers = args.config.getint("arch", "num_rear_fc_layers") - num_hidden_rear_fc_list = json.loads(args.config.get("arch", "num_hidden_rear_fc_list")) - act_type_rear_fc_list = json.loads(args.config.get("arch", "act_type_rear_fc_list")) - # model symbol generation - # input preparation - data = mx.sym.Variable('data') - label = mx.sym.Variable('label') - - net = mx.sym.Reshape(data=data, shape=(-4, -1, 1, 0, 0)) - net = conv(net=net, - channels=channel_num, - filter_dimension=conv_layer1_filter_dim, - stride=conv_layer1_stride, - no_bias=is_batchnorm - ) - if is_batchnorm: - # batch norm normalizes axis 1 - net = batchnorm(net) - - net = conv(net=net, - channels=channel_num, - filter_dimension=conv_layer2_filter_dim, - stride=conv_layer2_stride, - no_bias=is_batchnorm - ) - if is_batchnorm: - # batch norm normalizes axis 1 - net = batchnorm(net) - net = mx.sym.transpose(data=net, axes=(0, 2, 1, 3)) - net = mx.sym.Reshape(data=net, shape=(0, 0, 
-3)) - seq_len_after_conv_layer1 = int( - math.floor((seq_len - conv_layer1_filter_dim[0]) / conv_layer1_stride[0])) + 1 - seq_len_after_conv_layer2 = int( - math.floor((seq_len_after_conv_layer1 - conv_layer2_filter_dim[0]) / conv_layer2_stride[0])) + 1 - net = slice_symbol_to_seq_symobls(net=net, seq_len=seq_len_after_conv_layer2, axis=1) - if rnn_type == "bilstm": - net = bi_lstm_unroll(net=net, +def arch(args, seq_len=None): + """ + define deep speech 2 network + """ + if isinstance(args, argparse.Namespace): + mode = args.config.get("common", "mode") + is_bucketing = args.config.getboolean("arch", "is_bucketing") + if mode == "train" or is_bucketing: + channel_num = args.config.getint("arch", "channel_num") + conv_layer1_filter_dim = \ + tuple(json.loads(args.config.get("arch", "conv_layer1_filter_dim"))) + conv_layer1_stride = tuple(json.loads(args.config.get("arch", "conv_layer1_stride"))) + conv_layer2_filter_dim = \ + tuple(json.loads(args.config.get("arch", "conv_layer2_filter_dim"))) + conv_layer2_stride = tuple(json.loads(args.config.get("arch", "conv_layer2_stride"))) + + rnn_type = args.config.get("arch", "rnn_type") + num_rnn_layer = args.config.getint("arch", "num_rnn_layer") + num_hidden_rnn_list = json.loads(args.config.get("arch", "num_hidden_rnn_list")) + + is_batchnorm = args.config.getboolean("arch", "is_batchnorm") + + if seq_len is None: + seq_len = args.config.getint('arch', 'max_t_count') + + num_label = args.config.getint('arch', 'max_label_length') + + num_rear_fc_layers = args.config.getint("arch", "num_rear_fc_layers") + num_hidden_rear_fc_list = json.loads(args.config.get("arch", "num_hidden_rear_fc_list")) + act_type_rear_fc_list = json.loads(args.config.get("arch", "act_type_rear_fc_list")) + # model symbol generation + # input preparation + data = mx.sym.Variable('data') + label = mx.sym.Variable('label') + + net = mx.sym.Reshape(data=data, shape=(-4, -1, 1, 0, 0)) + net = conv(net=net, + channels=channel_num, + 
filter_dimension=conv_layer1_filter_dim, + stride=conv_layer1_stride, + no_bias=is_batchnorm, + name='conv1') + if is_batchnorm: + # batch norm normalizes axis 1 + net = batchnorm(net, name="conv1_batchnorm") + + net = conv(net=net, + channels=channel_num, + filter_dimension=conv_layer2_filter_dim, + stride=conv_layer2_stride, + no_bias=is_batchnorm, + name='conv2') + if is_batchnorm: + # batch norm normalizes axis 1 + net = batchnorm(net, name="conv2_batchnorm") + + net = mx.sym.transpose(data=net, axes=(0, 2, 1, 3)) + net = mx.sym.Reshape(data=net, shape=(0, 0, -3)) + seq_len_after_conv_layer1 = int( + math.floor((seq_len - conv_layer1_filter_dim[0]) / conv_layer1_stride[0])) + 1 + seq_len_after_conv_layer2 = int( + math.floor((seq_len_after_conv_layer1 - conv_layer2_filter_dim[0]) + / conv_layer2_stride[0])) + 1 + net = slice_symbol_to_seq_symobls(net=net, seq_len=seq_len_after_conv_layer2, axis=1) + if rnn_type == "bilstm": + net = bi_lstm_unroll(net=net, + seq_len=seq_len_after_conv_layer2, + num_hidden_lstm_list=num_hidden_rnn_list, + num_lstm_layer=num_rnn_layer, + dropout=0., + is_batchnorm=is_batchnorm, + is_bucketing=is_bucketing) + elif rnn_type == "gru": + net = gru_unroll(net=net, seq_len=seq_len_after_conv_layer2, - num_hidden_lstm_list=num_hidden_rnn_list, - num_lstm_layer=num_rnn_layer, + num_hidden_gru_list=num_hidden_rnn_list, + num_gru_layer=num_rnn_layer, dropout=0., - is_batchnorm=is_batchnorm) - elif rnn_type == "gru": - net = gru_unroll(net=net, - seq_len=seq_len_after_conv_layer2, - num_hidden_gru_list=num_hidden_rnn_list, - num_gru_layer=num_rnn_layer, - dropout=0., - is_batchnorm=is_batchnorm) - elif rnn_type == "bigru": - net = bi_gru_unroll(net=net, + is_batchnorm=is_batchnorm, + is_bucketing=is_bucketing) + elif rnn_type == "bigru": + net = bi_gru_unroll(net=net, + seq_len=seq_len_after_conv_layer2, + num_hidden_gru_list=num_hidden_rnn_list, + num_gru_layer=num_rnn_layer, + dropout=0., + is_batchnorm=is_batchnorm, + 
is_bucketing=is_bucketing) + else: + raise Exception('rnn_type should be one of the followings, bilstm,gru,bigru') + + # rear fc layers + net = sequence_fc(net=net, seq_len=seq_len_after_conv_layer2, + num_layer=num_rear_fc_layers, prefix="rear", + num_hidden_list=num_hidden_rear_fc_list, + act_type_list=act_type_rear_fc_list, + is_batchnorm=is_batchnorm) + # warpctc layer + net = warpctc_layer(net=net, seq_len=seq_len_after_conv_layer2, - num_hidden_gru_list=num_hidden_rnn_list, - num_gru_layer=num_rnn_layer, - dropout=0., - is_batchnorm=is_batchnorm) + label=label, + num_label=num_label, + character_classes_count= + (args.config.getint('arch', 'n_classes') + 1)) + args.config.set('arch', 'max_t_count', str(seq_len_after_conv_layer2)) + return net + elif mode == 'load' or mode == 'predict': + conv_layer1_filter_dim = \ + tuple(json.loads(args.config.get("arch", "conv_layer1_filter_dim"))) + conv_layer1_stride = tuple(json.loads(args.config.get("arch", "conv_layer1_stride"))) + conv_layer2_filter_dim = \ + tuple(json.loads(args.config.get("arch", "conv_layer2_filter_dim"))) + conv_layer2_stride = tuple(json.loads(args.config.get("arch", "conv_layer2_stride"))) + if seq_len is None: + seq_len = args.config.getint('arch', 'max_t_count') + seq_len_after_conv_layer1 = int( + math.floor((seq_len - conv_layer1_filter_dim[0]) / conv_layer1_stride[0])) + 1 + seq_len_after_conv_layer2 = int( + math.floor((seq_len_after_conv_layer1 - conv_layer2_filter_dim[0]) + / conv_layer2_stride[0])) + 1 + + args.config.set('arch', 'max_t_count', str(seq_len_after_conv_layer2)) else: - raise Exception('rnn_type should be one of the followings, bilstm,gru,bigru') - - # rear fc layers - net = sequence_fc(net=net, seq_len=seq_len_after_conv_layer2, num_layer=num_rear_fc_layers, prefix="rear", - num_hidden_list=num_hidden_rear_fc_list, act_type_list=act_type_rear_fc_list, - is_batchnorm=is_batchnorm) - if is_batchnorm: - hidden_all = [] - # batch norm normalizes axis 1 - for seq_index in 
range(seq_len_after_conv_layer2): - hidden = net[seq_index] - hidden = batchnorm(hidden) - hidden_all.append(hidden) - net = hidden_all - - # warpctc layer - net = warpctc_layer(net=net, - seq_len=seq_len_after_conv_layer2, - label=label, - num_label=num_label, - character_classes_count=(args.config.getint('arch', 'n_classes') + 1) - ) - args.config.set('arch', 'max_t_count', str(seq_len_after_conv_layer2)) - return net - else: - conv_layer1_filter_dim = tuple(json.loads(args.config.get("arch", "conv_layer1_filter_dim"))) - conv_layer1_stride = tuple(json.loads(args.config.get("arch", "conv_layer1_stride"))) - conv_layer2_filter_dim = tuple(json.loads(args.config.get("arch", "conv_layer2_filter_dim"))) - conv_layer2_stride = tuple(json.loads(args.config.get("arch", "conv_layer2_stride"))) - seq_len = args.config.getint('arch', 'max_t_count') - seq_len_after_conv_layer1 = int( - math.floor((seq_len - conv_layer1_filter_dim[0]) / conv_layer1_stride[0])) + 1 - seq_len_after_conv_layer2 = int( - math.floor((seq_len_after_conv_layer1 - conv_layer2_filter_dim[0]) / conv_layer2_stride[0])) + 1 - args.config.set('arch', 'max_t_count', str(seq_len_after_conv_layer2)) + raise Exception('mode must be the one of the followings - train,predict,load') + + +class BucketingArch(object): + def __init__(self, args): + self.args = args + def sym_gen(self, seq_len): + args = self.args + net = arch(args, seq_len) + init_states = prepare_data(args) + init_state_names = [x[0] for x in init_states] + init_state_names.insert(0, 'data') + return net, init_state_names, ('label',) + def get_sym_gen(self): + return self.sym_gen diff --git a/example/speech_recognition/deepspeech.cfg b/example/speech_recognition/deepspeech.cfg index 13cf578c679a..4f0f49699771 100644 --- a/example/speech_recognition/deepspeech.cfg +++ b/example/speech_recognition/deepspeech.cfg @@ -3,23 +3,27 @@ mode = train #ex: gpu0,gpu1,gpu2,gpu3 context = gpu0,gpu1,gpu2 +#context = gpu0 # checkpoint prefix, check point will 
be saved under checkpoints folder with prefix -prefix = deep +prefix = deep_bucket # when mode is load or predict, model will be loaded from the file name with model_file under checkpoints -model_file = deepspeechn_epoch1n_batch-0009 +model_file = deep_bucketn_epoch0n_batch-0018 batch_size = 12 +#batch_size=4 # log will be saved by the log_filename -log_filename = deep.log +log_filename = deep_bucket.log # checkpoint set n to save checkpoints after n epoch save_checkpoint_every_n_epoch = 1 -save_checkpoint_every_n_batch = 1000 +save_checkpoint_every_n_batch = 3000 is_bi_graphemes = True -tensorboard_log_dir = tblog/deep +tensorboard_log_dir = tblog/deep_bucket # if random_seed is -1 then it gets random seed from timestamp mx_random_seed = -1 random_seed = -1 +kvstore_option = device [data] +max_duration = 16.0 train_json = ./train_corpus_all.json test_json = ./test_corpus.json val_json = ./test_corpus.json @@ -50,31 +54,49 @@ rnn_type = bigru #vanilla_lstm or fc_lstm (no effect when network_type is gru, bigru) lstm_type = fc_lstm is_batchnorm = True +is_bucketing = True +buckets = [200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600] [train] num_epoch = 70 learning_rate = 0.0003 # constant learning rate annealing by factor learning_rate_annealing = 1.1 -# supports only sgd and adam -optimizer = sgd -# for sgd -momentum = 0.9 -# set to 0 to disable gradient clipping -clip_gradient = 0 initializer = Xavier init_scale = 2 factor_type = in -weight_decay = 0. 
# show progress every how nth batches show_every = 100 save_optimizer_states = True -normalize_target_k = 13000 +normalize_target_k = 100000 # overwrite meta files(feats_mean,feats_std,unicode_en_baidu_bi_graphemes.csv) overwrite_meta_files = True +overwrite_bi_graphemes_dictionary = False +# save feature extracted from soundfile as csvfile, it can take too much disk space +save_feature_as_csvfile = False enable_logging_train_metric = True enable_logging_validation_metric = True [load] load_optimizer_states = True is_start_from_batch = True + +[optimizer] +optimizer = sgd +# define parameters for optimizer +# optimizer_params_dictionary should use " not ' as string wrapper +# sgd/nag +optimizer_params_dictionary={"momentum":0.9} +# dcasgd +# optimizer_params_dictionary={"momentum":0.9, "lamda":1.0} +# adam +# optimizer_params_dictionary={"beta1":0.9,"beta2":0.999} +# adagrad +# optimizer_params_dictionary={"eps":1e-08} +# rmsprop +# optimizer_params_dictionary={"gamma1":0.9, "gamma2":0.9,"epsilon":1e-08} +# adadelta +# optimizer_params_dictionary={"rho":0.95, "epsilon":1e-08} +# set to 0 to disable gradient clipping +clip_gradient = 100 +weight_decay = 0. 
diff --git a/example/speech_recognition/default.cfg b/example/speech_recognition/default.cfg index 853a04aebbdd..127c492b6166 100644 --- a/example/speech_recognition/default.cfg +++ b/example/speech_recognition/default.cfg @@ -6,20 +6,22 @@ context = gpu0 # checkpoint prefix, check point will be saved under checkpoints folder with prefix prefix = test_fc # when mode is load or predict, model will be loaded from the file name with model_file under checkpoints -model_file = test_fc-0001 +model_file = test_fc-0040 batch_size = 2 # log will be saved by the log_filename log_filename = test.log # checkpoint set n to save checkpoints after n epoch -save_checkpoint_every_n_epoch = 1 +save_checkpoint_every_n_epoch = 20 save_checkpoint_every_n_batch = 1000 is_bi_graphemes = False tensorboard_log_dir = tblog/libri_sample # if random_seed is -1 then it gets random seed from timestamp -mx_random_seed = -1 -random_seed = -1 +mx_random_seed = 1234 +random_seed = 1234 +kvstore_option = device [data] +max_duration = 16.0 train_json = ./Libri_sample.json test_json = ./Libri_sample.json val_json = ./Libri_sample.json @@ -37,8 +39,8 @@ conv_layer1_stride = [2, 2] conv_layer2_filter_dim = [11, 21] conv_layer2_stride = [1, 2] -num_rnn_layer = 3 -num_hidden_rnn_list = [1760, 1760, 1760] +num_rnn_layer = 1 +num_hidden_rnn_list = [1760] num_hidden_proj = 0 num_rear_fc_layers = 0 @@ -50,33 +52,49 @@ rnn_type = bigru #vanilla_lstm or fc_lstm (no effect when network_type is gru, bigru) lstm_type = fc_lstm is_batchnorm = True +is_bucketing = False +buckets = [] [train] -num_epoch = 70 - +num_epoch = 50 learning_rate = 0.005 # constant learning rate annealing by factor learning_rate_annealing = 1.1 -# supports only sgd and adam -optimizer = adam -# for sgd -momentum = 0.9 -# set to 0 to disable gradient clipping -clip_gradient = 0 - initializer = Xavier init_scale = 2 factor_type = in -weight_decay = 0.00001 # show progress every nth batches show_every = 1 save_optimizer_states = True 
normalize_target_k = 2 # overwrite meta files(feats_mean,feats_std,unicode_en_baidu_bi_graphemes.csv) overwrite_meta_files = True +overwrite_bi_graphemes_dictionary = False +# save feature extracted from soundfile as csvfile, it can take too much disk space +save_feature_as_csvfile = False enable_logging_train_metric = True enable_logging_validation_metric = True [load] load_optimizer_states = True is_start_from_batch = False + +[optimizer] +optimizer = adam +# define parameters for optimizer +# optimizer_params_dictionary should use " not ' as string wrapper +# sgd/nag +# optimizer_params_dictionary={"momentum":0.9} +# dcasgd +# optimizer_params_dictionary={"momentum":0.9, "lamda":1.0} +# adam +optimizer_params_dictionary={"beta1":0.9,"beta2":0.999} +# adagrad +# optimizer_params_dictionary={"eps":1e-08} +# rmsprop +# optimizer_params_dictionary={"gamma1":0.9, "gamma2":0.9,"epsilon":1e-08} +# adadelta +# optimizer_params_dictionary={"rho":0.95, "epsilon":1e-08} +# set to 0 to disable gradient clipping +clip_gradient = 0 +weight_decay = 0. 
diff --git a/example/speech_recognition/main.py b/example/speech_recognition/main.py index 398a8a537e01..a425e0a8ab40 100644 --- a/example/speech_recognition/main.py +++ b/example/speech_recognition/main.py @@ -1,34 +1,32 @@ +import json +import os import sys - -sys.path.insert(0, "../../python") +from collections import namedtuple +from datetime import datetime from config_util import parse_args, parse_contexts, generate_file_path from train import do_training import mxnet as mx from stt_io_iter import STTIter from label_util import LabelUtil from log_util import LogUtil - import numpy as np from stt_datagenerator import DataGenerator from stt_metric import STTMetric -from datetime import datetime from stt_bi_graphemes_util import generate_bi_graphemes_dictionary -######################################## -########## FOR JUPYTER NOTEBOOK -import os +from stt_bucketing_module import STTBucketingModule +from stt_io_bucketingiter import BucketSTTIter +sys.path.insert(0, "../../python") # os.environ['MXNET_ENGINE_TYPE'] = "NaiveEngine" os.environ['MXNET_ENGINE_TYPE'] = "ThreadedEnginePerDevice" os.environ['MXNET_ENABLE_GPU_P2P'] = "0" - class WHCS: width = 0 height = 0 channel = 0 stride = 0 - class ConfigLogger(object): def __init__(self, log): self.__log = log @@ -42,9 +40,25 @@ def write(self, data): line = data.strip() self.__log.info(line) +def load_labelutil(labelUtil, is_bi_graphemes, language="en"): + if language == "en": + if is_bi_graphemes: + try: + labelUtil.load_unicode_set("resources/unicodemap_en_baidu_bi_graphemes.csv") + except: + raise Exception("There is no resources/unicodemap_en_baidu_bi_graphemes.csv." 
+ + " Please set overwrite_bi_graphemes_dictionary True at train section") + else: + labelUtil.load_unicode_set("resources/unicodemap_en_baidu.csv") + else: + raise Exception("Error: Language Type: %s" % language) + + def load_data(args): mode = args.config.get('common', 'mode') + if mode not in ['train', 'predict', 'load']: + raise Exception('mode must be the one of the followings - train,predict,load') batch_size = args.config.getint('common', 'batch_size') whcs = WHCS() @@ -56,101 +70,78 @@ def load_data(args): model_name = args.config.get('common', 'prefix') is_bi_graphemes = args.config.getboolean('common', 'is_bi_graphemes') overwrite_meta_files = args.config.getboolean('train', 'overwrite_meta_files') + overwrite_bi_graphemes_dictionary = args.config.getboolean('train', 'overwrite_bi_graphemes_dictionary') + max_duration = args.config.getfloat('data', 'max_duration') language = args.config.get('data', 'language') - is_bi_graphemes = args.config.getboolean('common', 'is_bi_graphemes') + log = LogUtil().getlogger() labelUtil = LabelUtil.getInstance() - if language == "en": - if is_bi_graphemes: - try: - labelUtil.load_unicode_set("resources/unicodemap_en_baidu_bi_graphemes.csv") - except: - raise Exception("There is no resources/unicodemap_en_baidu_bi_graphemes.csv. 
Please set overwrite_meta_files at train section True") - else: - labelUtil.load_unicode_set("resources/unicodemap_en_baidu.csv") - else: - raise Exception("Error: Language Type: %s" % language) - args.config.set('arch', 'n_classes', str(labelUtil.get_count())) - - if mode == 'predict': - test_json = args.config.get('data', 'test_json') - datagen = DataGenerator(save_dir=save_dir, model_name=model_name) - datagen.load_train_data(test_json) - datagen.get_meta_from_file(np.loadtxt(generate_file_path(save_dir, model_name, 'feats_mean')), - np.loadtxt(generate_file_path(save_dir, model_name, 'feats_std'))) - elif mode =="train" or mode == "load": + if mode == "train" or mode == "load": data_json = args.config.get('data', 'train_json') val_json = args.config.get('data', 'val_json') datagen = DataGenerator(save_dir=save_dir, model_name=model_name) - datagen.load_train_data(data_json) - #test bigramphems - - if overwrite_meta_files and is_bi_graphemes: - generate_bi_graphemes_dictionary(datagen.train_texts) - + datagen.load_train_data(data_json, max_duration=max_duration) + if is_bi_graphemes: + if not os.path.isfile("resources/unicodemap_en_baidu_bi_graphemes.csv") or overwrite_bi_graphemes_dictionary: + load_labelutil(labelUtil=labelUtil, is_bi_graphemes=False, language=language) + generate_bi_graphemes_dictionary(datagen.train_texts) + load_labelutil(labelUtil=labelUtil, is_bi_graphemes=is_bi_graphemes, language=language) args.config.set('arch', 'n_classes', str(labelUtil.get_count())) if mode == "train": if overwrite_meta_files: + log.info("Generate mean and std from samples") normalize_target_k = args.config.getint('train', 'normalize_target_k') datagen.sample_normalize(normalize_target_k, True) else: - datagen.get_meta_from_file(np.loadtxt(generate_file_path(save_dir, model_name, 'feats_mean')), - np.loadtxt(generate_file_path(save_dir, model_name, 'feats_std'))) - datagen.load_validation_data(val_json) + log.info("Read mean and std from meta files") + 
datagen.get_meta_from_file( + np.loadtxt(generate_file_path(save_dir, model_name, 'feats_mean')), + np.loadtxt(generate_file_path(save_dir, model_name, 'feats_std'))) + datagen.load_validation_data(val_json, max_duration=max_duration) elif mode == "load": # get feat_mean and feat_std to normalize dataset - datagen.get_meta_from_file(np.loadtxt(generate_file_path(save_dir, model_name, 'feats_mean')), - np.loadtxt(generate_file_path(save_dir, model_name, 'feats_std'))) - datagen.load_validation_data(val_json) - else: - raise Exception( - 'Define mode in the cfg file first. train or predict or load can be the candidate for the mode.') + datagen.get_meta_from_file( + np.loadtxt(generate_file_path(save_dir, model_name, 'feats_mean')), + np.loadtxt(generate_file_path(save_dir, model_name, 'feats_std'))) + datagen.load_validation_data(val_json, max_duration=max_duration) + elif mode == 'predict': + test_json = args.config.get('data', 'test_json') + datagen = DataGenerator(save_dir=save_dir, model_name=model_name) + datagen.load_train_data(test_json, max_duration=max_duration) + labelutil = load_labelutil(labelUtil, is_bi_graphemes, language="en") + args.config.set('arch', 'n_classes', str(labelUtil.get_count())) + datagen.get_meta_from_file( + np.loadtxt(generate_file_path(save_dir, model_name, 'feats_mean')), + np.loadtxt(generate_file_path(save_dir, model_name, 'feats_std'))) is_batchnorm = args.config.getboolean('arch', 'is_batchnorm') - if batch_size == 1 and is_batchnorm: + if batch_size == 1 and is_batchnorm and (mode == 'train' or mode == 'load'): raise Warning('batch size 1 is too small for is_batchnorm') # sort file paths by its duration in ascending order to implement sortaGrad - if mode == "train" or mode == "load": max_t_count = datagen.get_max_seq_length(partition="train") - max_label_length = datagen.get_max_label_length(partition="train",is_bi_graphemes=is_bi_graphemes) + max_label_length = \ + datagen.get_max_label_length(partition="train", 
is_bi_graphemes=is_bi_graphemes) elif mode == "predict": max_t_count = datagen.get_max_seq_length(partition="test") - max_label_length = datagen.get_max_label_length(partition="test",is_bi_graphemes=is_bi_graphemes) - else: - raise Exception( - 'Define mode in the cfg file first. train or predict or load can be the candidate for the mode.') + max_label_length = \ + datagen.get_max_label_length(partition="test", is_bi_graphemes=is_bi_graphemes) args.config.set('arch', 'max_t_count', str(max_t_count)) args.config.set('arch', 'max_label_length', str(max_label_length)) from importlib import import_module prepare_data_template = import_module(args.config.get('arch', 'arch_file')) init_states = prepare_data_template.prepare_data(args) - if mode == "train": - sort_by_duration = True - else: - sort_by_duration = False - - data_loaded = STTIter(partition="train", - count=datagen.count, - datagen=datagen, - batch_size=batch_size, - num_label=max_label_length, - init_states=init_states, - seq_length=max_t_count, - width=whcs.width, - height=whcs.height, - sort_by_duration=sort_by_duration, - is_bi_graphemes=is_bi_graphemes) - - if mode == 'predict': - return data_loaded, args - else: - validation_loaded = STTIter(partition="validation", - count=datagen.val_count, + sort_by_duration = (mode == "train") + is_bucketing = args.config.getboolean('arch', 'is_bucketing') + save_feature_as_csvfile = args.config.getboolean('train', 'save_feature_as_csvfile') + if is_bucketing: + buckets = json.loads(args.config.get('arch', 'buckets')) + data_loaded = BucketSTTIter(partition="train", + count=datagen.count, datagen=datagen, batch_size=batch_size, num_label=max_label_length, @@ -158,37 +149,91 @@ def load_data(args): seq_length=max_t_count, width=whcs.width, height=whcs.height, - sort_by_duration=False, - is_bi_graphemes=is_bi_graphemes) + sort_by_duration=sort_by_duration, + is_bi_graphemes=is_bi_graphemes, + buckets=buckets, + save_feature_as_csvfile=save_feature_as_csvfile) + else: + 
data_loaded = STTIter(partition="train", + count=datagen.count, + datagen=datagen, + batch_size=batch_size, + num_label=max_label_length, + init_states=init_states, + seq_length=max_t_count, + width=whcs.width, + height=whcs.height, + sort_by_duration=sort_by_duration, + is_bi_graphemes=is_bi_graphemes, + save_feature_as_csvfile=save_feature_as_csvfile) + + if mode == 'train' or mode == 'load': + if is_bucketing: + validation_loaded = BucketSTTIter(partition="validation", + count=datagen.val_count, + datagen=datagen, + batch_size=batch_size, + num_label=max_label_length, + init_states=init_states, + seq_length=max_t_count, + width=whcs.width, + height=whcs.height, + sort_by_duration=False, + is_bi_graphemes=is_bi_graphemes, + buckets=buckets, + save_feature_as_csvfile=save_feature_as_csvfile) + else: + validation_loaded = STTIter(partition="validation", + count=datagen.val_count, + datagen=datagen, + batch_size=batch_size, + num_label=max_label_length, + init_states=init_states, + seq_length=max_t_count, + width=whcs.width, + height=whcs.height, + sort_by_duration=False, + is_bi_graphemes=is_bi_graphemes, + save_feature_as_csvfile=save_feature_as_csvfile) return data_loaded, validation_loaded, args + elif mode == 'predict': + return data_loaded, args def load_model(args, contexts, data_train): # load model from model_name prefix and epoch of model_num_epoch with gpu contexts of contexts mode = args.config.get('common', 'mode') load_optimizer_states = args.config.getboolean('load', 'load_optimizer_states') - is_start_from_batch = args.config.getboolean('load','is_start_from_batch') + is_start_from_batch = args.config.getboolean('load', 'is_start_from_batch') from importlib import import_module symbol_template = import_module(args.config.get('arch', 'arch_file')) - model_loaded = symbol_template.arch(args) + is_bucketing = args.config.getboolean('arch', 'is_bucketing') if mode == 'train': + if is_bucketing: + bucketing_arch = symbol_template.BucketingArch(args) + 
model_loaded = bucketing_arch.get_sym_gen() + else: + model_loaded = symbol_template.arch(args) model_num_epoch = None - else: + elif mode == 'load' or mode == 'predict': model_file = args.config.get('common', 'model_file') model_name = os.path.splitext(model_file)[0] - model_num_epoch = int(model_name[-4:]) + if is_bucketing: + bucketing_arch = symbol_template.BucketingArch(args) + model_loaded = bucketing_arch.get_sym_gen() + else: + model_path = 'checkpoints/' + str(model_name[:-5]) - model_path = 'checkpoints/' + str(model_name[:-5]) - - data_names = [x[0] for x in data_train.provide_data] - label_names = [x[0] for x in data_train.provide_label] + data_names = [x[0] for x in data_train.provide_data] + label_names = [x[0] for x in data_train.provide_label] - model_loaded = mx.module.Module.load(prefix=model_path, epoch=model_num_epoch, context=contexts, - data_names=data_names, label_names=label_names, - load_optimizer_states=load_optimizer_states) + model_loaded = mx.module.Module.load( + prefix=model_path, epoch=model_num_epoch, context=contexts, + data_names=data_names, label_names=label_names, + load_optimizer_states=load_optimizer_states) if is_start_from_batch: import re model_num_epoch = int(re.findall('\d+', model_file)[0]) @@ -198,7 +243,8 @@ def load_model(args, contexts, data_train): if __name__ == '__main__': if len(sys.argv) <= 1: - raise Exception('cfg file path must be provided. ex)python main.py --configfile examplecfg.cfg') + raise Exception('cfg file path must be provided. 
' + + 'ex)python main.py --configfile examplecfg.cfg') args = parse_args(sys.argv[1]) # set parameters from cfg file # give random seed @@ -206,9 +252,9 @@ def load_model(args, contexts, data_train): mx_random_seed = args.config.getint('common', 'mx_random_seed') # random seed for shuffling data list if random_seed != -1: - random.seed(random_seed) + np.random.seed(random_seed) # set mx.random.seed to give seed for parameter initialization - if mx_random_seed !=-1: + if mx_random_seed != -1: mx.random.seed(mx_random_seed) else: mx.random.seed(hash(datetime.now())) @@ -220,22 +266,23 @@ def load_model(args, contexts, data_train): mode = args.config.get('common', 'mode') if mode not in ['train', 'predict', 'load']: raise Exception( - 'Define mode in the cfg file first. train or predict or load can be the candidate for the mode.') + 'Define mode in the cfg file first. ' + + 'train or predict or load can be the candidate for the mode.') # get meta file where character to number conversions are defined contexts = parse_contexts(args) num_gpu = len(contexts) batch_size = args.config.getint('common', 'batch_size') - # check the number of gpus is positive divisor of the batch size for data parallel if batch_size % num_gpu != 0: raise Exception('num_gpu should be positive divisor of batch_size') - - if mode == "predict": - data_train, args = load_data(args) - elif mode == "train" or mode == "load": + if mode == "train" or mode == "load": data_train, data_val, args = load_data(args) + elif mode == "predict": + data_train, args = load_data(args) + is_batchnorm = args.config.getboolean('arch', 'is_batchnorm') + is_bucketing = args.config.getboolean('arch', 'is_bucketing') # log current config config_logger = ConfigLogger(log) @@ -243,28 +290,63 @@ def load_model(args, contexts, data_train): # load model model_loaded, model_num_epoch = load_model(args, contexts, data_train) - # if mode is 'train', it trains the model if mode == 'train': - data_names = [x[0] for x in 
data_train.provide_data] - label_names = [x[0] for x in data_train.provide_label] - module = mx.mod.Module(model_loaded, context=contexts, data_names=data_names, label_names=label_names) + if is_bucketing: + module = STTBucketingModule( + sym_gen=model_loaded, + default_bucket_key=data_train.default_bucket_key, + context=contexts + ) + else: + data_names = [x[0] for x in data_train.provide_data] + label_names = [x[0] for x in data_train.provide_label] + module = mx.mod.Module(model_loaded, context=contexts, + data_names=data_names, label_names=label_names) do_training(args=args, module=module, data_train=data_train, data_val=data_val) # if mode is 'load', it loads model from the checkpoint and continues the training. elif mode == 'load': - do_training(args=args, module=model_loaded, data_train=data_train, data_val=data_val, begin_epoch=model_num_epoch+1) + do_training(args=args, module=model_loaded, data_train=data_train, data_val=data_val, + begin_epoch=model_num_epoch + 1) # if mode is 'predict', it predict label from the input by the input model elif mode == 'predict': # predict through data - model_loaded.bind(for_training=False, data_shapes=data_train.provide_data, - label_shapes=data_train.provide_label) + if is_bucketing: + max_t_count = args.config.getint('arch', 'max_t_count') + load_optimizer_states = args.config.getboolean('load', 'load_optimizer_states') + model_file = args.config.get('common', 'model_file') + model_name = os.path.splitext(model_file)[0] + model_num_epoch = int(model_name[-4:]) + + model_path = 'checkpoints/' + str(model_name[:-5]) + model = STTBucketingModule( + sym_gen=model_loaded, + default_bucket_key=data_train.default_bucket_key, + context=contexts + ) + + model.bind(data_shapes=data_train.provide_data, + label_shapes=data_train.provide_label, + for_training=True) + _, arg_params, aux_params = mx.model.load_checkpoint(model_path, model_num_epoch) + model.set_params(arg_params, aux_params) + model_loaded = model + else: + 
model_loaded.bind(for_training=False, data_shapes=data_train.provide_data, + label_shapes=data_train.provide_label) max_t_count = args.config.getint('arch', 'max_t_count') - eval_metric = STTMetric(batch_size=batch_size, num_gpu=num_gpu, seq_length=max_t_count) - is_batchnorm = args.config.getboolean('arch', 'is_batchnorm') - if is_batchnorm : + eval_metric = STTMetric(batch_size=batch_size, num_gpu=num_gpu) + if is_batchnorm: for nbatch, data_batch in enumerate(data_train): - # when is_train = False it leads to high cer when batch_norm - model_loaded.forward(data_batch, is_train=True) + model_loaded.forward(data_batch, is_train=False) model_loaded.update_metric(eval_metric, data_batch.label) - else : - model_loaded.score(eval_data=data_train, num_batch=None, eval_metric=eval_metric, reset=True) + else: + #model_loaded.score(eval_data=data_train, num_batch=None, + # eval_metric=eval_metric, reset=True) + for nbatch, data_batch in enumerate(data_train): + model_loaded.forward(data_batch, is_train=False) + model_loaded.update_metric(eval_metric, data_batch.label) + else: + raise Exception( + 'Define mode in the cfg file first. 
' + + 'train or predict or load can be the candidate for the mode') diff --git a/example/speech_recognition/stt_datagenerator.py b/example/speech_recognition/stt_datagenerator.py index 390de432e751..d2a7b4b5cbae 100644 --- a/example/speech_recognition/stt_datagenerator.py +++ b/example/speech_recognition/stt_datagenerator.py @@ -2,7 +2,6 @@ import json import random - import numpy as np from stt_utils import calc_feat_dim, spectrogram_from_file @@ -10,6 +9,7 @@ from log_util import LogUtil from label_util import LabelUtil from stt_bi_graphemes_util import generate_bi_graphemes_label +from multiprocessing import cpu_count, Process, Manager class DataGenerator(object): def __init__(self, save_dir, model_name, step=10, window=20, max_freq=8000, desc_file=None): @@ -32,7 +32,7 @@ def __init__(self, save_dir, model_name, step=10, window=20, max_freq=8000, desc # 1d 161 length of array filled with 1s self.feats_std = np.ones((self.feat_dim,)) self.max_input_length = 0 - self.max_length_list_in_batch =[] + self.max_length_list_in_batch = [] # 1d 161 length of array filled with random value #[0.0, 1.0) self.rng = random.Random() @@ -48,14 +48,15 @@ def get_meta_from_file(self, feats_mean, feats_std): self.feats_mean = feats_mean self.feats_std = feats_std - def featurize(self, audio_clip, overwrite=False): + def featurize(self, audio_clip, overwrite=False, save_feature_as_csvfile=False): """ For a given audio clip, calculate the log of its Fourier Transform Params: audio_clip(str): Path to the audio clip """ return spectrogram_from_file( audio_clip, step=self.step, window=self.window, - max_freq=self.max_freq, overwrite=overwrite) + max_freq=self.max_freq, overwrite=overwrite, + save_feature_as_csvfile=save_feature_as_csvfile) def load_metadata_from_desc_file(self, desc_file, partition='train', max_duration=16.0,): @@ -107,11 +108,11 @@ def load_metadata_from_desc_file(self, desc_file, partition='train', raise Exception("Invalid partition to load metadata. 
" "Must be train/validation/test") - def load_train_data(self, desc_file): - self.load_metadata_from_desc_file(desc_file, 'train') + def load_train_data(self, desc_file, max_duration): + self.load_metadata_from_desc_file(desc_file, 'train', max_duration=max_duration) - def load_validation_data(self, desc_file): - self.load_metadata_from_desc_file(desc_file, 'validation') + def load_validation_data(self, desc_file, max_duration): + self.load_metadata_from_desc_file(desc_file, 'validation', max_duration=max_duration) @staticmethod def sort_by_duration(durations, audio_paths, texts): @@ -146,10 +147,11 @@ def get_max_seq_length(self, partition): "Must be train/validation/test") max_duration_indexes = durations.index(max(durations)) max_seq_length = self.featurize(audio_paths[max_duration_indexes]).shape[0] - self.max_seq_length=max_seq_length + self.max_seq_length = max_seq_length return max_seq_length - def prepare_minibatch(self, audio_paths, texts, overwrite=False, is_bi_graphemes=False): + def prepare_minibatch(self, audio_paths, texts, overwrite=False, + is_bi_graphemes=False, seq_length=-1, save_feature_as_csvfile=False): """ Featurize a minibatch of audio, zero pad them and return a dictionary Params: audio_paths (list(str)): List of paths to audio files @@ -162,12 +164,15 @@ def prepare_minibatch(self, audio_paths, texts, overwrite=False, is_bi_graphemes # Features is a list of (timesteps, feature_dim) arrays # Calculate the features for each audio clip, as the log of the # Fourier Transform of the audio - features = [self.featurize(a, overwrite=overwrite) for a in audio_paths] + features = [self.featurize(a, overwrite=overwrite, save_feature_as_csvfile=save_feature_as_csvfile) for a in audio_paths] input_lengths = [f.shape[0] for f in features] feature_dim = features[0].shape[1] mb_size = len(features) # Pad all the inputs so that they are all the same length - x = np.zeros((mb_size, self.max_seq_length, feature_dim)) + if seq_length == -1: + x = 
np.zeros((mb_size, self.max_seq_length, feature_dim)) + else: + x = np.zeros((mb_size, seq_length, feature_dim)) y = np.zeros((mb_size, self.max_label_length)) labelUtil = LabelUtil.getInstance() label_lengths = [] @@ -199,34 +204,59 @@ def iterate_validation(self, minibatch_size=16): return self.iterate(self.val_audio_paths, self.val_texts, minibatch_size) + def preprocess_sample_normalize(self, threadIndex, audio_paths, overwrite, return_dict): + if len(audio_paths) > 0: + audio_clip = audio_paths[0] + feat = self.featurize(audio_clip=audio_clip, overwrite=overwrite) + feat_squared = np.square(feat) + count = float(feat.shape[0]) + dim = feat.shape[1] + if len(audio_paths) > 1: + for audio_path in audio_paths[1:]: + next_feat = self.featurize(audio_clip=audio_path, overwrite=overwrite) + next_feat_squared = np.square(next_feat) + feat_vertically_stacked = np.concatenate((feat, next_feat)).reshape(-1, dim) + feat = np.sum(feat_vertically_stacked, axis=0, keepdims=True) + feat_squared_vertically_stacked = np.concatenate( + (feat_squared, next_feat_squared)).reshape(-1, dim) + feat_squared = np.sum(feat_squared_vertically_stacked, axis=0, keepdims=True) + count += float(next_feat.shape[0]) + return_dict[threadIndex] = {'feat': feat, 'feat_squared': feat_squared, 'count': count} + def sample_normalize(self, k_samples=1000, overwrite=False): """ Estimate the mean and std of the features from the training set Params: k_samples (int): Use this number of samples for estimation """ + log = LogUtil().getlogger() + log.info("Calculating mean and std from samples") # if k_samples is negative then it goes through total dataset if k_samples < 0: - audio_paths_iter = iter(self.audio_paths) + audio_paths = self.audio_paths + # using sample else: k_samples = min(k_samples, len(self.train_audio_paths)) samples = self.rng.sample(self.train_audio_paths, k_samples) - audio_paths_iter = iter(samples) - audio_clip = audio_paths_iter.next() - feat = self.featurize(audio_clip=audio_clip, 
overwrite=overwrite) - feat_squared = np.square(feat) - count = float(feat.shape[0]) - dim = feat.shape[1] - - for iter_index in range(len(samples) - 1): - next_feat = self.featurize(audio_clip=audio_paths_iter.next(), overwrite=overwrite) - next_feat_squared = np.square(next_feat) - feat_vertically_stacked = np.concatenate((feat, next_feat)).reshape(-1, dim) - feat = np.sum(feat_vertically_stacked, axis=0, keepdims=True) - feat_squared_vertically_stacked = np.concatenate((feat_squared, next_feat_squared)).reshape(-1, dim) - feat_squared = np.sum(feat_squared_vertically_stacked, axis=0, keepdims=True) - count = count + float(next_feat.shape[0]) + audio_paths = samples + manager = Manager() + return_dict = manager.dict() + jobs = [] + for threadIndex in range(cpu_count()): + proc = Process(target=self.preprocess_sample_normalize, args=(threadIndex, audio_paths, overwrite, return_dict)) + jobs.append(proc) + proc.start() + for proc in jobs: + proc.join() + + feat = np.sum(np.vstack([item['feat'] for item in return_dict.values()]), axis=0) + count = sum([item['count'] for item in return_dict.values()]) + feat_squared = np.sum(np.vstack([item['feat_squared'] for item in return_dict.values()]), axis=0) + self.feats_mean = feat / float(count) self.feats_std = np.sqrt(feat_squared / float(count) - np.square(self.feats_mean)) - np.savetxt(generate_file_path(self.save_dir, self.model_name, 'feats_mean'), self.feats_mean) - np.savetxt(generate_file_path(self.save_dir, self.model_name, 'feats_std'), self.feats_std) + np.savetxt( + generate_file_path(self.save_dir, self.model_name, 'feats_mean'), self.feats_mean) + np.savetxt( + generate_file_path(self.save_dir, self.model_name, 'feats_std'), self.feats_std) + log.info("End calculating mean and std from samples") diff --git a/example/speech_recognition/stt_io_iter.py b/example/speech_recognition/stt_io_iter.py index 70c31ce92dde..5ae65191c840 100644 --- a/example/speech_recognition/stt_io_iter.py +++ 
b/example/speech_recognition/stt_io_iter.py @@ -31,7 +31,8 @@ def provide_label(self): class STTIter(mx.io.DataIter): def __init__(self, count, datagen, batch_size, num_label, init_states, seq_length, width, height, sort_by_duration=True, - is_bi_graphemes=False, partition="train",): + is_bi_graphemes=False, partition="train", + save_feature_as_csvfile=False): super(STTIter, self).__init__() self.batch_size = batch_size self.num_label = num_label @@ -75,6 +76,7 @@ def __init__(self, count, datagen, batch_size, num_label, init_states, seq_lengt self.trainDataIter = iter(self.trainDataList) self.is_first_epoch = True + self.save_feature_as_csvfile = save_feature_as_csvfile def __iter__(self): init_state_names = [x[0] for x in self.init_states] @@ -92,9 +94,9 @@ def __iter__(self): audio_paths.append(audio_path) texts.append(text) if self.is_first_epoch: - data_set = self.datagen.prepare_minibatch(audio_paths, texts, overwrite=True, is_bi_graphemes=self.is_bi_graphemes) + data_set = self.datagen.prepare_minibatch(audio_paths, texts, overwrite=True, is_bi_graphemes=self.is_bi_graphemes, save_feature_as_csvfile=self.save_feature_as_csvfile) else: - data_set = self.datagen.prepare_minibatch(audio_paths, texts, overwrite=False, is_bi_graphemes=self.is_bi_graphemes) + data_set = self.datagen.prepare_minibatch(audio_paths, texts, overwrite=False, is_bi_graphemes=self.is_bi_graphemes, save_feature_as_csvfile=self.save_feature_as_csvfile) data_all = [mx.nd.array(data_set['x'])] + self.init_state_arrays label_all = [mx.nd.array(data_set['y'])] @@ -103,7 +105,6 @@ def __iter__(self): data_batch = SimpleBatch(data_names, data_all, label_names, label_all) yield data_batch - self.is_first_epoch = False def reset(self): pass diff --git a/example/speech_recognition/stt_layer_batchnorm.py b/example/speech_recognition/stt_layer_batchnorm.py index 86e75aa49557..5b73f4f9f890 100644 --- a/example/speech_recognition/stt_layer_batchnorm.py +++ 
b/example/speech_recognition/stt_layer_batchnorm.py @@ -6,7 +6,7 @@ def batchnorm(net, beta=None, eps=0.001, momentum=0.9, - fix_gamma=True, + fix_gamma=False, use_global_stats=False, output_mean_var=False, name=None): @@ -18,7 +18,8 @@ def batchnorm(net, momentum=momentum, fix_gamma=fix_gamma, use_global_stats=use_global_stats, - output_mean_var=output_mean_var + output_mean_var=output_mean_var, + name=name ) else: net = mx.sym.BatchNorm(data=net, @@ -26,6 +27,7 @@ def batchnorm(net, momentum=momentum, fix_gamma=fix_gamma, use_global_stats=use_global_stats, - output_mean_var=output_mean_var + output_mean_var=output_mean_var, + name=name ) return net diff --git a/example/speech_recognition/stt_layer_conv.py b/example/speech_recognition/stt_layer_conv.py index 5ec292557f04..ab0035e4803b 100644 --- a/example/speech_recognition/stt_layer_conv.py +++ b/example/speech_recognition/stt_layer_conv.py @@ -8,20 +8,22 @@ def conv(net, weight=None, bias=None, act_type="relu", - no_bias=False + no_bias=False, + name=None ): # 2d convolution's input should have the shape of 4D (batch_size,1,seq_len,feat_dim) if weight is None or bias is None: # ex) filter_dimension = (41,11) , stride=(2,2) - net = mx.sym.Convolution(data=net, num_filter=channels, kernel=filter_dimension, stride=stride, no_bias=no_bias) + net = mx.sym.Convolution(data=net, num_filter=channels, kernel=filter_dimension, stride=stride, no_bias=no_bias, + name=name) elif weight is None or bias is not None: net = mx.sym.Convolution(data=net, num_filter=channels, kernel=filter_dimension, stride=stride, bias=bias, - no_bias=no_bias) + no_bias=no_bias, name=name) elif weight is not None or bias is None: net = mx.sym.Convolution(data=net, num_filter=channels, kernel=filter_dimension, stride=stride, weight=weight, - no_bias=no_bias) + no_bias=no_bias, name=name) else: net = mx.sym.Convolution(data=net, num_filter=channels, kernel=filter_dimension, stride=stride, weight=weight, - bias=bias, no_bias=no_bias) + bias=bias, 
no_bias=no_bias, name=name) net = mx.sym.Activation(data=net, act_type=act_type) return net diff --git a/example/speech_recognition/stt_layer_fc.py b/example/speech_recognition/stt_layer_fc.py index b3db2034a3ad..f435922426c5 100644 --- a/example/speech_recognition/stt_layer_fc.py +++ b/example/speech_recognition/stt_layer_fc.py @@ -8,29 +8,30 @@ def fc(net, act_type, weight=None, bias=None, - no_bias=False + no_bias=False, + name=None ): # when weight and bias doesn't have specific name if weight is None and bias is None: - net = mx.sym.FullyConnected(data=net, num_hidden=num_hidden, no_bias=no_bias) + net = mx.sym.FullyConnected(data=net, num_hidden=num_hidden, no_bias=no_bias, name=name) # when weight doesn't have specific name but bias has elif weight is None and bias is not None: if no_bias: - net = mx.sym.FullyConnected(data=net, num_hidden=num_hidden, no_bias=no_bias) + net = mx.sym.FullyConnected(data=net, num_hidden=num_hidden, no_bias=no_bias, name=name) else: - net = mx.sym.FullyConnected(data=net, num_hidden=num_hidden, bias=bias, no_bias=no_bias) + net = mx.sym.FullyConnected(data=net, num_hidden=num_hidden, bias=bias, no_bias=no_bias, name=name) # when bias doesn't have specific name but weight has elif weight is not None and bias is None: - net = mx.sym.FullyConnected(data=net, num_hidden=num_hidden, weight=weight, no_bias=no_bias) + net = mx.sym.FullyConnected(data=net, num_hidden=num_hidden, weight=weight, no_bias=no_bias, name=name) # when weight and bias specific name else: if no_bias: - net = mx.sym.FullyConnected(data=net, num_hidden=num_hidden, weight=weight, no_bias=no_bias) + net = mx.sym.FullyConnected(data=net, num_hidden=num_hidden, weight=weight, no_bias=no_bias, name=name) else: - net = mx.sym.FullyConnected(data=net, num_hidden=num_hidden, weight=weight, bias=bias, no_bias=no_bias) + net = mx.sym.FullyConnected(data=net, num_hidden=num_hidden, weight=weight, bias=bias, no_bias=no_bias, name=name) # activation if act_type is not None: - 
net = mx.sym.Activation(data=net, act_type=act_type) + net = mx.sym.Activation(data=net, act_type=act_type, name="%s_activation" % name) return net @@ -41,7 +42,7 @@ def sequence_fc(net, num_hidden_list=[], act_type_list=[], is_batchnorm=False, - dropout_rate=0 + dropout_rate=0, ): if num_layer == len(num_hidden_list) == len(act_type_list): if num_layer > 0: @@ -81,13 +82,16 @@ def sequence_fc(net, num_hidden=num_hidden_list[layer_index], act_type=None, weight=weight_list[layer_index], - no_bias=is_batchnorm + no_bias=is_batchnorm, + name="%s_t%d_l%d_fc" % (prefix, seq_index, layer_index) ) # last layer doesn't have batchnorm hidden = batchnorm(net=hidden, gamma=gamma_list[layer_index], - beta=beta_list[layer_index]) - hidden = mx.sym.Activation(data=hidden, act_type=act_type_list[layer_index]) + beta=beta_list[layer_index], + name="%s_t%d_l%d_batchnorm" % (prefix, seq_index, layer_index)) + hidden = mx.sym.Activation(data=hidden, act_type=act_type_list[layer_index], + name="%s_t%d_l%d_activation" % (prefix, seq_index, layer_index)) else: hidden = fc(net=hidden, num_hidden=num_hidden_list[layer_index], diff --git a/example/speech_recognition/stt_layer_gru.py b/example/speech_recognition/stt_layer_gru.py index 8b044746dfcf..89af1c72216d 100644 --- a/example/speech_recognition/stt_layer_gru.py +++ b/example/speech_recognition/stt_layer_gru.py @@ -15,7 +15,7 @@ "param_blocks"]) -def gru(num_hidden, indata, prev_state, param, seqidx, layeridx, dropout=0., is_batchnorm=False, gamma=None, beta=None): +def gru(num_hidden, indata, prev_state, param, seqidx, layeridx, dropout=0., is_batchnorm=False, gamma=None, beta=None, name=None): """ GRU Cell symbol Reference: @@ -31,7 +31,10 @@ def gru(num_hidden, indata, prev_state, param, seqidx, layeridx, dropout=0., is_ name="t%d_l%d_gates_i2h" % (seqidx, layeridx)) if is_batchnorm: - i2h = batchnorm(net=i2h, gamma=gamma, beta=beta) + if name is not None: + i2h = batchnorm(net=i2h, gamma=gamma, beta=beta, name="%s_batchnorm" % 
name) + else: + i2h = batchnorm(net=i2h, gamma=gamma, beta=beta) h2h = mx.sym.FullyConnected(data=prev_state.h, weight=param.gates_h2h_weight, bias=param.gates_h2h_bias, @@ -53,15 +56,15 @@ def gru(num_hidden, indata, prev_state, param, seqidx, layeridx, dropout=0., is_ weight=param.trans_h2h_weight, bias=param.trans_h2h_bias, num_hidden=num_hidden, - name="t%d_l%d_trans_i2h" % (seqidx, layeridx)) + name="t%d_l%d_trans_h2h" % (seqidx, layeridx)) h_trans = htrans_i2h + htrans_h2h h_trans_active = mx.sym.Activation(h_trans, act_type="tanh") next_h = prev_state.h + update_gate * (h_trans_active - prev_state.h) return GRUState(h=next_h) -def gru_unroll(net, num_gru_layer, seq_len, num_hidden_gru_list, dropout=0., is_batchnorm=False, prefix="", - direction="forward"): +def gru_unroll(net, num_gru_layer, seq_len, num_hidden_gru_list, dropout=0., is_batchnorm=False, prefix="", + direction="forward", is_bucketing=False): if num_gru_layer > 0: param_cells = [] last_states = [] @@ -81,9 +84,14 @@ def gru_unroll(net, num_gru_layer, seq_len, num_hidden_gru_list, dropout=0., is_ if is_batchnorm: batchnorm_gamma = [] batchnorm_beta = [] - for seqidx in range(seq_len): - batchnorm_gamma.append(mx.sym.Variable(prefix + "t%d_i2h_gamma" % seqidx)) - batchnorm_beta.append(mx.sym.Variable(prefix + "t%d_i2h_beta" % seqidx)) + if is_bucketing: + for l in range(num_gru_layer): + batchnorm_gamma.append(mx.sym.Variable(prefix + "l%d_i2h_gamma" % l)) + batchnorm_beta.append(mx.sym.Variable(prefix + "l%d_i2h_beta" % l)) + else: + for seqidx in range(seq_len): + batchnorm_gamma.append(mx.sym.Variable(prefix + "t%d_i2h_gamma" % seqidx)) + batchnorm_beta.append(mx.sym.Variable(prefix + "t%d_i2h_beta" % seqidx)) hidden_all = [] for seqidx in range(seq_len): @@ -103,19 +111,33 @@ def gru_unroll(net, num_gru_layer, seq_len, num_hidden_gru_list, dropout=0., is_ else: dp_ratio = dropout if is_batchnorm: - next_state = gru(num_hidden_gru_list[i], indata=hidden, - prev_state=last_states[i], - 
param=param_cells[i], - seqidx=k, layeridx=i, dropout=dp_ratio, - is_batchnorm=is_batchnorm, - gamma=batchnorm_gamma[k], - beta=batchnorm_beta[k]) + if is_bucketing: + next_state = gru(num_hidden_gru_list[i], indata=hidden, + prev_state=last_states[i], + param=param_cells[i], + seqidx=k, layeridx=i, dropout=dp_ratio, + is_batchnorm=is_batchnorm, + gamma=batchnorm_gamma[i], + beta=batchnorm_beta[i], + name=prefix + ("t%d_l%d" % (seqidx, i)) + ) + else: + next_state = gru(num_hidden_gru_list[i], indata=hidden, + prev_state=last_states[i], + param=param_cells[i], + seqidx=k, layeridx=i, dropout=dp_ratio, + is_batchnorm=is_batchnorm, + gamma=batchnorm_gamma[k], + beta=batchnorm_beta[k], + name=prefix + ("t%d_l%d" % (seqidx, i)) + ) else: next_state = gru(num_hidden_gru_list[i], indata=hidden, prev_state=last_states[i], param=param_cells[i], seqidx=k, layeridx=i, dropout=dp_ratio, - is_batchnorm=is_batchnorm) + is_batchnorm=is_batchnorm, + name=prefix) hidden = next_state.h last_states[i] = next_state # decoder @@ -133,7 +155,7 @@ def gru_unroll(net, num_gru_layer, seq_len, num_hidden_gru_list, dropout=0., is_ return net -def bi_gru_unroll(net, num_gru_layer, seq_len, num_hidden_gru_list, dropout=0., is_batchnorm=False): +def bi_gru_unroll(net, num_gru_layer, seq_len, num_hidden_gru_list, dropout=0., is_batchnorm=False, is_bucketing=False): if num_gru_layer > 0: net_forward = gru_unroll(net=net, num_gru_layer=num_gru_layer, @@ -142,7 +164,8 @@ def bi_gru_unroll(net, num_gru_layer, seq_len, num_hidden_gru_list, dropout=0., dropout=dropout, is_batchnorm=is_batchnorm, prefix="forward_", - direction="forward") + direction="forward", + is_bucketing=is_bucketing) net_backward = gru_unroll(net=net, num_gru_layer=num_gru_layer, seq_len=seq_len, @@ -150,7 +173,8 @@ def bi_gru_unroll(net, num_gru_layer, seq_len, num_hidden_gru_list, dropout=0., dropout=dropout, is_batchnorm=is_batchnorm, prefix="backward_", - direction="backward") + direction="backward", + 
is_bucketing=is_bucketing) hidden_all = [] for i in range(seq_len): hidden_all.append(mx.sym.Concat(*[net_forward[i], net_backward[i]], dim=1)) @@ -159,7 +183,7 @@ def bi_gru_unroll(net, num_gru_layer, seq_len, num_hidden_gru_list, dropout=0., def bi_gru_unroll_two_input_two_output(net1, net2, num_gru_layer, seq_len, num_hidden_gru_list, dropout=0., - is_batchnorm=False): + is_batchnorm=False, is_bucketing=False): if num_gru_layer > 0: net_forward = gru_unroll(net=net1, num_gru_layer=num_gru_layer, @@ -168,7 +192,8 @@ def bi_gru_unroll_two_input_two_output(net1, net2, num_gru_layer, seq_len, num_h dropout=dropout, is_batchnorm=is_batchnorm, prefix="forward_", - direction="forward") + direction="forward", + is_bucketing=is_bucketing) net_backward = gru_unroll(net=net2, num_gru_layer=num_gru_layer, seq_len=seq_len, @@ -176,7 +201,8 @@ def bi_gru_unroll_two_input_two_output(net1, net2, num_gru_layer, seq_len, num_h dropout=dropout, is_batchnorm=is_batchnorm, prefix="backward_", - direction="backward") + direction="backward", + is_bucketing=is_bucketing) return net_forward, net_backward else: return net1, net2 diff --git a/example/speech_recognition/stt_layer_lstm.py b/example/speech_recognition/stt_layer_lstm.py index 19e37369b1b0..93b4ca09b908 100644 --- a/example/speech_recognition/stt_layer_lstm.py +++ b/example/speech_recognition/stt_layer_lstm.py @@ -16,7 +16,7 @@ "param_blocks"]) -def vanilla_lstm(num_hidden, indata, prev_state, param, seqidx, layeridx, is_batchnorm=False, gamma=None, beta=None): +def vanilla_lstm(num_hidden, indata, prev_state, param, seqidx, layeridx, is_batchnorm=False, gamma=None, beta=None, name=None): """LSTM Cell symbol""" i2h = mx.sym.FullyConnected(data=indata, weight=param.i2h_weight, @@ -24,7 +24,10 @@ def vanilla_lstm(num_hidden, indata, prev_state, param, seqidx, layeridx, is_bat num_hidden=num_hidden * 4, name="t%d_l%d_i2h" % (seqidx, layeridx)) if is_batchnorm: - i2h = batchnorm(net=i2h, gamma=gamma, beta=beta) + if name is not 
None: + i2h = batchnorm(net=i2h, gamma=gamma, beta=beta, name="%s_batchnorm" % name) + else: + i2h = batchnorm(net=i2h, gamma=gamma, beta=beta) h2h = mx.sym.FullyConnected(data=prev_state.h, weight=param.h2h_weight, bias=param.h2h_bias, @@ -43,7 +46,7 @@ def vanilla_lstm(num_hidden, indata, prev_state, param, seqidx, layeridx, is_bat def lstm(num_hidden, indata, prev_state, param, seqidx, layeridx, dropout=0., num_hidden_proj=0, is_batchnorm=False, - gamma=None, beta=None): + gamma=None, beta=None, name=None): """LSTM Cell symbol""" # dropout input if dropout > 0.: @@ -55,7 +58,10 @@ def lstm(num_hidden, indata, prev_state, param, seqidx, layeridx, dropout=0., nu num_hidden=num_hidden * 4, name="t%d_l%d_i2h" % (seqidx, layeridx)) if is_batchnorm: - i2h = batchnorm(net=i2h, gamma=gamma, beta=beta) + if name is not None: + i2h = batchnorm(net=i2h, gamma=gamma, beta=beta, name="%s_batchnorm" % name) + else: + i2h = batchnorm(net=i2h, gamma=gamma, beta=beta) h2h = mx.sym.FullyConnected(data=prev_state.h, weight=param.h2h_weight, @@ -96,7 +102,7 @@ def lstm(num_hidden, indata, prev_state, param, seqidx, layeridx, dropout=0., nu def lstm_unroll(net, num_lstm_layer, seq_len, num_hidden_lstm_list, dropout=0., num_hidden_proj=0, - lstm_type='fc_lstm', is_batchnorm=False, prefix="", direction="forward"): + lstm_type='fc_lstm', is_batchnorm=False, prefix="", direction="forward", is_bucketing=False): if num_lstm_layer > 0: param_cells = [] last_states = [] @@ -121,9 +127,14 @@ def lstm_unroll(net, num_lstm_layer, seq_len, num_hidden_lstm_list, dropout=0., if is_batchnorm: batchnorm_gamma = [] batchnorm_beta = [] - for seqidx in range(seq_len): - batchnorm_gamma.append(mx.sym.Variable(prefix + "t%d_i2h_gamma" % seqidx)) - batchnorm_beta.append(mx.sym.Variable(prefix + "t%d_i2h_beta" % seqidx)) + if is_bucketing: + for l in range(num_lstm_layer): + batchnorm_gamma.append(mx.sym.Variable(prefix + "l%d_i2h_gamma" % l)) + batchnorm_beta.append(mx.sym.Variable(prefix + 
"l%d_i2h_beta" % l)) + else: + for seqidx in range(seq_len): + batchnorm_gamma.append(mx.sym.Variable(prefix + "t%d_i2h_gamma" % seqidx)) + batchnorm_beta.append(mx.sym.Variable(prefix + "t%d_i2h_beta" % seqidx)) hidden_all = [] for seqidx in range(seq_len): @@ -145,18 +156,20 @@ def lstm_unroll(net, num_lstm_layer, seq_len, num_hidden_lstm_list, dropout=0., if lstm_type == 'fc_lstm': if is_batchnorm: - next_state = lstm(num_hidden_lstm_list[i], - indata=hidden, - prev_state=last_states[i], - param=param_cells[i], - seqidx=k, - layeridx=i, - dropout=dp, - num_hidden_proj=num_hidden_proj, - is_batchnorm=is_batchnorm, - gamma=batchnorm_gamma[k], - beta=batchnorm_beta[k] - ) + if is_bucketing: + next_state = lstm(num_hidden_lstm_list[i], + indata=hidden, + prev_state=last_states[i], + param=param_cells[i], + seqidx=k, + layeridx=i, + dropout=dp, + num_hidden_proj=num_hidden_proj, + is_batchnorm=is_batchnorm, + gamma=batchnorm_gamma[i], + beta=batchnorm_beta[i], + name=prefix + ("t%d_l%d" % (seqidx, i)) + ) else: next_state = lstm(num_hidden_lstm_list[i], indata=hidden, @@ -166,7 +179,8 @@ def lstm_unroll(net, num_lstm_layer, seq_len, num_hidden_lstm_list, dropout=0., layeridx=i, dropout=dp, num_hidden_proj=num_hidden_proj, - is_batchnorm=is_batchnorm + is_batchnorm=is_batchnorm, + name=prefix + ("t%d_l%d" % (seqidx, i)) ) elif lstm_type == 'vanilla_lstm': if is_batchnorm: @@ -175,15 +189,17 @@ def lstm_unroll(net, num_lstm_layer, seq_len, num_hidden_lstm_list, dropout=0., param=param_cells[i], seqidx=k, layeridx=i, is_batchnorm=is_batchnorm, - gamma=batchnorm_gamma[k], - beta=batchnorm_beta[k] + gamma=batchnorm_gamma[i], + beta=batchnorm_beta[i], + name=prefix + ("t%d_l%d" % (seqidx, i)) ) else: next_state = vanilla_lstm(num_hidden_lstm_list[i], indata=hidden, prev_state=last_states[i], param=param_cells[i], seqidx=k, layeridx=i, - is_batchnorm=is_batchnorm + is_batchnorm=is_batchnorm, + name=prefix + ("t%d_l%d" % (seqidx, i)) ) else: raise Exception("lstm type %s 
error" % lstm_type) @@ -206,7 +222,7 @@ def lstm_unroll(net, num_lstm_layer, seq_len, num_hidden_lstm_list, dropout=0., def bi_lstm_unroll(net, num_lstm_layer, seq_len, num_hidden_lstm_list, dropout=0., num_hidden_proj=0, - lstm_type='fc_lstm', is_batchnorm=False): + lstm_type='fc_lstm', is_batchnorm=False, is_bucketing=False): if num_lstm_layer > 0: net_forward = lstm_unroll(net=net, num_lstm_layer=num_lstm_layer, @@ -217,7 +233,8 @@ def bi_lstm_unroll(net, num_lstm_layer, seq_len, num_hidden_lstm_list, dropout=0 lstm_type=lstm_type, is_batchnorm=is_batchnorm, prefix="forward_", - direction="forward") + direction="forward", + is_bucketing=is_bucketing) net_backward = lstm_unroll(net=net, num_lstm_layer=num_lstm_layer, @@ -228,7 +245,8 @@ def bi_lstm_unroll(net, num_lstm_layer, seq_len, num_hidden_lstm_list, dropout=0 lstm_type=lstm_type, is_batchnorm=is_batchnorm, prefix="backward_", - direction="backward") + direction="backward", + is_bucketing=is_bucketing) hidden_all = [] for i in range(seq_len): hidden_all.append(mx.sym.Concat(*[net_forward[i], net_backward[i]], dim=1)) @@ -239,7 +257,9 @@ def bi_lstm_unroll(net, num_lstm_layer, seq_len, num_hidden_lstm_list, dropout=0 # bilistm_2to1 def bi_lstm_unroll_two_input_two_output(net1, net2, num_lstm_layer, seq_len, num_hidden_lstm_list, dropout=0., num_hidden_proj=0, - lstm_type='fc_lstm', is_batchnorm=False): + lstm_type='fc_lstm', + is_batchnorm=False, + is_bucketing=False): if num_lstm_layer > 0: net_forward = lstm_unroll(net=net1, num_lstm_layer=num_lstm_layer, @@ -250,7 +270,8 @@ def bi_lstm_unroll_two_input_two_output(net1, net2, num_lstm_layer, seq_len, num lstm_type=lstm_type, is_batchnorm=is_batchnorm, prefix="forward_", - direction="forward") + direction="forward", + is_bucketing=is_bucketing) net_backward = lstm_unroll(net=net2, num_lstm_layer=num_lstm_layer, @@ -261,7 +282,8 @@ def bi_lstm_unroll_two_input_two_output(net1, net2, num_lstm_layer, seq_len, num lstm_type=lstm_type, is_batchnorm=is_batchnorm, 
prefix="backward_", - direction="backward") + direction="backward", + is_bucketing=is_bucketing) return net_forward, net_backward else: return net1, net2 diff --git a/example/speech_recognition/stt_metric.py b/example/speech_recognition/stt_metric.py index 0fc2bd11d906..1c5f4408a60e 100644 --- a/example/speech_recognition/stt_metric.py +++ b/example/speech_recognition/stt_metric.py @@ -19,12 +19,11 @@ def check_label_shapes(labels, preds, shape=0): class STTMetric(mx.metric.EvalMetric): - def __init__(self, batch_size, num_gpu, seq_length, is_epoch_end=False, is_logging=True): + def __init__(self, batch_size, num_gpu, is_epoch_end=False, is_logging=True): super(STTMetric, self).__init__('STTMetric') self.batch_size = batch_size self.num_gpu = num_gpu - self.seq_length = seq_length self.total_n_label = 0 self.total_l_dist = 0 self.is_epoch_end = is_epoch_end @@ -37,15 +36,17 @@ def update(self, labels, preds): log = LogUtil().getlogger() labelUtil = LabelUtil.getInstance() self.batch_loss = 0. 
+ for label, pred in zip(labels, preds): label = label.asnumpy() pred = pred.asnumpy() - for i in range(int(int(self.batch_size) / int(self.num_gpu))): + seq_length = len(pred) / int(int(self.batch_size) / int(self.num_gpu)) + for i in range(int(int(self.batch_size) / int(self.num_gpu))): l = remove_blank(label[i]) p = [] - for k in range(int(self.seq_length)): + for k in range(int(seq_length)): p.append(np.argmax(pred[k * int(int(self.batch_size) / int(self.num_gpu)) + i])) p = pred_best(p) @@ -60,7 +61,7 @@ def update(self, labels, preds): self.num_inst += 1 self.sum_metric += this_cer if self.is_epoch_end: - loss = ctc_loss(l, pred, i, int(self.seq_length), int(self.batch_size), int(self.num_gpu)) + loss = ctc_loss(l, pred, i, int(seq_length), int(self.batch_size), int(self.num_gpu)) self.batch_loss += loss if self.is_logging: log.info("loss: %f " % loss) diff --git a/example/speech_recognition/stt_utils.py b/example/speech_recognition/stt_utils.py index 6a32f0e57c2d..3c7ffce0f980 100644 --- a/example/speech_recognition/stt_utils.py +++ b/example/speech_recognition/stt_utils.py @@ -92,7 +92,7 @@ def spectrogram(samples, fft_length=256, sample_rate=2, hop_length=128): def spectrogram_from_file(filename, step=10, window=20, max_freq=None, - eps=1e-14, overwrite=False): + eps=1e-14, overwrite=False, save_feature_as_csvfile=False): """ Calculate the log of linear spectrogram from FFT energy Params: filename (str): Path to the audio file @@ -104,7 +104,7 @@ def spectrogram_from_file(filename, step=10, window=20, max_freq=None, """ csvfilename = filename.replace(".wav", ".csv") - if (os.path.isfile(csvfilename) is False) or overwrite: + if (os.path.isfile(csvfilename) is False) or overwrite: with soundfile.SoundFile(filename) as sound_file: audio = sound_file.read(dtype='float32') sample_rate = sound_file.samplerate @@ -126,7 +126,8 @@ def spectrogram_from_file(filename, step=10, window=20, max_freq=None, ind = np.where(freqs <= max_freq)[0][-1] + 1 res = 
np.transpose(np.log(pxx[:ind, :] + eps)) - np.savetxt(csvfilename, res) + if save_feature_as_csvfile: + np.savetxt(csvfilename, res) return res else: return np.loadtxt(csvfilename) diff --git a/example/speech_recognition/train.py b/example/speech_recognition/train.py index 37f00fc4dd90..f3a7555529e3 100644 --- a/example/speech_recognition/train.py +++ b/example/speech_recognition/train.py @@ -7,7 +7,9 @@ from stt_metric import STTMetric #tensorboard setting from tensorboard import SummaryWriter -import numpy as np +import json +from stt_bucketing_module import STTBucketingModule + def get_initializer(args): @@ -28,6 +30,7 @@ def __init__(self, learning_rate=0.001): def __call__(self, num_update): return self.learning_rate + def do_training(args, module, data_train, data_val, begin_epoch=0): from distutils.dir_util import mkpath from log_util import LogUtil @@ -35,7 +38,7 @@ def do_training(args, module, data_train, data_val, begin_epoch=0): log = LogUtil().getlogger() mkpath(os.path.dirname(get_checkpoint_path(args))) - seq_len = args.config.get('arch', 'max_t_count') + #seq_len = args.config.get('arch', 'max_t_count') batch_size = args.config.getint('common', 'batch_size') save_checkpoint_every_n_epoch = args.config.getint('common', 'save_checkpoint_every_n_epoch') save_checkpoint_every_n_batch = args.config.getint('common', 'save_checkpoint_every_n_batch') @@ -44,27 +47,48 @@ def do_training(args, module, data_train, data_val, begin_epoch=0): contexts = parse_contexts(args) num_gpu = len(contexts) - eval_metric = STTMetric(batch_size=batch_size, num_gpu=num_gpu, seq_length=seq_len,is_logging=enable_logging_validation_metric,is_epoch_end=True) + eval_metric = STTMetric(batch_size=batch_size, num_gpu=num_gpu, is_logging=enable_logging_validation_metric,is_epoch_end=True) # tensorboard setting - loss_metric = STTMetric(batch_size=batch_size, num_gpu=num_gpu, seq_length=seq_len,is_logging=enable_logging_train_metric,is_epoch_end=False) + loss_metric = 
STTMetric(batch_size=batch_size, num_gpu=num_gpu, is_logging=enable_logging_train_metric,is_epoch_end=False) - optimizer = args.config.get('train', 'optimizer') - momentum = args.config.getfloat('train', 'momentum') + optimizer = args.config.get('optimizer', 'optimizer') learning_rate = args.config.getfloat('train', 'learning_rate') learning_rate_annealing = args.config.getfloat('train', 'learning_rate_annealing') mode = args.config.get('common', 'mode') num_epoch = args.config.getint('train', 'num_epoch') - clip_gradient = args.config.getfloat('train', 'clip_gradient') - weight_decay = args.config.getfloat('train', 'weight_decay') + clip_gradient = args.config.getfloat('optimizer', 'clip_gradient') + weight_decay = args.config.getfloat('optimizer', 'weight_decay') save_optimizer_states = args.config.getboolean('train', 'save_optimizer_states') show_every = args.config.getint('train', 'show_every') + optimizer_params_dictionary = json.loads(args.config.get('optimizer', 'optimizer_params_dictionary')) + kvstore_option = args.config.get('common', 'kvstore_option') n_epoch=begin_epoch + is_bucketing = args.config.getboolean('arch', 'is_bucketing') if clip_gradient == 0: clip_gradient = None + if is_bucketing and mode == 'load': + model_file = args.config.get('common', 'model_file') + model_name = os.path.splitext(model_file)[0] + model_num_epoch = int(model_name[-4:]) + + model_path = 'checkpoints/' + str(model_name[:-5]) + symbol, data_names, label_names = module(1600) + model = STTBucketingModule( + sym_gen=module, + default_bucket_key=data_train.default_bucket_key, + context=contexts) + data_train.reset() - module.bind(data_shapes=data_train.provide_data, + model.bind(data_shapes=data_train.provide_data, + label_shapes=data_train.provide_label, + for_training=True) + _, arg_params, aux_params = mx.model.load_checkpoint(model_path, model_num_epoch) + model.set_params(arg_params, aux_params) + module = model + else: + module.bind(data_shapes=data_train.provide_data, 
label_shapes=data_train.provide_label, for_training=True) @@ -75,41 +99,32 @@ def do_training(args, module, data_train, data_val, begin_epoch=0): lr_scheduler = SimpleLRScheduler(learning_rate=learning_rate) def reset_optimizer(force_init=False): - if optimizer == "sgd": - module.init_optimizer(kvstore='device', - optimizer=optimizer, - optimizer_params={'lr_scheduler': lr_scheduler, - 'momentum': momentum, - 'clip_gradient': clip_gradient, - 'wd': weight_decay}, - force_init=force_init) - elif optimizer == "adam": - module.init_optimizer(kvstore='device', - optimizer=optimizer, - optimizer_params={'lr_scheduler': lr_scheduler, - #'momentum': momentum, - 'clip_gradient': clip_gradient, - 'wd': weight_decay}, - force_init=force_init) - else: - raise Exception('Supported optimizers are sgd and adam. If you want to implement others define them in train.py') + optimizer_params = {'lr_scheduler': lr_scheduler, + 'clip_gradient': clip_gradient, + 'wd': weight_decay} + optimizer_params.update(optimizer_params_dictionary) + module.init_optimizer(kvstore=kvstore_option, + optimizer=optimizer, + optimizer_params=optimizer_params, + force_init=force_init) if mode == "train": reset_optimizer(force_init=True) else: reset_optimizer(force_init=False) + data_train.reset() + data_train.is_first_epoch = True #tensorboard setting tblog_dir = args.config.get('common', 'tensorboard_log_dir') summary_writer = SummaryWriter(tblog_dir) + while True: if n_epoch >= num_epoch: break - loss_metric.reset() log.info('---------train---------') for nbatch, data_batch in enumerate(data_train): - module.forward_backward(data_batch) module.update() # tensorboard setting @@ -136,6 +151,7 @@ def reset_optimizer(force_init=False): assert curr_acc is not None, 'cannot find Acc_exclude_padding in eval metric' data_train.reset() + data_train.is_first_epoch = False # tensorboard setting train_cer, train_n_label, train_l_dist, train_ctc_loss = loss_metric.get_name_value()